xref: /openbsd/sys/arch/amd64/amd64/pmap.c (revision 91f110e0)
1 /*	$OpenBSD: pmap.c,v 1.68 2014/03/07 16:56:57 guenther Exp $	*/
2 /*	$NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $	*/
3 
4 /*
5  *
6  * Copyright (c) 1997 Charles D. Cranor and Washington University.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *      This product includes software developed by Charles D. Cranor and
20  *      Washington University.
21  * 4. The name of the author may not be used to endorse or promote products
22  *    derived from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
29  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
33  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 /*
37  * Copyright 2001 (c) Wasabi Systems, Inc.
38  * All rights reserved.
39  *
40  * Written by Frank van der Linden for Wasabi Systems, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. All advertising materials mentioning features or use of this software
51  *    must display the following acknowledgement:
52  *      This product includes software developed for the NetBSD Project by
53  *      Wasabi Systems, Inc.
54  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
55  *    or promote products derived from this software without specific prior
56  *    written permission.
57  *
58  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
60  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
61  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
62  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
63  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
64  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
65  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
66  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
67  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
68  * POSSIBILITY OF SUCH DAMAGE.
69  */
70 
71 /*
72  * This is the i386 pmap modified and generalized to support x86-64
73  * as well. The idea is to hide the upper N levels of the page tables
74  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
75  * is mostly untouched, except that it uses some more generalized
76  * macros and interfaces.
77  *
78  * This pmap has been tested on the i386 as well, and it can be easily
79  * adapted to PAE.
80  *
81  * fvdl@wasabisystems.com 18-Jun-2001
82  */
83 
84 /*
85  * pmap.c: i386 pmap module rewrite
86  * Chuck Cranor <chuck@ccrc.wustl.edu>
87  * 11-Aug-97
88  *
89  * history of this pmap module: in addition to my own input, i used
90  *    the following references for this rewrite of the i386 pmap:
91  *
92  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
93  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
94  *     it was then ported to the i386 by William Jolitz of UUNET
95  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
96  *     project fixed some bugs and provided some speed ups.
97  *
98  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
99  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
100  *     and David Greenman.
101  *
102  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
103  *     between several processors.   the VAX version was done by
104  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
105  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
106  *     David Golub, and Richard Draves.    the alpha version was
107  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
108  *     (NetBSD/alpha).
109  */
110 
111 #include <sys/param.h>
112 #include <sys/systm.h>
113 #include <sys/proc.h>
114 #include <sys/malloc.h>
115 #include <sys/pool.h>
116 #include <sys/user.h>
117 #include <sys/kernel.h>
118 #include <sys/mutex.h>
119 #include <sys/sched.h>
120 
121 #include <uvm/uvm.h>
122 
123 #include <machine/atomic.h>
124 #include <machine/lock.h>
125 #include <machine/cpu.h>
126 #include <machine/specialreg.h>
127 
128 #include <dev/isa/isareg.h>
129 #include <machine/isa_machdep.h>
130 
131 /*
132  * general info:
133  *
134  *  - for an explanation of how the i386 MMU hardware works see
135  *    the comments in <machine/pte.h>.
136  *
137  *  - for an explanation of the general memory structure used by
138  *    this pmap (including the recursive mapping), see the comments
139  *    in <machine/pmap.h>.
140  *
141  * this file contains the code for the "pmap module."   the module's
142  * job is to manage the hardware's virtual to physical address mappings.
143  * note that there are two levels of mapping in the VM system:
144  *
145  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
146  *      to map ranges of virtual address space to objects/files.  for
147  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
148  *      to the file /bin/ls starting at offset zero."   note that
149  *      the upper layer mapping is not concerned with how individual
150  *      vm_pages are mapped.
151  *
152  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
153  *      from virtual addresses to physical pages.   it is concerned with which vm_page is
154  *      mapped where.   for example, when you run /bin/ls and start
155  *      at page 0x1000 the fault routine may lookup the correct page
156  *      of the /bin/ls file and then ask the pmap layer to establish
157  *      a mapping for it.
158  *
159  * note that information in the lower layer of the VM system can be
160  * thrown away since it can easily be reconstructed from the info
161  * in the upper layer.
162  *
163  * data structures we use include:
164  *  - struct pmap: describes the address space of one process
165  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
166  *  - struct pg_to_free: a list of page table pages whose mappings have
167  *	been removed; they are freed only once TLB shootdown has completed.
168  */
169 
170 /*
171  * memory allocation
172  *
173  *  - there are three data structures that we must dynamically allocate:
174  *
175  * [A] new process' page directory page (PDP)
176  *	- plan 1: done at pmap_create() we use
177  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
178  *	  allocation.
179  *
180  * if we are low in free physical memory then we sleep in
181  * uvm_km_alloc -- in this case this is ok since we are creating
182  * a new pmap and should not be holding any locks.
183  *
184  * if the kernel is totally out of virtual space
185  * (i.e. uvm_km_alloc returns NULL), then we panic.
186  *
187  * XXX: the fork code currently has no way to return an "out of
188  * memory, try again" error code since uvm_fork [fka vm_fork]
189  * is a void function.
190  *
191  * [B] new page tables pages (PTP)
192  * 	call uvm_pagealloc()
193  * 		=> success: zero page, add to pm_pdir
194  * 		=> failure: we are out of free vm_pages, let pmap_enter()
195  *		   tell UVM about it.
196  *
197  * note: for kernel PTPs, we start with NKPTP of them.   as we map
198  * kernel memory (at uvm_map time) we check to see if we've grown
199  * the kernel pmap.   if so, we call the optional function
200  * pmap_growkernel() to grow the kernel PTPs in advance.
201  *
202  * [C] pv_entry structures
203  *	- try to allocate one from the pool.
204  *	If we fail, we simply let pmap_enter() tell UVM about it.
205  */
206 
207 vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
208 int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
209 long nkptp[] = NKPTP_INITIALIZER;
210 long nkptpmax[] = NKPTPMAX_INITIALIZER;
211 long nbpd[] = NBPD_INITIALIZER;
212 pd_entry_t *normal_pdes[] = PDES_INITIALIZER;
213 pd_entry_t *alternate_pdes[] = APDES_INITIALIZER;
214 
215 /* int nkpde = NKPTP; */
216 
217 #define PMAP_MAP_TO_HEAD_LOCK()		/* null */
218 #define PMAP_MAP_TO_HEAD_UNLOCK()	/* null */
219 
220 #define PMAP_HEAD_TO_MAP_LOCK()		/* null */
221 #define PMAP_HEAD_TO_MAP_UNLOCK()	/* null */
222 
223 #define COUNT(x)	/* nothing */
224 
225 /*
226  * global data structures
227  */
228 
229 struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
230 
231 /*
232  * pmap_pg_wc: if our processor supports PAT then we set this
233  * to be the pte bits for Write Combining. Else we fall back to
234  * UC- so mtrrs can override the cacheability;
235  */
236 int pmap_pg_wc = PG_UCMINUS;
237 
238 /*
239  * other data structures
240  */
241 
242 pt_entry_t protection_codes[8];     /* maps MI prot to i386 prot code */
243 boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */
244 
245 /*
246  * pv management structures.
247  */
248 struct pool pmap_pv_pool;
249 
250 /*
251  * linked list of all non-kernel pmaps
252  */
253 
254 struct pmap_head pmaps;
255 
256 /*
257  * pool that pmap structures are allocated from
258  */
259 
260 struct pool pmap_pmap_pool;
261 
262 /*
263  * When we're freeing a ptp, we need to delay the freeing until all
264  * tlb shootdown has been done. This is the list of the to-be-freed pages.
265  */
266 TAILQ_HEAD(pg_to_free, vm_page);
267 
268 /*
269  * pool that PDPs are allocated from
270  */
271 
272 struct pool pmap_pdp_pool;
273 void pmap_pdp_ctor(pd_entry_t *);
274 
275 extern vaddr_t msgbuf_vaddr;
276 extern paddr_t msgbuf_paddr;
277 
278 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
279 extern paddr_t idt_paddr;
280 
281 extern vaddr_t lo32_vaddr;
282 extern vaddr_t lo32_paddr;
283 
284 vaddr_t virtual_avail;
285 extern int end;
286 
287 /*
288  * local prototypes
289  */
290 
291 void  pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
292     vaddr_t, struct vm_page *);
293 struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, pd_entry_t **);
294 struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
295 void pmap_free_ptp(struct pmap *, struct vm_page *,
296     vaddr_t, pt_entry_t *, pd_entry_t **, struct pg_to_free *);
297 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
298 static boolean_t pmap_is_active(struct pmap *, int);
299 void pmap_map_ptes(struct pmap *, pt_entry_t **, pd_entry_t ***);
300 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
301 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
302 boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
303     vaddr_t, int);
304 void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
305     vaddr_t, vaddr_t, int);
306 #define PMAP_REMOVE_ALL		0	/* remove all mappings */
307 #define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
308 
309 void pmap_unmap_ptes(struct pmap *);
310 boolean_t pmap_get_physpage(vaddr_t, int, paddr_t *);
311 boolean_t pmap_pdes_valid(vaddr_t, pd_entry_t **, pd_entry_t *);
312 void pmap_alloc_level(pd_entry_t **, vaddr_t, int, long *);
313 void pmap_apte_flush(struct pmap *pmap);
314 
315 void pmap_sync_flags_pte(struct vm_page *, u_long);
316 
317 /*
318  * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
319  */
320 
321 /*
322  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
323  *		of course the kernel is always loaded
324  */
325 
326 static __inline boolean_t
327 pmap_is_curpmap(struct pmap *pmap)
328 {
329 	return((pmap == pmap_kernel()) ||
330 	       (pmap->pm_pdirpa == (paddr_t) rcr3()));
331 }
332 
333 /*
334  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
335  */
336 
337 static __inline boolean_t
338 pmap_is_active(struct pmap *pmap, int cpu_id)
339 {
340 	return (pmap == pmap_kernel() ||
341 	    (pmap->pm_cpus & (1ULL << cpu_id)) != 0);
342 }
343 
344 static __inline u_int
345 pmap_pte2flags(u_long pte)
346 {
347 	return (((pte & PG_U) ? PG_PMAP_REF : 0) |
348 	    ((pte & PG_M) ? PG_PMAP_MOD : 0));
349 }
350 
351 void
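/*
 * pmap_sync_flags_pte: propagate a PTE's referenced/modified bits
 * (PG_U/PG_M) into the vm_page's pg_flags (PG_PMAP_REF/PG_PMAP_MOD).
 */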
352 pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
353 {
354 	if (pte & (PG_U|PG_M)) {
355 		atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
356 	}
357 }
358 
359 void
360 pmap_apte_flush(struct pmap *pmap)
361 {
362 	pmap_tlb_shoottlb();
363 	pmap_tlb_shootwait();
364 }
365 
366 /*
367  * pmap_map_ptes: map a pmap's PTEs into KVM
368  *
369  * => we lock enough pmaps to keep things locked in
370  * => must be undone with pmap_unmap_ptes before returning
371  */
372 
373 void
374 pmap_map_ptes(struct pmap *pmap, pt_entry_t **ptepp, pd_entry_t ***pdeppp)
375 {
376 	pd_entry_t opde, npde;
377 
378 	/* if curpmap then we are always mapped */
379 	if (pmap_is_curpmap(pmap)) {
380 		*ptepp = PTE_BASE;
381 		*pdeppp = normal_pdes;
382 		return;
383 	}
384 
385 	/* need to load a new alternate pt space into curpmap? */
386 	opde = *APDP_PDE;
387 	if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
388 		npde = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V);
389 		*APDP_PDE = npde;
390 		if (pmap_valid_entry(opde))
391 			pmap_apte_flush(curpcb->pcb_pmap);
392 	}
393 	*ptepp = APTE_BASE;
394 	*pdeppp = alternate_pdes;
395 }
396 
397 void
398 pmap_unmap_ptes(struct pmap *pmap)
399 {
400 	if (pmap_is_curpmap(pmap))
401 		return;
402 
403 #if defined(MULTIPROCESSOR)
404 	*APDP_PDE = 0;
405 	pmap_apte_flush(curpcb->pcb_pmap);
406 #endif
407 	COUNT(apdp_pde_unmap);
408 }
409 
410 /*
411  * p m a p   k e n t e r   f u n c t i o n s
412  *
413  * functions to quickly enter/remove pages from the kernel address
414  * space.   pmap_kremove is exported to MI kernel.  we make use of
415  * the recursive PTE mappings.
416  */
417 
418 /*
419  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
420  *
421  * => no need to lock anything, assume va is already allocated
422  * => should be faster than normal pmap enter function
423  */
424 
425 void
426 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
427 {
428 	pt_entry_t *pte, opte, npte;
429 
430 	pte = kvtopte(va);
431 
432 	npte = (pa & PMAP_PA_MASK) | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
433 	    ((pa & PMAP_NOCACHE) ? PG_N : 0) |
434 	    ((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V;
435 
436 	/* special 1:1 mappings in the first 2MB must not be global */
437 	if (va >= (vaddr_t)NBPD_L2)
438 		npte |= PG_G;
439 
440 	if ((cpu_feature & CPUID_NXE) && !(prot & VM_PROT_EXECUTE))
441 		npte |= PG_NX;
442 	opte = pmap_pte_set(pte, npte);
443 #ifdef LARGEPAGES
444 	/* XXX For now... */
445 	if (opte & PG_PS)
446 		panic("pmap_kenter_pa: PG_PS");
447 #endif
448 	if (pmap_valid_entry(opte)) {
449 		if (pa & PMAP_NOCACHE && (opte & PG_N) == 0)
450 			wbinvd();
451 		/* This shouldn't happen */
452 		pmap_tlb_shootpage(pmap_kernel(), va);
453 		pmap_tlb_shootwait();
454 	}
455 }
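
/*
 * example use (illustrative): enter an uncached, writable kernel
 * mapping at an already-allocated va; note that the PMAP_NOCACHE
 * flag is passed or'ed into the physical address, as the code above
 * expects:
 *
 *	pmap_kenter_pa(va, pa | PMAP_NOCACHE, VM_PROT_READ | VM_PROT_WRITE);
 */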
456 
457 /*
458  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
459  *
460  * => no need to lock anything
461  * => caller must dispose of any vm_page mapped in the va range
462  * => note: not an inline function
463  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
464  * => we assume kernel only unmaps valid addresses and thus don't bother
465  *    checking the valid bit before doing TLB flushing
466  */
467 
468 void
469 pmap_kremove(vaddr_t sva, vsize_t len)
470 {
471 	pt_entry_t *pte, opte;
472 	vaddr_t va, eva;
473 
474 	eva = sva + len;
475 
476 	for (va = sva; va != eva; va += PAGE_SIZE) {
477 		pte = kvtopte(va);
478 
479 		opte = pmap_pte_set(pte, 0);
480 #ifdef LARGEPAGES
481 		KASSERT((opte & PG_PS) == 0);
482 #endif
483 		KASSERT((opte & PG_PVLIST) == 0);
484 	}
485 
486 	pmap_tlb_shootrange(pmap_kernel(), sva, eva);
487 	pmap_tlb_shootwait();
488 }
489 
490 /*
491  * p m a p   i n i t   f u n c t i o n s
492  *
493  * pmap_bootstrap and pmap_init are called during system startup
494  * to init the pmap module.   pmap_bootstrap() does a low level
495  * init just to get things rolling.   pmap_init() finishes the job.
496  */
497 
498 /*
499  * pmap_bootstrap: get the system in a state where it can run with VM
500  *	properly enabled (called before main()).   the VM system is
501  *      fully init'd later...
502  *
503  * => on i386, locore.s has already enabled the MMU by allocating
504  *	a PDP for the kernel, and nkpde PTP's for the kernel.
505  * => kva_start is the first free virtual address in kernel space
506  */
507 
508 paddr_t
509 pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
510 {
511 	vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS;
512 	struct pmap *kpm;
513 	int i;
514 	unsigned long p1i;
515 	pt_entry_t pg_nx = (cpu_feature & CPUID_NXE? PG_NX : 0);
516 	long ndmpdp;
517 	paddr_t dmpd, dmpdp;
518 
519 	/*
520 	 * define the boundaries of the managed kernel virtual address
521 	 * space.
522 	 */
523 
524 	virtual_avail = kva_start;		/* first free KVA */
525 
526 	/*
527 	 * set up protection_codes: we need to be able to convert from
528 	 * a MI protection code (some combo of VM_PROT...) to something
529 	 * we can jam into a i386 PTE.
530 	 */
531 
532 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
533 	protection_codes[VM_PROT_EXECUTE] = PG_RO;		/* --x */
534 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
535 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO;	/* -rx */
536 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
537 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW;/* w-x */
538 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
539 								/* wr- */
540 	protection_codes[VM_PROT_ALL] = PG_RW;			/* wrx */
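	/*
	 * e.g. a VM_PROT_READ|VM_PROT_WRITE mapping gets PG_RW plus PG_NX
	 * (writable but not executable) when the CPU has the NX feature,
	 * and plain PG_RW otherwise.
	 */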
541 
542 	/*
543 	 * now we init the kernel's pmap
544 	 *
545 	 * the kernel pmap's pm_obj is not used for much.   however, in
546 	 * user pmaps the pm_obj contains the list of active PTPs.
547 	 * the pm_obj currently does not have a pager.   it might be possible
548 	 * to add a pager that would allow a process to read-only mmap its
549 	 * own page tables (fast user level vtophys?).   this may or may not
550 	 * be useful.
551 	 */
552 
553 	kpm = pmap_kernel();
554 	for (i = 0; i < PTP_LEVELS - 1; i++) {
555 		uvm_objinit(&kpm->pm_obj[i], NULL, 1);
556 		kpm->pm_ptphint[i] = NULL;
557 	}
558 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
559 	kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
560 	kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
561 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
562 		atop(kva_start - VM_MIN_KERNEL_ADDRESS);
563 
564 	/*
565 	 * the above is just a rough estimate and not critical to the proper
566 	 * operation of the system.
567 	 */
568 
569 	curpcb->pcb_pmap = kpm;	/* proc0's pcb */
570 
571 	/*
572 	 * enable global TLB entries.
573 	 */
574 	/* add PG_G attribute to already mapped kernel pages */
575 #if KERNBASE == VM_MIN_KERNEL_ADDRESS
576 	for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
577 #else
578 	kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
579 	for (kva = KERNBASE; kva < kva_end ;
580 #endif
581 	     kva += PAGE_SIZE) {
582 		p1i = pl1_i(kva);
583 		if (pmap_valid_entry(PTE_BASE[p1i]))
584 			PTE_BASE[p1i] |= PG_G;
585 	}
586 
587 	/*
588 	 * Map the direct map. The first 4GB were mapped in locore, here
589 	 * we map the rest if it exists. We actually use the direct map
590 	 * here to set up the page tables; we assume that we're still
591 	 * operating in the lower 4GB of memory.
592 	 */
593 	ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT;
594 	if (ndmpdp < NDML2_ENTRIES)
595 		ndmpdp = NDML2_ENTRIES;		/* At least 4GB */
596 
597 	dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & PG_FRAME;
598 
599 	dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE;
600 
601 	for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) {
602 		paddr_t pdp;
603 		vaddr_t va;
604 
605 		pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
606 		va = PMAP_DIRECT_MAP(pdp);
607 
608 		*((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
609 		*((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | PG_G | PG_U |
610 		    PG_M;
611 	}
612 
613 	for (i = NDML2_ENTRIES; i < ndmpdp; i++) {
614 		paddr_t pdp;
615 		vaddr_t va;
616 
617 		pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
618 		va = PMAP_DIRECT_MAP(pdp);
619 
620 		*((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT);
621 		*((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M;
622 	}
623 
624 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U |
625 	    PG_M;
626 
627 	tlbflush();
628 
629 	msgbuf_vaddr = virtual_avail;
630 	virtual_avail += round_page(MSGBUFSIZE);
631 
632 	idt_vaddr = virtual_avail;
633 	virtual_avail += 2 * PAGE_SIZE;
634 	idt_paddr = first_avail;			/* steal a page */
635 	first_avail += 2 * PAGE_SIZE;
636 
637 #if defined(MULTIPROCESSOR) || \
638     (NACPI > 0 && !defined(SMALL_KERNEL))
639 	/*
640 	 * Grab a page below 4G for things that need it (i.e.
641 	 * having an initial %cr3 for the MP trampoline).
642 	 */
643 	lo32_vaddr = virtual_avail;
644 	virtual_avail += PAGE_SIZE;
645 	lo32_paddr = first_avail;
646 	first_avail += PAGE_SIZE;
647 #endif
648 
649 	/*
650 	 * init the global lists.
651 	 */
652 	LIST_INIT(&pmaps);
653 
654 	/*
655 	 * initialize the pmap pool.
656 	 */
657 
658 	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, "pmappl",
659 	    &pool_allocator_nointr);
660 	pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, 0, 0, "pvpl",
661 	    &pool_allocator_nointr);
662 	pool_sethiwat(&pmap_pv_pool, 32 * 1024);
663 
664 	/*
665 	 * initialize the PDE pool.
666 	 */
667 
668 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl",
669 	    &pool_allocator_nointr);
670 
671 	/*
672 	 * ensure the TLB is sync'd with reality by flushing it...
673 	 */
674 
675 	tlbflush();
676 
677 	return first_avail;
678 }
679 
680 /*
681  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
682  * trampoline code can be entered.
683  */
684 paddr_t
685 pmap_prealloc_lowmem_ptps(paddr_t first_avail)
686 {
687 	pd_entry_t *pdes;
688 	int level;
689 	paddr_t newp;
690 
691 	pdes = pmap_kernel()->pm_pdir;
692 	level = PTP_LEVELS;
693 	for (;;) {
694 		newp = first_avail; first_avail += PAGE_SIZE;
695 		memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
696 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
697 		level--;
698 		if (level <= 1)
699 			break;
700 		pdes = normal_pdes[level - 2];
701 	}
702 
703 	return first_avail;
704 }
705 
706 /*
707  * pmap_init: called from uvm_init, our job is to get the pmap
708  * system ready to manage mappings... this mainly means initing
709  * the pv_entry stuff.
710  */
711 
712 void
713 pmap_init(void)
714 {
715 	/*
716 	 * done: pmap module is up (and ready for business)
717 	 */
718 
719 	pmap_initialized = TRUE;
720 }
721 
722 /*
723  * p v _ e n t r y   f u n c t i o n s
724  */
725 
726 /*
727  * main pv_entry manipulation functions:
728  *   pmap_enter_pv: enter a mapping onto a pv list
729  *   pmap_remove_pv: remove a mapping from a pv list
730  */
731 
732 /*
733  * pmap_enter_pv: enter a mapping onto a pv list
734  *
735  * => caller should adjust ptp's wire_count before calling
736  *
737  * pve: preallocated pve for us to use
738  * ptp: PTP in pmap that maps this VA
739  */
740 
741 void
742 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
743     vaddr_t va, struct vm_page *ptp)
744 {
745 	pve->pv_pmap = pmap;
746 	pve->pv_va = va;
747 	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
748 	pve->pv_next = pg->mdpage.pv_list;	/* add to ... */
749 	pg->mdpage.pv_list = pve;		/* ... list */
750 }
751 
752 /*
753  * pmap_remove_pv: try to remove a mapping from a pv_list
754  *
755  * => caller should adjust ptp's wire_count and free PTP if needed
756  * => we return the removed pve
757  */
758 
759 struct pv_entry *
760 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
761 {
762 	struct pv_entry *pve, **prevptr;
763 
764 	prevptr = &pg->mdpage.pv_list;
765 	while ((pve = *prevptr) != NULL) {
766 		if (pve->pv_pmap == pmap && pve->pv_va == va) {	/* match? */
767 			*prevptr = pve->pv_next;		/* remove it! */
768 			break;
769 		}
770 		prevptr = &pve->pv_next;		/* previous pointer */
771 	}
772 	return(pve);				/* return removed pve */
773 }
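
/*
 * example (illustrative): a page shared by two processes at different
 * VAs carries two pv_entries on its mdpage.pv_list, one per <pmap,va>
 * pair; pmap_page_remove() walks that list to tear down every mapping.
 */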
774 
775 /*
776  * p t p   f u n c t i o n s
777  */
778 
779 struct vm_page *
780 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
781 {
782 	int lidx = level - 1;
783 	struct vm_page *pg;
784 
785 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
786 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
787 		return (pmap->pm_ptphint[lidx]);
788 	}
789 	/* the lookup is the same at every level */
790 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
794 	return pg;
795 }
796 
797 void
798 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
799     struct pg_to_free *pagelist)
800 {
801 	int lidx;
802 	struct uvm_object *obj;
803 
804 	lidx = level - 1;
805 
806 	obj = &pmap->pm_obj[lidx];
807 	pmap->pm_stats.resident_count--;
808 	if (pmap->pm_ptphint[lidx] == ptp)
809 		pmap->pm_ptphint[lidx] = RB_ROOT(&obj->memt);
810 	ptp->wire_count = 0;
811 	uvm_pagerealloc(ptp, NULL, 0);
812 	TAILQ_INSERT_TAIL(pagelist, ptp, pageq);
813 }
814 
815 void
816 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
817     pt_entry_t *ptes, pd_entry_t **pdes, struct pg_to_free *pagelist)
818 {
819 	unsigned long index;
820 	int level;
821 	vaddr_t invaladdr;
822 	pd_entry_t opde;
823 
824 	level = 1;
825 	do {
826 		pmap_freepage(pmap, ptp, level, pagelist);
827 		index = pl_i(va, level + 1);
828 		opde = pmap_pte_set(&pdes[level - 1][index], 0);
829 		invaladdr = level == 1 ? (vaddr_t)ptes :
830 		    (vaddr_t)pdes[level - 2];
831 		pmap_tlb_shootpage(curpcb->pcb_pmap,
832 		    invaladdr + index * PAGE_SIZE);
833 #if defined(MULTIPROCESSOR)
834 		invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
835 		    (vaddr_t)normal_pdes[level - 2];
836 		pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE);
837 #endif
838 		if (level < PTP_LEVELS - 1) {
839 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
840 			ptp->wire_count--;
841 			if (ptp->wire_count > 1)
842 				break;
843 		}
844 	} while (++level < PTP_LEVELS);
845 }
846 
847 /*
848  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
849  *
850  * => pmap should NOT be pmap_kernel()
851  */
852 
853 
854 struct vm_page *
855 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes)
856 {
857 	struct vm_page *ptp, *pptp;
858 	int i;
859 	unsigned long index;
860 	pd_entry_t *pva;
861 	paddr_t ppa, pa;
862 	struct uvm_object *obj;
863 
864 	ptp = NULL;
865 	pa = (paddr_t)-1;
866 
867 	/*
868 	 * Loop through all page table levels seeing if we need to
869 	 * add a new page to that level.
870 	 */
871 	for (i = PTP_LEVELS; i > 1; i--) {
872 		/*
873 		 * Save values from previous round.
874 		 */
875 		pptp = ptp;
876 		ppa = pa;
877 
878 		index = pl_i(va, i);
879 		pva = pdes[i - 2];
880 
881 		if (pmap_valid_entry(pva[index])) {
882 			ppa = pva[index] & PG_FRAME;
883 			ptp = NULL;
884 			continue;
885 		}
886 
887 		obj = &pmap->pm_obj[i-2];
888 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
889 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
890 
891 		if (ptp == NULL)
892 			return NULL;
893 
894 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
895 		ptp->wire_count = 1;
896 		pmap->pm_ptphint[i - 2] = ptp;
897 		pa = VM_PAGE_TO_PHYS(ptp);
898 		pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
899 		pmap->pm_stats.resident_count++;
900 		/*
901 		 * If we're not in the top level, increase the
902 		 * wire count of the parent page.
903 		 */
904 		if (i < PTP_LEVELS) {
905 			if (pptp == NULL)
906 				pptp = pmap_find_ptp(pmap, va, ppa, i);
907 #ifdef DIAGNOSTIC
908 			if (pptp == NULL)
909 				panic("pde page disappeared");
910 #endif
911 			pptp->wire_count++;
912 		}
913 	}
914 
915 	/*
916 	 * ptp is not NULL if we just allocated a new ptp. If it's
917 	 * still NULL, we must look up the existing one.
918 	 */
919 	if (ptp == NULL) {
920 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
921 #ifdef DIAGNOSTIC
922 		if (ptp == NULL) {
923 			printf("va %lx ppa %lx\n", (unsigned long)va,
924 			    (unsigned long)ppa);
925 			panic("pmap_get_ptp: unmanaged user PTP");
926 		}
927 #endif
928 	}
929 
930 	pmap->pm_ptphint[0] = ptp;
931 	return(ptp);
932 }
933 
934 /*
935  * p m a p  l i f e c y c l e   f u n c t i o n s
936  */
937 
938 /*
939  * pmap_pdp_ctor: constructor for the PDP cache.
940  */
941 
942 void
943 pmap_pdp_ctor(pd_entry_t *pdir)
944 {
945 	paddr_t pdirpa;
946 	int npde;
947 
948 	/* fetch the physical address of the page directory. */
949 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
950 
951 	/* zero init area */
952 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
953 
954 	/* put in recursive PDE to map the PTEs */
955 	pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW;
956 
957 	npde = nkptp[PTP_LEVELS - 1];
958 
959 	/* put in kernel VM PDEs */
960 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
961 	    npde * sizeof(pd_entry_t));
962 
963 	/* zero the rest */
964 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
965 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
966 
967 	pdir[PDIR_SLOT_DIRECT] = pmap_kernel()->pm_pdir[PDIR_SLOT_DIRECT];
968 
969 #if VM_MIN_KERNEL_ADDRESS != KERNBASE
970 	pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)];
971 #endif
972 }
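
/*
 * resulting top-level layout (summary of the constructor above): the
 * user slots start out zeroed, PDIR_SLOT_PTE points back at the PDP
 * itself (the recursive mapping), and the kernel and direct map slots
 * are copied from the kernel pmap, so every new pmap shares the same
 * kernel portion.
 */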
973 
974 /*
975  * pmap_create: create a pmap
976  *
977  * => note: old pmap interface took a "size" args which allowed for
978  *	the creation of "software only" pmaps (not in bsd).
979  */
980 
981 struct pmap *
982 pmap_create(void)
983 {
984 	struct pmap *pmap;
985 	int i;
986 
987 	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
988 
989 	/* init uvm_object */
990 	for (i = 0; i < PTP_LEVELS - 1; i++) {
991 		uvm_objinit(&pmap->pm_obj[i], NULL, 1);
992 		pmap->pm_ptphint[i] = NULL;
993 	}
994 	pmap->pm_stats.wired_count = 0;
995 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
996 	pmap->pm_cpus = 0;
997 
998 	/* allocate PDP */
999 
1000 	/*
1001 	 * note that there is no need to splvm to protect us from
1002 	 * malloc since malloc allocates out of a submap and we should
1003 	 * have already allocated kernel PTPs to cover the range...
1004 	 */
1005 
1006 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
1007 	pmap_pdp_ctor(pmap->pm_pdir);
1008 
1009 	pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;
1010 
1011 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1012 	return (pmap);
1013 }
1014 
1015 /*
1016  * pmap_destroy: drop reference count on pmap.   free pmap if
1017  *	reference count goes to zero.
1018  */
1019 
1020 void
1021 pmap_destroy(struct pmap *pmap)
1022 {
1023 	struct vm_page *pg;
1024 	int refs;
1025 	int i;
1026 
1027 	/*
1028 	 * drop reference count
1029 	 */
1030 
1031 	refs = --pmap->pm_obj[0].uo_refs;
1032 	if (refs > 0) {
1033 		return;
1034 	}
1035 
1036 	/*
1037 	 * reference count is zero, free pmap resources and then free pmap.
1038 	 */
1039 
1040 #ifdef DIAGNOSTIC
1041 	if (pmap->pm_cpus != 0)
1042 		printf("pmap_destroy: pmap %p cpus=0x%llx\n",
1043 		    (void *)pmap, pmap->pm_cpus);
1044 #endif
1045 
1046 	/*
1047 	 * remove it from global list of pmaps
1048 	 */
1049 	LIST_REMOVE(pmap, pm_list);
1050 
1051 	/*
1052 	 * free any remaining PTPs
1053 	 */
1054 
1055 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1056 		while ((pg = RB_ROOT(&pmap->pm_obj[i].memt)) != NULL) {
1057 			KASSERT((pg->pg_flags & PG_BUSY) == 0);
1058 
1059 			pg->wire_count = 0;
1060 			uvm_pagefree(pg);
1061 		}
1062 	}
1063 
1064 	/*
1065 	 * MULTIPROCESSOR -- no need to flush out of other processors'
1066 	 * APTE space because we do that in pmap_unmap_ptes().
1067 	 */
1068 	/* XXX: need to flush it out of other processor's APTE space? */
1069 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
1070 
1071 	pool_put(&pmap_pmap_pool, pmap);
1072 }
1073 
1074 /*
1075  *	Add a reference to the specified pmap.
1076  */
1077 
1078 void
1079 pmap_reference(struct pmap *pmap)
1080 {
1081 	pmap->pm_obj[0].uo_refs++;
1082 }
1083 
1084 /*
1085  * pmap_activate: activate a process' pmap (fill in %cr3)
1086  *
1087  * => called from cpu_fork() and when switching pmaps during exec
1088  * => if p is the curproc, then load it into the MMU
1089  */
1090 
1091 void
1092 pmap_activate(struct proc *p)
1093 {
1094 	struct pcb *pcb = &p->p_addr->u_pcb;
1095 	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1096 
1097 	pcb->pcb_pmap = pmap;
1098 	pcb->pcb_cr3 = pmap->pm_pdirpa;
1099 	if (p == curproc) {
1100 		lcr3(pcb->pcb_cr3);
1101 
1102 		/*
1103 		 * mark the pmap in use by this processor.
1104 		 */
1105 		x86_atomic_setbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
1106 	}
1107 }
1108 
1109 /*
1110  * pmap_deactivate: deactivate a process' pmap
1111  */
1112 
1113 void
1114 pmap_deactivate(struct proc *p)
1115 {
1116 	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1117 
1118 	/*
1119 	 * mark the pmap no longer in use by this processor.
1120 	 */
1121 	x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
1122 }
1123 
1124 /*
1125  * end of lifecycle functions
1126  */
1127 
1128 /*
1129  * some misc. functions
1130  */
1131 
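/*
 * pmap_pdes_valid: check that every page directory level covering va
 * is present.  if so, optionally hand back the lowest-level PDE, which
 * either points at va's PTP or is a large-page mapping (PG_PS).
 */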
1132 boolean_t
1133 pmap_pdes_valid(vaddr_t va, pd_entry_t **pdes, pd_entry_t *lastpde)
1134 {
1135 	int i;
1136 	unsigned long index;
1137 	pd_entry_t pde;
1138 
1139 	for (i = PTP_LEVELS; i > 1; i--) {
1140 		index = pl_i(va, i);
1141 		pde = pdes[i - 2][index];
1142 		if ((pde & PG_V) == 0)
1143 			return FALSE;
1144 	}
1145 	if (lastpde != NULL)
1146 		*lastpde = pde;
1147 	return TRUE;
1148 }
1149 
1150 /*
1151  * pmap_extract: extract a PA for the given VA
1152  */
1153 
1154 boolean_t
1155 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1156 {
1157 	pt_entry_t *ptes, pte;
1158 	pd_entry_t pde, **pdes;
1159 
1160 	if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
1161 	    va < PMAP_DIRECT_END) {
1162 		*pap = va - PMAP_DIRECT_BASE;
1163 		return (TRUE);
1164 	}
1165 
1166 	pmap_map_ptes(pmap, &ptes, &pdes);
1167 	if (pmap_pdes_valid(va, pdes, &pde) == FALSE) {
		pmap_unmap_ptes(pmap);	/* undo pmap_map_ptes before returning */
1168 		return FALSE;
1169 	}
1170 
1171 	if (pde & PG_PS) {
1172 		if (pap != NULL)
1173 			*pap = (pde & PG_LGFRAME) | (va & 0x1fffff);
1174 		pmap_unmap_ptes(pmap);
1175 		return (TRUE);
1176 	}
1177 
1178 	pte = ptes[pl1_i(va)];
1179 	pmap_unmap_ptes(pmap);
1180 
1181 	if (__predict_true((pte & PG_V) != 0)) {
1182 		if (pap != NULL)
1183 			*pap = (pte & PG_FRAME) | (va & 0xfff);
1184 		return (TRUE);
1185 	}
1186 
1187 	return FALSE;
1188 }
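
/*
 * example use (illustrative):
 *
 *	paddr_t pa;
 *
 *	if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
 *		panic("va 0x%lx not mapped", va);
 */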
1189 
1190 /*
1191  * pmap_zero_page: zero a page
1192  */
1193 
1194 void
1195 pmap_zero_page(struct vm_page *pg)
1196 {
1197 	pagezero(pmap_map_direct(pg));
1198 }
1199 
1200 /*
1201  * pmap_flush_cache: flush the cache for a range of virtual addresses.
1202  */
1203 void
1204 pmap_flush_cache(vaddr_t addr, vsize_t len)
1205 {
1206 	vaddr_t	i;
1207 
1208 	if (curcpu()->ci_cflushsz == 0) {
1209 		wbinvd();
1210 		return;
1211 	}
1212 
1213 	/* all cpus that have clflush also have mfence. */
1214 	mfence();
1215 	for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
1216 		clflush(i);
1217 	mfence();
1218 }
1219 
1220 /*
1221  * pmap_pageidlezero: the same, for the idle loop page zero'er.
1222  * Returns TRUE if the page was zero'd, FALSE if we aborted for
1223  * some reason.
1224  */
1225 
1226 boolean_t
1227 pmap_pageidlezero(struct vm_page *pg)
1228 {
1229 	vaddr_t va = pmap_map_direct(pg);
1230 	boolean_t rv = TRUE;
1231 	long *ptr;
1232 	int i;
1233 
1234 	/*
1235 	 * XXX - We'd really like to do this uncached. But at this moment
1236  	 *       we're never called, so just pretend that this works.
1237 	 *       It shouldn't be too hard to create a second direct map
1238 	 *       with uncached mappings.
1239 	 */
1240 	for (i = 0, ptr = (long *) va; i < PAGE_SIZE / sizeof(long); i++) {
1241 		if (!curcpu_is_idle()) {
1242 
1243 			/*
1244 			 * A process has become ready.  Abort now,
1245 			 * so we don't keep it waiting while we
1246 			 * do slow memory access to finish this
1247 			 * page.
1248 			 */
1249 
1250 			rv = FALSE;
1251 			break;
1252 		}
1253 		*ptr++ = 0;
1254 	}
1255 
1256 	return (rv);
1257 }
1258 
1259 /*
1260  * pmap_copy_page: copy a page
1261  */
1262 
1263 void
1264 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
1265 {
1266 	vaddr_t srcva = pmap_map_direct(srcpg);
1267 	vaddr_t dstva = pmap_map_direct(dstpg);
1268 
1269 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
1270 }
1271 
1272 /*
1273  * p m a p   r e m o v e   f u n c t i o n s
1274  *
1275  * functions that remove mappings
1276  */
1277 
1278 /*
1279  * pmap_remove_ptes: remove PTEs from a PTP
1280  *
1281  * => must have proper locking on pmap_master_lock
1282  * => PTP must be mapped into KVA
1283  * => PTP should be null if pmap == pmap_kernel()
1284  */
1285 
1286 void
1287 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1288     vaddr_t startva, vaddr_t endva, int flags)
1289 {
1290 	struct pv_entry *pve;
1291 	pt_entry_t *pte = (pt_entry_t *) ptpva;
1292 	struct vm_page *pg;
1293 	pt_entry_t opte;
1294 
1295 	/*
1296 	 * note that ptpva points to the PTE that maps startva.   this may
1297 	 * or may not be the first PTE in the PTP.
1298 	 *
1299 	 * we loop through the PTP while there are still PTEs to look at
1300 	 * and the wire_count is greater than 1 (because we use the wire_count
1301 	 * to keep track of the number of real PTEs in the PTP).
1302 	 */
1303 
1304 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1305 			     ; pte++, startva += PAGE_SIZE) {
1306 		if (!pmap_valid_entry(*pte))
1307 			continue;			/* VA not mapped */
1308 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1309 			continue;
1310 		}
1311 
1312 		/* atomically save the old PTE and zap! it */
1313 		opte = pmap_pte_set(pte, 0);
1314 
1315 		if (opte & PG_W)
1316 			pmap->pm_stats.wired_count--;
1317 		pmap->pm_stats.resident_count--;
1318 
1319 		if (ptp)
1320 			ptp->wire_count--;		/* dropping a PTE */
1321 
1322 		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1323 
1324 		/*
1325 		 * if we are not on a pv list we are done.
1326 		 */
1327 
1328 		if ((opte & PG_PVLIST) == 0) {
1329 #ifdef DIAGNOSTIC
1330 			if (pg != NULL)
1331 				panic("pmap_remove_ptes: managed page without "
1332 				      "PG_PVLIST for 0x%lx", startva);
1333 #endif
1334 			continue;
1335 		}
1336 
1337 #ifdef DIAGNOSTIC
1338 		if (pg == NULL)
1339 			panic("pmap_remove_ptes: unmanaged page marked "
1340 			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
1341 			      startva, (u_long)(opte & PG_FRAME));
1342 #endif
1343 
1344 		/* sync R/M bits */
1345 		pmap_sync_flags_pte(pg, opte);
1346 		pve = pmap_remove_pv(pg, pmap, startva);
1347 
1348 		if (pve) {
1349 			pool_put(&pmap_pv_pool, pve);
1350 		}
1351 
1352 		/* end of "for" loop: time for next pte */
1353 	}
1354 }
1355 
1356 
1357 /*
1358  * pmap_remove_pte: remove a single PTE from a PTP
1359  *
1360  * => must have proper locking on pmap_master_lock
1361  * => PTP must be mapped into KVA
1362  * => PTP should be null if pmap == pmap_kernel()
1363  * => returns true if we removed a mapping
1364  */
1365 
1366 boolean_t
1367 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
1368     vaddr_t va, int flags)
1369 {
1370 	struct pv_entry *pve;
1371 	struct vm_page *pg;
1372 	pt_entry_t opte;
1373 
1374 	if (!pmap_valid_entry(*pte))
1375 		return(FALSE);		/* VA not mapped */
1376 	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1377 		return(FALSE);
1378 	}
1379 
1380 	/* atomically save the old PTE and zap! it */
1381 	opte = pmap_pte_set(pte, 0);
1382 
1383 	if (opte & PG_W)
1384 		pmap->pm_stats.wired_count--;
1385 	pmap->pm_stats.resident_count--;
1386 
1387 	if (ptp)
1388 		ptp->wire_count--;		/* dropping a PTE */
1389 
1390 	pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1391 
1392 	/*
1393 	 * if we are not on a pv list we are done.
1394 	 */
1395 	if ((opte & PG_PVLIST) == 0) {
1396 #ifdef DIAGNOSTIC
1397 		if (pg != NULL)
1398 			panic("pmap_remove_pte: managed page without "
1399 			      "PG_PVLIST for 0x%lx", va);
1400 #endif
1401 		return(TRUE);
1402 	}
1403 
1404 #ifdef DIAGNOSTIC
1405 	if (pg == NULL)
1406 		panic("pmap_remove_pte: unmanaged page marked "
1407 		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
1408 		    (u_long)(opte & PG_FRAME));
1409 #endif
1410 
1411 	/* sync R/M bits */
1412 	pmap_sync_flags_pte(pg, opte);
1413 	pve = pmap_remove_pv(pg, pmap, va);
1414 	if (pve)
1415 		pool_put(&pmap_pv_pool, pve);
1416 	return(TRUE);
1417 }
1418 
1419 /*
1420  * pmap_remove: top level mapping removal function
1421  *
1422  * => caller should not be holding any pmap locks
1423  */
1424 
1425 void
1426 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1427 {
1428 	pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
1429 }
1430 
1431 /*
1432  * pmap_do_remove: mapping removal guts
1433  *
1434  * => caller should not be holding any pmap locks
1435  */
1436 
1437 void
1438 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1439 {
1440 	pt_entry_t *ptes;
1441 	pd_entry_t **pdes, pde;
1442 	boolean_t result;
1443 	paddr_t ptppa;
1444 	vaddr_t blkendva;
1445 	struct vm_page *ptp;
1446 	vaddr_t va;
1447 	int shootall = 0;
1448 	struct pg_to_free empty_ptps;
1449 
1450 	TAILQ_INIT(&empty_ptps);
1451 
1452 	PMAP_MAP_TO_HEAD_LOCK();
1453 	pmap_map_ptes(pmap, &ptes, &pdes);
1454 
1455 	/*
1456 	 * removing one page?  take shortcut function.
1457 	 */
1458 
1459 	if (sva + PAGE_SIZE == eva) {
1460 		if (pmap_pdes_valid(sva, pdes, &pde)) {
1461 
1462 			/* PA of the PTP */
1463 			ptppa = pde & PG_FRAME;
1464 
1465 			/* get PTP if non-kernel mapping */
1466 
1467 			if (pmap == pmap_kernel()) {
1468 				/* we never free kernel PTPs */
1469 				ptp = NULL;
1470 			} else {
1471 				ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
1472 #ifdef DIAGNOSTIC
1473 				if (ptp == NULL)
1474 					panic("pmap_remove: unmanaged "
1475 					      "PTP detected");
1476 #endif
1477 			}
1478 
1479 			/* do it! */
1480 			result = pmap_remove_pte(pmap, ptp,
1481 			    &ptes[pl1_i(sva)], sva, flags);
1482 
1483 			/*
1484 			 * if mapping removed and the PTP is no longer
1485 			 * being used, free it!
1486 			 */
1487 
1488 			if (result && ptp && ptp->wire_count <= 1)
1489 				pmap_free_ptp(pmap, ptp, sva, ptes, pdes,
1490 				    &empty_ptps);
1491 			pmap_tlb_shootpage(pmap, sva);
1492 		}
1493 
1494 		pmap_tlb_shootwait();
1495 		pmap_unmap_ptes(pmap);
1496 		PMAP_MAP_TO_HEAD_UNLOCK();
1497 
1498 		while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1499 			TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1500 			uvm_pagefree(ptp);
1501 		}
1502 
1503 		return;
1504 	}
1505 
1506 	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1507 		shootall = 1;
1508 
1509 	for (va = sva; va < eva; va = blkendva) {
1510 		/* determine range of block */
1511 		blkendva = x86_round_pdr(va + 1);
1512 		if (blkendva > eva)
1513 			blkendva = eva;
1514 
1515 		/*
1516 		 * XXXCDC: our PTE mappings should never be removed
1517 		 * with pmap_remove!  if we allow this (and why would
1518 		 * we?) then we end up freeing the pmap's page
1519 		 * directory page (PDP) before we are finished using
1520 		 * it when we hit it in the recursive mapping.  this
1521 		 * is BAD.
1522 		 *
1523 		 * long term solution is to move the PTEs out of user
1524 		 * address space.  and into kernel address space (up
1525 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1526 		 * be VM_MAX_ADDRESS.
1527 		 */
1528 
1529 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
1530 			/* XXXCDC: ugly hack to avoid freeing PDP here */
1531 			continue;
1532 
1533 		if (!pmap_pdes_valid(va, pdes, &pde))
1534 			continue;
1535 
1536 		/* PA of the PTP */
1537 		ptppa = pde & PG_FRAME;
1538 
1539 		/* get PTP if non-kernel mapping */
1540 		if (pmap == pmap_kernel()) {
1541 			/* we never free kernel PTPs */
1542 			ptp = NULL;
1543 		} else {
1544 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
1545 #ifdef DIAGNOSTIC
1546 			if (ptp == NULL)
1547 				panic("pmap_remove: unmanaged PTP "
1548 				      "detected");
1549 #endif
1550 		}
1551 		pmap_remove_ptes(pmap, ptp,
1552 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, flags);
1553 
1554 		/* if PTP is no longer being used, free it! */
1555 		if (ptp && ptp->wire_count <= 1) {
1556 			pmap_free_ptp(pmap, ptp, va, ptes, pdes, &empty_ptps);
1557 		}
1558 	}
1559 
1560 	if (shootall)
1561 		pmap_tlb_shoottlb();
1562 	else
1563 		pmap_tlb_shootrange(pmap, sva, eva);
1564 
1565 	pmap_tlb_shootwait();
1566 
1567 	pmap_unmap_ptes(pmap);
1568 	PMAP_MAP_TO_HEAD_UNLOCK();
1569 
1570 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1571 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1572 		uvm_pagefree(ptp);
1573 	}
1574 }
1575 
1576 /*
1577  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1578  *
1579  * => R/M bits are sync'd back to attrs
1580  */
1581 
1582 void
1583 pmap_page_remove(struct vm_page *pg)
1584 {
1585 	struct pv_entry *pve;
1586 	pt_entry_t *ptes, opte;
1587 	pd_entry_t **pdes;
1588 #ifdef DIAGNOSTIC
1589 	pd_entry_t pde;
1590 #endif
1591 	struct pg_to_free empty_ptps;
1592 	struct vm_page *ptp;
1593 
1594 	TAILQ_INIT(&empty_ptps);
1595 
1596 	PMAP_HEAD_TO_MAP_LOCK();
1597 
1598 	while ((pve = pg->mdpage.pv_list) != NULL) {
1599 		pg->mdpage.pv_list = pve->pv_next;
1600 
1601 		pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);
1602 
1603 #ifdef DIAGNOSTIC
1604 		if (pve->pv_ptp && pmap_pdes_valid(pve->pv_va, pdes, &pde) &&
1605 		   (pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1606 			printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n",
1607 			       pg, pve->pv_va, pve->pv_ptp);
1608 			printf("pmap_page_remove: PTP's phys addr: "
1609 			       "actual=%lx, recorded=%lx\n",
1610 			       (unsigned long)(pde & PG_FRAME),
1611 				VM_PAGE_TO_PHYS(pve->pv_ptp));
1612 			panic("pmap_page_remove: mapped managed page has "
1613 			      "invalid pv_ptp field");
1614 		}
1615 #endif
1616 
1617 		/* atomically save the old PTE and zap it */
1618 		opte = pmap_pte_set(&ptes[pl1_i(pve->pv_va)], 0);
1619 
1620 		if (opte & PG_W)
1621 			pve->pv_pmap->pm_stats.wired_count--;
1622 		pve->pv_pmap->pm_stats.resident_count--;
1623 
1624 		pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1625 
1626 		pmap_sync_flags_pte(pg, opte);
1627 
1628 		/* update the PTP reference count.  free if last reference. */
1629 		if (pve->pv_ptp) {
1630 			pve->pv_ptp->wire_count--;
1631 			if (pve->pv_ptp->wire_count <= 1) {
1632 				pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
1633 				    pve->pv_va, ptes, pdes, &empty_ptps);
1634 			}
1635 		}
1636 		pmap_unmap_ptes(pve->pv_pmap);
1637 		pool_put(&pmap_pv_pool, pve);
1638 	}
1639 
1640 	PMAP_HEAD_TO_MAP_UNLOCK();
1641 	pmap_tlb_shootwait();
1642 
1643 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1644 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1645 		uvm_pagefree(ptp);
1646 	}
1647 }
1648 
1649 /*
1650  * p m a p   a t t r i b u t e  f u n c t i o n s
1651  * functions that test/change managed page's attributes
1652  * since a page can be mapped multiple times we must check each PTE that
1653  * maps it by going down the pv lists.
1654  */
1655 
1656 /*
1657  * pmap_test_attrs: test a page's attributes
1658  */
1659 
1660 boolean_t
1661 pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
1662 {
1663 	struct pv_entry *pve;
1664 	pt_entry_t *ptes, pte;
1665 	pd_entry_t **pdes;
1666 	u_long mybits, testflags;
1667 
1668 	testflags = pmap_pte2flags(testbits);
1669 
1670 	if (pg->pg_flags & testflags)
1671 		return (TRUE);
1672 
1673 	PMAP_HEAD_TO_MAP_LOCK();
1674 	mybits = 0;
1675 	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
1676 	    pve = pve->pv_next) {
1677 		pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);
1678 		pte = ptes[pl1_i(pve->pv_va)];
1679 		pmap_unmap_ptes(pve->pv_pmap);
1680 		mybits |= (pte & testbits);
1681 	}
1682 	PMAP_HEAD_TO_MAP_UNLOCK();
1683 
1684 	if (mybits == 0)
1685 		return (FALSE);
1686 
1687 	atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
1688 
1689 	return (TRUE);
1690 }
1691 
1692 /*
1693  * pmap_clear_attrs: change a page's attributes
1694  *
1695  * => we return TRUE if we cleared one of the bits we were asked to
1696  */
1697 
1698 boolean_t
1699 pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
1700 {
1701 	struct pv_entry *pve;
1702 	pt_entry_t *ptes, opte;
1703 	pd_entry_t **pdes;
1704 	u_long clearflags;
1705 	int result;
1706 
1707 	clearflags = pmap_pte2flags(clearbits);
1708 
1709 	PMAP_HEAD_TO_MAP_LOCK();
1710 
1711 	result = pg->pg_flags & clearflags;
1712 	if (result)
1713 		atomic_clearbits_int(&pg->pg_flags, clearflags);
1714 
1715 	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
1716 		pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);
1717 #ifdef DIAGNOSTIC
1718 		if (!pmap_pdes_valid(pve->pv_va, pdes, NULL))
1719 			panic("pmap_clear_attrs: mapping without PTP "
1720 			      "detected");
1721 #endif
1722 
1723 		opte = ptes[pl1_i(pve->pv_va)];
1724 		if (opte & clearbits) {
1725 			result = 1;
1726 			pmap_pte_clearbits(&ptes[pl1_i(pve->pv_va)],
1727 			    (opte & clearbits));
1728 			pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1729 		}
1730 		pmap_unmap_ptes(pve->pv_pmap);
1731 	}
1732 
1733 	PMAP_HEAD_TO_MAP_UNLOCK();
1734 
1735 	pmap_tlb_shootwait();
1736 
1737 	return (result != 0);
1738 }
1739 
1740 /*
1741  * p m a p   p r o t e c t i o n   f u n c t i o n s
1742  */
1743 
1744 /*
1745  * pmap_page_protect: change the protection of all recorded mappings
1746  *	of a managed page
1747  *
1748  * => NOTE: this is an inline function in pmap.h
1749  */
1750 
1751 /* see pmap.h */
1752 
1753 /*
1754  * pmap_protect: set the protection of the pages in a pmap
1755  *
1756  * => NOTE: this is an inline function in pmap.h
1757  */
1758 
1759 /* see pmap.h */
1760 
1761 /*
1762  * pmap_write_protect: write-protect pages in a pmap
1763  */
1764 
1765 void
1766 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
1767 {
1768 	pt_entry_t nx, *ptes, *spte, *epte;
1769 	pd_entry_t **pdes;
1770 	vaddr_t blockend;
1771 	int shootall = 0;
1772 	vaddr_t va;
1773 
1774 	pmap_map_ptes(pmap, &ptes, &pdes);
1775 
1776 	/* should be ok, but just in case ... */
1777 	sva &= PG_FRAME;
1778 	eva &= PG_FRAME;
1779 
1780 	nx = 0;
1781 	if ((cpu_feature & CPUID_NXE) && !(prot & VM_PROT_EXECUTE))
1782 		nx = PG_NX;
1783 
1784 	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1785 		shootall = 1;
1786 
1787 	for (va = sva; va < eva ; va = blockend) {
1788 		blockend = (va & L2_FRAME) + NBPD_L2;
1789 		if (blockend > eva)
1790 			blockend = eva;
1791 
1792 		/*
1793 		 * XXXCDC: our PTE mappings should never be write-protected!
1794 		 *
1795 		 * long term solution is to move the PTEs out of user
1796 		 * address space.  and into kernel address space (up
1797 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1798 		 * be VM_MAX_ADDRESS.
1799 		 */
1800 
1801 		/* XXXCDC: ugly hack to avoid freeing PDP here */
1802 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
1803 			continue;
1804 
1805 		/* empty block? */
1806 		if (!pmap_pdes_valid(va, pdes, NULL))
1807 			continue;
1808 
1809 #ifdef DIAGNOSTIC
1810 		if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS)
1811 			panic("pmap_write_protect: PTE space");
1812 #endif
1813 
1814 		spte = &ptes[pl1_i(va)];
1815 		epte = &ptes[pl1_i(blockend)];
1816 
1817 		for (/*null */; spte < epte ; spte++) {
1818 			if (!(*spte & PG_V))
1819 				continue;
1820 			pmap_pte_clearbits(spte, PG_RW);
1821 			pmap_pte_setbits(spte, nx);
1822 		}
1823 	}
1824 
1825 	if (shootall)
1826 		pmap_tlb_shoottlb();
1827 	else
1828 		pmap_tlb_shootrange(pmap, sva, eva);
1829 
1830 	pmap_tlb_shootwait();
1831 
1832 	pmap_unmap_ptes(pmap);
1833 }
1834 
1835 /*
1836  * end of protection functions
1837  */
1838 
1839 /*
1840  * pmap_unwire: clear the wired bit in the PTE
1841  *
1842  * => mapping should already be in map
1843  */
1844 
1845 void
1846 pmap_unwire(struct pmap *pmap, vaddr_t va)
1847 {
1848 	pt_entry_t *ptes;
1849 	pd_entry_t **pdes;
1850 
1851 	pmap_map_ptes(pmap, &ptes, &pdes);
1852 
1853 	if (pmap_pdes_valid(va, pdes, NULL)) {
1854 
1855 #ifdef DIAGNOSTIC
1856 		if (!pmap_valid_entry(ptes[pl1_i(va)]))
1857 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
1858 #endif
1859 		if ((ptes[pl1_i(va)] & PG_W) != 0) {
1860 			pmap_pte_clearbits(&ptes[pl1_i(va)], PG_W);
1861 			pmap->pm_stats.wired_count--;
1862 		}
1863 #ifdef DIAGNOSTIC
1864 		else {
1865 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
1866 			       "didn't change!\n", pmap, va);
1867 		}
1868 #endif
1869 		pmap_unmap_ptes(pmap);
1870 	}
1871 #ifdef DIAGNOSTIC
1872 	else {
1873 		panic("pmap_unwire: invalid PDE");
1874 	}
1875 #endif
1876 }
1877 
1878 /*
1879  * pmap_collect: free resources held by a pmap
1880  *
1881  * => optional function.
1882  * => called when a process is swapped out to free memory.
1883  */
1884 
1885 void
1886 pmap_collect(struct pmap *pmap)
1887 {
1888 	/*
1889 	 * free all of the pt pages by removing the physical mappings
1890 	 * for its entire address space.
1891 	 */
1892 
1893 /*	pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
1894 	    PMAP_REMOVE_SKIPWIRED);
1895 */
1896 }
1897 
1898 /*
1899  * pmap_copy: copy mappings from one pmap to another
1900  *
1901  * => optional function
1902  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
1903  */
1904 
1905 /*
1906  * defined as macro in pmap.h
1907  */
1908 
1909 /*
1910  * pmap_enter: enter a mapping into a pmap
1911  *
1912  * => must be done "now" ... no lazy-evaluation
1913  */
1914 
1915 int
1916 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
1917 {
1918 	pt_entry_t *ptes, opte, npte;
1919 	pd_entry_t **pdes;
1920 	struct vm_page *ptp, *pg = NULL;
1921 	struct pv_entry *pve = NULL;
1922 	int ptpdelta, wireddelta, resdelta;
1923 	boolean_t wired = (flags & PMAP_WIRED) != 0;
1924 	boolean_t nocache = (pa & PMAP_NOCACHE) != 0;
1925 	boolean_t wc = (pa & PMAP_WC) != 0;
1926 	int error;
1927 
1928 	KASSERT(!(wc && nocache));
1929 	pa &= PMAP_PA_MASK;
1930 
1931 #ifdef DIAGNOSTIC
1932 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
1933 		panic("pmap_enter: trying to map over PDP/APDP!");
1934 
1935 	/* sanity check: kernel PTPs should already have been pre-allocated */
1936 	if (va >= VM_MIN_KERNEL_ADDRESS &&
1937 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
1938 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
1939 
1940 #endif
1941 
1942 	/* get lock */
1943 	PMAP_MAP_TO_HEAD_LOCK();
1944 
1945 	/*
1946 	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
1947 	 */
1948 
1949 	pmap_map_ptes(pmap, &ptes, &pdes);
1950 	if (pmap == pmap_kernel()) {
1951 		ptp = NULL;
1952 	} else {
1953 		ptp = pmap_get_ptp(pmap, va, pdes);
1954 		if (ptp == NULL) {
1955 			if (flags & PMAP_CANFAIL) {
1956 				error = ENOMEM;
1957 				goto out;
1958 			}
1959 			panic("pmap_enter: get ptp failed");
1960 		}
1961 	}
1962 	opte = ptes[pl1_i(va)];		/* old PTE */
1963 
1964 	/*
1965 	 * is there currently a valid mapping at our VA?
1966 	 */
1967 
1968 	if (pmap_valid_entry(opte)) {
1969 		/*
1970 		 * first, calculate pm_stats updates.  resident count will not
1971 		 * change since we are replacing/changing a valid mapping.
1972 		 * wired count might change...
1973 		 */
1974 
1975 		resdelta = 0;
1976 		if (wired && (opte & PG_W) == 0)
1977 			wireddelta = 1;
1978 		else if (!wired && (opte & PG_W) != 0)
1979 			wireddelta = -1;
1980 		else
1981 			wireddelta = 0;
1982 		ptpdelta = 0;
1983 
1984 		/*
1985 		 * is the currently mapped PA the same as the one we
1986 		 * want to map?
1987 		 */
1988 
1989 		if ((opte & PG_FRAME) == pa) {
1990 
1991 			/* if this is on the PVLIST, sync R/M bit */
1992 			if (opte & PG_PVLIST) {
1993 				pg = PHYS_TO_VM_PAGE(pa);
1994 #ifdef DIAGNOSTIC
1995 				if (pg == NULL)
1996 					panic("pmap_enter: same pa PG_PVLIST "
1997 					      "mapping with unmanaged page "
1998 					      "pa = 0x%lx (0x%lx)", pa,
1999 					      atop(pa));
2000 #endif
2001 				pmap_sync_flags_pte(pg, opte);
2002 			} else {
2003 #ifdef DIAGNOSTIC
2004 				if (PHYS_TO_VM_PAGE(pa) != NULL)
2005 					panic("pmap_enter: same pa, managed "
2006 					    "page, no PG_VLIST pa: 0x%lx\n",
2007 					    pa);
2008 #endif
2009 			}
2010 			goto enter_now;
2011 		}
2012 
2013 		/*
2014 		 * changing PAs: we must remove the old one first
2015 		 */
2016 
2017 		/*
2018 		 * if current mapping is on a pvlist,
2019 		 * remove it (sync R/M bits)
2020 		 */
2021 
2022 		if (opte & PG_PVLIST) {
2023 			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2024 #ifdef DIAGNOSTIC
2025 			if (pg == NULL)
2026 				panic("pmap_enter: PG_PVLIST mapping with "
2027 				      "unmanaged page "
2028 				      "pa = 0x%lx (0x%lx)", pa, atop(pa));
2029 #endif
2030 			pmap_sync_flags_pte(pg, opte);
2031 			pve = pmap_remove_pv(pg, pmap, va);
2032 			pg = NULL; /* This is not the page we are looking for */
2033 		}
2034 	} else {	/* opte not valid */
2035 		pve = NULL;
2036 		resdelta = 1;
2037 		if (wired)
2038 			wireddelta = 1;
2039 		else
2040 			wireddelta = 0;
2041 		if (ptp)
2042 			ptpdelta = 1;
2043 		else
2044 			ptpdelta = 0;
2045 	}
2046 
2047 	/*
2048 	 * pve is either NULL or points to a now-free pv_entry structure
2049 	 * (the latter case is if we called pmap_remove_pv above).
2050 	 *
2051 	 * if this entry is to be on a pvlist, enter it now.
2052 	 */
2053 
2054 	if (pmap_initialized)
2055 		pg = PHYS_TO_VM_PAGE(pa);
2056 
2057 	if (pg != NULL) {
2058 		if (pve == NULL) {
2059 			pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
2060 			if (pve == NULL) {
2061 				if (flags & PMAP_CANFAIL) {
2062 					error = ENOMEM;
2063 					goto out;
2064 				}
2065 				panic("pmap_enter: no pv entries available");
2066 			}
2067 		}
2068 		pmap_enter_pv(pg, pve, pmap, va, ptp);
2069 	} else {
2070 		/* new mapping is not PG_PVLIST.   free pve if we've got one */
2071 		if (pve)
2072 			pool_put(&pmap_pv_pool, pve);
2073 	}
2074 
2075 enter_now:
2076 	/*
2077 	 * at this point pg is !NULL if we want the PG_PVLIST bit set
2078 	 */
2079 
2080 	pmap->pm_stats.resident_count += resdelta;
2081 	pmap->pm_stats.wired_count += wireddelta;
2082 	if (ptp)
2083 		ptp->wire_count += ptpdelta;
2084 
2085 	if (pg != PHYS_TO_VM_PAGE(pa))
2086 		panic("wtf?");
2087 
2088 	npte = pa | protection_codes[prot] | PG_V;
2089 	if (pg != NULL) {
2090 		npte |= PG_PVLIST;
2091 		/*
2092 		 * make sure that if the page is write combined all
2093 		 * instances of pmap_enter make it so.
2094 		 */
2095 		if (pg->pg_flags & PG_PMAP_WC) {
2096 			KASSERT(nocache == 0);
2097 			wc = TRUE;
2098 		}
2099 	}
2100 	if (wc)
2101 		npte |= pmap_pg_wc;
2102 	if (wired)
2103 		npte |= PG_W;
2104 	if (nocache)
2105 		npte |= PG_N;
2106 	if (va < VM_MAXUSER_ADDRESS)
2107 		npte |= PG_u;
2108 	else if (va < VM_MAX_ADDRESS)
2109 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
2110 	if (pmap == pmap_kernel())
2111 		npte |= PG_G;
2112 
2113 	ptes[pl1_i(va)] = npte;		/* zap! */
2114 
2115 	/*
2116 	 * If we changed anything other than modified/used bits,
2117 	 * flush the TLB.  (is this overkill?)
2118 	 */
2119 	if (opte & PG_V) {
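		/*
		 * if the mapping just became uncacheable, flush the
		 * caches first so no dirty lines remain for the page.
		 */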
2120 		if (nocache && (opte & PG_N) == 0)
2121 			wbinvd();
2122 		pmap_tlb_shootpage(pmap, va);
2123 		pmap_tlb_shootwait();
2124 	}
2125 
2126 	error = 0;
2127 
2128 out:
2129 	pmap_unmap_ptes(pmap);
2130 	PMAP_MAP_TO_HEAD_UNLOCK();
2131 
2132 	return error;
2133 }
2134 
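/*
 * pmap_get_physpage: allocate a zeroed physical page for use as a
 * kernel page table page and return its physical address in *paddrp.
 *
 * => before uvm page initialization is done, the page is stolen
 *	via pmap_steal_memory() instead of uvm_pagealloc()
 */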
2135 boolean_t
2136 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
2137 {
2138 	struct vm_page *ptp;
2139 	struct pmap *kpm = pmap_kernel();
2140 
2141 	if (uvm.page_init_done == FALSE) {
2142 		vaddr_t va;
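		/*
		 * note: this local shadows the va argument; it holds
		 * the direct-mapped address of the page stolen below.
		 */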
2143 
2144 		/*
2145 		 * we're growing the kernel pmap early (from
2146 		 * uvm_pageboot_alloc()).  this case must be
2147 		 * handled a little differently.
2148 		 */
2149 
2150 		va = pmap_steal_memory(PAGE_SIZE, NULL, NULL);
2151 		*paddrp = PMAP_DIRECT_UNMAP(va);
2152 	} else {
2153 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
2154 				    ptp_va2o(va, level), NULL,
2155 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2156 		if (ptp == NULL)
2157 			panic("pmap_get_physpage: out of memory");
2158 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2159 		ptp->wire_count = 1;
2160 		*paddrp = VM_PAGE_TO_PHYS(ptp);
2161 	}
2162 	kpm->pm_stats.resident_count++;
2163 	return TRUE;
2164 }
2165 
2166 /*
2167  * Allocate the specified number of PTPs for a PTP level, and populate
2168  * all levels below accordingly, mapping virtual addresses starting at
2169  * kva.
2170  *
2171  * Used by pmap_growkernel.
2172  */
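/*
 * For scale, with 4KB pages and 512 entries per table: each level 2
 * entry installed here points at a level 1 PTP covering 512 * 4KB =
 * 2MB of VA, each level 3 entry covers 512 * 2MB = 1GB, and each
 * level 4 entry covers 512 * 1GB = 512GB; these are the nbpd[]
 * strides used in the loop below.
 */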
2173 void
2174 pmap_alloc_level(pd_entry_t **pdes, vaddr_t kva, int lvl, long *needed_ptps)
2175 {
2176 	unsigned long i;
2177 	vaddr_t va;
2178 	paddr_t pa;
2179 	unsigned long index, endindex;
2180 	int level;
2181 	pd_entry_t *pdep;
2182 
2183 	for (level = lvl; level > 1; level--) {
2184 		if (level == PTP_LEVELS)
2185 			pdep = pmap_kernel()->pm_pdir;
2186 		else
2187 			pdep = pdes[level - 2];
2188 		va = kva;
2189 		index = pl_i(kva, level);
2190 		endindex = index + needed_ptps[level - 1];
2191 		/*
2192 		 * XXX special case for first time call.
2193 		 */
2194 		if (nkptp[level - 1] != 0)
2195 			index++;
2196 		else
2197 			endindex--;
2198 
2199 		for (i = index; i <= endindex; i++) {
2200 			pmap_get_physpage(va, level - 1, &pa);
2201 			pdep[i] = pa | PG_RW | PG_V;
2202 			nkptp[level - 1]++;
2203 			va += nbpd[level - 1];
2204 		}
2205 	}
2206 }
2207 
2208 /*
2209  * pmap_growkernel: increase usage of KVM space
2210  *
2211  * => we allocate new PTPs for the kernel and install them in all
2212  *	the pmaps on the system.
2213  */
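/*
 * The value returned is the new end of usable kernel VA: maxkvaddr
 * rounded up by x86_round_pdr() (the next 2MB boundary on amd64), or
 * the current pmap_maxkvaddr if no growth was needed.
 */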
2214 
2215 static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;
2216 
2217 vaddr_t
2218 pmap_growkernel(vaddr_t maxkvaddr)
2219 {
2220 	struct pmap *kpm = pmap_kernel(), *pm;
2221 	int s, i;
2222 	unsigned newpdes;
2223 	long needed_kptp[PTP_LEVELS], target_nptp, old;
2224 
2225 	if (maxkvaddr <= pmap_maxkvaddr)
2226 		return pmap_maxkvaddr;
2227 
2228 	maxkvaddr = x86_round_pdr(maxkvaddr);
2229 	old = nkptp[PTP_LEVELS - 1];
2230 	/*
2231 	 * This loop could be optimized more, but pmap_growkernel()
2232 	 * is called infrequently.
2233 	 */
2234 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
2235 		target_nptp = pl_i(maxkvaddr, i + 1) -
2236 		    pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
2237 		/*
2238 		 * XXX only need to check toplevel.
2239 		 */
2240 		if (target_nptp > nkptpmax[i])
2241 			panic("out of KVA space");
2242 		needed_kptp[i] = target_nptp - nkptp[i] + 1;
2243 	}
2244 
2245 
2246 	s = splhigh();	/* to be safe */
2247 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS,
2248 	    needed_kptp);
2249 
2250 	/*
2251 	 * If the number of top level entries changed, update all
2252 	 * pmaps.
2253 	 */
2254 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
2255 		newpdes = nkptp[PTP_LEVELS - 1] - old;
2256 		LIST_FOREACH(pm, &pmaps, pm_list) {
2257 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
2258 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
2259 			       newpdes * sizeof (pd_entry_t));
2260 		}
2261 
2262 		/* Invalidate the PDP cache. */
2263 #if 0
2264 		pool_cache_invalidate(&pmap_pdp_cache);
2265 #endif
2266 	}
2267 	pmap_maxkvaddr = maxkvaddr;
2268 	splx(s);
2269 
2270 	return maxkvaddr;
2271 }
2272 
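/*
 * pmap_steal_memory: steal zeroed pages straight out of a physical
 * segment, for use before uvm page initialization is done.
 *
 * => size is rounded up to a whole number of pages
 * => the pages are taken from the start or end of a segment and the
 *	segment is shrunk (or removed entirely if it is used up)
 * => returns the direct-mapped VA of the stolen, zeroed memory
 */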
2273 vaddr_t
2274 pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
2275 {
2276 	int segno;
2277 	u_int npg;
2278 	vaddr_t va;
2279 	paddr_t pa;
2280 	struct vm_physseg *seg;
2281 
2282 	size = round_page(size);
2283 	npg = atop(size);
2284 
2285 	for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
2286 		if (seg->avail_end - seg->avail_start < npg)
2287 			continue;
2288 		/*
2289 		 * We can only steal at an ``unused'' segment boundary,
2290 		 * i.e. either at the start or at the end.
2291 		 */
2292 		if (seg->avail_start == seg->start ||
2293 		    seg->avail_end == seg->end)
2294 			break;
2295 	}
2296 	if (segno == vm_nphysseg) {
2297 		panic("pmap_steal_memory: out of memory");
2298 	} else {
2299 		if (seg->avail_start == seg->start) {
2300 			pa = ptoa(seg->avail_start);
2301 			seg->avail_start += npg;
2302 			seg->start += npg;
2303 		} else {
2304 			pa = ptoa(seg->avail_end) - size;
2305 			seg->avail_end -= npg;
2306 			seg->end -= npg;
2307 		}
2308 		/*
2309 		 * If the segment has now been fully consumed, remove it.
2310 		 * Note that the crash dump code still knows about it
2311 		 * and will dump it correctly.
2312 		 */
2313 		if (seg->start == seg->end) {
2314 			if (vm_nphysseg-- == 1)
2315 				panic("pmap_steal_memory: out of memory");
2316 			while (segno < vm_nphysseg) {
2317 				seg[0] = seg[1]; /* struct copy */
2318 				seg++;
2319 				segno++;
2320 			}
2321 		}
2322 
2323 		va = PMAP_DIRECT_MAP(pa);
2324 		memset((void *)va, 0, size);
2325 	}
2326 
2327 	if (start != NULL)
2328 		*start = virtual_avail;
2329 	if (end != NULL)
2330 		*end = VM_MAX_KERNEL_ADDRESS;
2331 
2332 	return (va);
2333 }
2334 
2335 #ifdef DEBUG
2336 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
2337 
2338 /*
2339  * pmap_dump: dump all the mappings from a pmap
2340  *
2341  * => caller should not be holding any pmap locks
2342  */
2343 
2344 void
2345 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
2346 {
2347 	pt_entry_t *ptes, *pte;
2348 	pd_entry_t **pdes;
2349 	vaddr_t blkendva;
2350 
2351 	/*
2352 	 * if end is out of range, truncate it.
2353 	 * if end <= start, update end to the max user address.
2354 	 */
2355 
2356 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
2357 		eva = VM_MAXUSER_ADDRESS;
2358 
2359 
2360 	PMAP_MAP_TO_HEAD_LOCK();
2361 	pmap_map_ptes(pmap, &ptes, &pdes);
2362 
2363 	/*
2364 	 * dumping a range of pages: we dump in PTP-sized blocks (2MB)
2365 	 */
2366 
2367 	for (/* null */ ; sva < eva ; sva = blkendva) {
2368 
2369 		/* determine range of block */
2370 		blkendva = x86_round_pdr(sva+1);
2371 		if (blkendva > eva)
2372 			blkendva = eva;
2373 
2374 		/* valid block? */
2375 		if (!pmap_pdes_valid(sva, pdes, NULL))
2376 			continue;
2377 
2378 		pte = &ptes[pl1_i(sva)];
2379 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
2380 			if (!pmap_valid_entry(*pte))
2381 				continue;
2382 			printf("va %#lx -> pa %#lx (pte=%#lx)\n",
2383 			       sva, *pte & PG_FRAME, *pte);
2384 		}
2385 	}
2386 	pmap_unmap_ptes(pmap);
2387 	PMAP_MAP_TO_HEAD_UNLOCK();
2388 }
2389 #endif
2390 
2391 void
2392 pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
2393 {
2394 	*vstartp = virtual_avail;
2395 	*vendp = VM_MAX_KERNEL_ADDRESS;
2396 }
2397 
2398 #ifdef MULTIPROCESSOR
2399 /*
2400  * Locking for tlb shootdown.
2401  *
2402  * We lock by setting tlb_shoot_wait to the number of cpus that will
2403  * receive our tlb shootdown. After sending the IPIs, we don't need to
2404  * worry about locking order or interrupts spinning for the lock because
2405  * the call that grabs the "lock" isn't the one that releases it. And
2406  * there is nothing that can block the IPI that releases the lock.
2407  *
2408  * The functions are organized so that we first count the number of
2409  * cpus we need to send the IPI to, then we grab the counter, then
2410  * we send the IPIs, then we finally do our own shootdown.
2411  *
2412  * Our shootdown is last to make it parallel with the other cpus
2413  * to shorten the spin time.
2414  *
2415  * Note that we depend on IPI send failures only being possible during
2416  * boot. If they happen later, the above assumption no longer holds,
2417  * since we can end up in a situation where no one will release the
2418  * lock if we take an interrupt at a bad moment.
2419  */
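
/*
 * In outline, the shootdown functions below do:
 *
 *	1. count the other running cpus that need the invalidation and
 *	   remember them in a cpu mask ("wait" and "mask")
 *	2. spin until tlb_shoot_wait can be raised from 0 to wait
 *	3. publish the target address(es) in tlb_shoot_addr1/addr2
 *	4. send the invalidation IPI to every cpu in the mask
 *	5. do our own invalidation last
 *
 * The IPI handlers (elsewhere) decrement tlb_shoot_wait as they
 * finish, and pmap_tlb_shootwait() spins until it drops back to zero.
 */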
2420 
2421 volatile long tlb_shoot_wait;
2422 
2423 volatile vaddr_t tlb_shoot_addr1;
2424 volatile vaddr_t tlb_shoot_addr2;
2425 
2426 void
2427 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
2428 {
2429 	struct cpu_info *ci, *self = curcpu();
2430 	CPU_INFO_ITERATOR cii;
2431 	long wait = 0;
2432 	u_int64_t mask = 0;
2433 
2434 	CPU_INFO_FOREACH(cii, ci) {
2435 		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
2436 		    !(ci->ci_flags & CPUF_RUNNING))
2437 			continue;
2438 		mask |= (1ULL << ci->ci_cpuid);
2439 		wait++;
2440 	}
2441 
2442 	if (wait > 0) {
2443 		int s = splvm();
2444 
2445 		while (x86_atomic_cas_ul(&tlb_shoot_wait, 0, wait) != 0) {
2446 			while (tlb_shoot_wait != 0)
2447 				SPINLOCK_SPIN_HOOK;
2448 		}
2449 		tlb_shoot_addr1 = va;
2450 		CPU_INFO_FOREACH(cii, ci) {
2451 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2452 				continue;
2453 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
2454 				panic("pmap_tlb_shootpage: ipi failed");
2455 		}
2456 		splx(s);
2457 	}
2458 
2459 	if (pmap_is_curpmap(pm))
2460 		pmap_update_pg(va);
2461 }
2462 
2463 void
2464 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
2465 {
2466 	struct cpu_info *ci, *self = curcpu();
2467 	CPU_INFO_ITERATOR cii;
2468 	long wait = 0;
2469 	u_int64_t mask = 0;
2470 	vaddr_t va;
2471 
2472 	CPU_INFO_FOREACH(cii, ci) {
2473 		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
2474 		    !(ci->ci_flags & CPUF_RUNNING))
2475 			continue;
2476 		mask |= (1ULL << ci->ci_cpuid);
2477 		wait++;
2478 	}
2479 
2480 	if (wait > 0) {
2481 		int s = splvm();
2482 
2483 		while (x86_atomic_cas_ul(&tlb_shoot_wait, 0, wait) != 0) {
2484 			while (tlb_shoot_wait != 0)
2485 				SPINLOCK_SPIN_HOOK;
2486 		}
2487 		tlb_shoot_addr1 = sva;
2488 		tlb_shoot_addr2 = eva;
2489 		CPU_INFO_FOREACH(cii, ci) {
2490 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2491 				continue;
2492 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
2493 				panic("pmap_tlb_shootrange: ipi failed");
2494 		}
2495 		splx(s);
2496 	}
2497 
2498 	if (pmap_is_curpmap(pm))
2499 		for (va = sva; va < eva; va += PAGE_SIZE)
2500 			pmap_update_pg(va);
2501 }
2502 
2503 void
2504 pmap_tlb_shoottlb(void)
2505 {
2506 	struct cpu_info *ci, *self = curcpu();
2507 	CPU_INFO_ITERATOR cii;
2508 	long wait = 0;
2509 	u_int64_t mask = 0;
2510 
2511 	CPU_INFO_FOREACH(cii, ci) {
2512 		if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
2513 			continue;
2514 		mask |= (1ULL << ci->ci_cpuid);
2515 		wait++;
2516 	}
2517 
2518 	if (wait) {
2519 		int s = splvm();
2520 
2521 		while (x86_atomic_cas_ul(&tlb_shoot_wait, 0, wait) != 0) {
2522 			while (tlb_shoot_wait != 0)
2523 				SPINLOCK_SPIN_HOOK;
2524 		}
2525 
2526 		CPU_INFO_FOREACH(cii, ci) {
2527 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2528 				continue;
2529 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
2530 				panic("pmap_tlb_shoottlb: ipi failed");
2531 		}
2532 		splx(s);
2533 	}
2534 
2535 	tlbflush();
2536 }
2537 
2538 void
2539 pmap_tlb_shootwait(void)
2540 {
2541 	while (tlb_shoot_wait != 0)
2542 		SPINLOCK_SPIN_HOOK;
2543 }
2544 
2545 #else
2546 
2547 void
2548 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
2549 {
2550 	if (pmap_is_curpmap(pm))
2551 		pmap_update_pg(va);
2552 
2553 }
2554 
2555 void
2556 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
2557 {
2558 	vaddr_t va;
2559 
2560 	for (va = sva; va < eva; va += PAGE_SIZE)
2561 		pmap_update_pg(va);
2562 
2563 }
2564 
2565 void
2566 pmap_tlb_shoottlb(void)
2567 {
2568 	tlbflush();
2569 }
2570 #endif /* MULTIPROCESSOR */
2571