1 /*	$OpenBSD: pmap.c,v 1.226 2024/11/08 13:18:29 jsg Exp $	*/
2 /*	$NetBSD: pmap.c,v 1.91 2000/06/02 17:46:37 thorpej Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * pmap.c: i386 pmap module rewrite
31  * Chuck Cranor <chuck@ccrc.wustl.edu>
32  * 11-Aug-97
33  *
34  * history of this pmap module: in addition to my own input, i used
35  *    the following references for this rewrite of the i386 pmap:
36  *
37  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
38  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
39  *     it was then ported to the i386 by William Jolitz of UUNET
40  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
41  *     project fixed some bugs and provided some speed ups.
42  *
43  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
44  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
45  *     and David Greenman.
46  *
47  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
48  *     between several processors.   the VAX version was done by
49  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
50  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
51  *     David Golub, and Richard Draves.    the alpha version was
52  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
53  *     (NetBSD/alpha).
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/atomic.h>
59 #include <sys/proc.h>
60 #include <sys/pool.h>
61 #include <sys/user.h>
62 #include <sys/mutex.h>
63 
64 #include <uvm/uvm.h>
65 
66 #include <machine/specialreg.h>
67 
68 #include <sys/msgbuf.h>
69 #include <stand/boot/bootarg.h>
70 
71 /* #define PMAP_DEBUG */
72 
73 #ifdef PMAP_DEBUG
74 #define DPRINTF(x...)	do { printf(x); } while(0)
75 #else
76 #define DPRINTF(x...)
77 #endif	/* PMAP_DEBUG */
78 
79 /*
80  * this file contains the code for the "pmap module."   the module's
81  * job is to manage the hardware's virtual to physical address mappings.
82  * note that there are two levels of mapping in the VM system:
83  *
84  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
85  *      to map ranges of virtual address space to objects/files.  for
86  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
87  *      to the file /bin/ls starting at offset zero."   note that
88  *      the upper layer mapping is not concerned with how individual
89  *      vm_pages are mapped.
90  *
91  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
92  *      from virtual addresses.   it is concerned with which vm_page is
93  *      mapped where.   for example, when you run /bin/ls and start
94  *      at page 0x1000 the fault routine may lookup the correct page
95  *      of the /bin/ls file and then ask the pmap layer to establish
96  *      a mapping for it.
97  *
98  * note that information in the lower layer of the VM system can be
99  * thrown away since it can easily be reconstructed from the info
100  * in the upper layer.
101  *
102  * data structures we use include:
103  *
104  *  - struct pmap: describes the address space of one thread
105  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
106  *  - struct pv_head: there is one pv_head per managed page of
107  *	physical memory.   the pv_head points to a list of pv_entry
108  *	structures which describe all the <PMAP,VA> pairs that this
109  *      page is mapped in.    this is critical for page based operations
110  *      such as pmap_page_protect() [change protection on _all_ mappings
111  *      of a page]
112  */
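
/*
 * Illustrative sketch (editorial, not part of the original source): walking
 * a managed page's pv list visits every <PMAP,VA> pair the page is mapped
 * at, which is what page-based operations such as pmap_page_protect() rely
 * on.  The field names match those used by pmap_enter_pv()/pmap_remove_pv()
 * further down; the function name itself is made up for illustration.
 */
static __inline int
pmap_pv_count_sketch(struct vm_page *pg)
{
	struct pv_entry *pve;
	int n = 0;

	mtx_enter(&pg->mdpage.pv_mtx);
	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next)
		n++;	/* pve->pv_pmap and pve->pv_va identify one mapping */
	mtx_leave(&pg->mdpage.pv_mtx);
	return (n);
}
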
113 /*
114  * i386 MMU hardware structure:
115  *
116  * the i386 MMU is a two-level MMU which maps 4GB of virtual memory.
117  * the pagesize is 4K (4096 [0x1000] bytes), although newer pentium
118  * processors can support a 4MB pagesize as well.
119  *
120  * the first level table (segment table?) is called a "page directory"
121  * and it contains 1024 page directory entries (PDEs).   each PDE is
122  * 4 bytes (an int), so a PD fits in a single 4K page.   this page is
123  * the page directory page (PDP).  each PDE in a PDP maps 4MB of space
124  * (1024 * 4MB = 4GB).   a PDE contains the physical address of the
125  * second level table: the page table.   or, if 4MB pages are being used,
126  * then the PDE contains the PA of the 4MB page being mapped.
127  *
128  * a page table consists of 1024 page table entries (PTEs).  each PTE is
129  * 4 bytes (an int), so a page table also fits in a single 4K page.  a
130  * 4K page being used as a page table is called a page table page (PTP).
131  * each PTE in a PTP maps one 4K page (1024 * 4K = 4MB).   a PTE contains
132  * the physical address of the page it maps and some flag bits (described
133  * below).
134  *
135  * the processor has a special register, "cr3", which points to the
136  * PDP which is currently controlling the mappings of the virtual
137  * address space.
138  *
139  * the following picture shows the translation process for a 4K page:
140  *
141  * %cr3 register [PA of PDP]
142  *      |
143  *      |
144  *      |   bits <31-22> of VA         bits <21-12> of VA   bits <11-0>
145  *      |   index the PDP (0 - 1023)   index the PTP        are the page offset
146  *      |         |                           |                  |
147  *      |         v                           |                  |
148  *      +--->+----------+                     |                  |
149  *           | PD Page  |   PA of             v                  |
150  *           |          |---PTP-------->+------------+           |
151  *           | 1024 PDE |               | page table |--PTE--+   |
152  *           | entries  |               | (aka PTP)  |       |   |
153  *           +----------+               | 1024 PTE   |       |   |
154  *                                      | entries    |       |   |
155  *                                      +------------+       |   |
156  *                                                           |   |
157  *                                                bits <31-12>   bits <11-0>
158  *                                                p h y s i c a l  a d d r
159  *
160  * the i386 caches PTEs in a TLB.   it is important to flush out old
161  * TLB mappings when making a change to a mapping.   writing to the
162  * %cr3 will flush the entire TLB.    newer processors also have an
163  * instruction that will invalidate the mapping of a single page (which
164  * is useful if you are changing a single mapping because it preserves
165  * all the cached TLB entries).
166  *
167  * as shown above, bits 31-12 of the PTE contain the PA of the page being mapped.
168  * the rest of the PTE is defined as follows:
169  *   bit#	name	use
170  *   11		n/a	available for OS use, hardware ignores it
171  *   10		n/a	available for OS use, hardware ignores it
172  *   9		n/a	available for OS use, hardware ignores it
173  *   8		G	global bit (see discussion below)
174  *   7		PS	page size [for PDEs] (0=4k, 1=4M <if supported>)
175  *   6		D	dirty (modified) page
176  *   5		A	accessed (referenced) page
177  *   4		PCD	cache disable
178  *   3		PWT	page write-through (cache)
179  *   2		U/S	user/supervisor bit (0=supervisor only, 1=both u&s)
180  *   1		R/W	read/write bit (0=read only, 1=read-write)
181  *   0		P	present (valid)
182  *
183  * notes:
184  *  - on the i386 the R/W bit is ignored if processor is in supervisor
185  *    state (bug!)
186  *  - PS is only supported on newer processors
187  *  - PTEs with the G bit are global in the sense that they are not
188  *    flushed from the TLB when %cr3 is written (to flush, use the
189  *    "flush single page" instruction).   this is only supported on
190  *    newer processors.    this bit can be used to keep the kernel's
191  *    TLB entries around while context switching.   since the kernel
192  *    is mapped into all processes at the same place it does not make
193  *    sense to flush these entries when switching from one process'
194  *    pmap to another.
195  */
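
/*
 * Illustrative sketch (editorial, not part of the original source): the 4K
 * translation pictured above, expressed as C.  A VA splits into a PD index
 * (bits 31-22), a PT index (bits 21-12) and a page offset (bits 11-0); the
 * pdei()/ptei() macros defined below do the same thing using PD_MASK/PT_MASK.
 */
static __inline void
pmap_va_split_sketch(u_int32_t va, u_int32_t *pd_idx, u_int32_t *pt_idx,
    u_int32_t *pg_off)
{
	*pd_idx = va >> 22;		/* selects one of 1024 PDEs in the PDP */
	*pt_idx = (va >> 12) & 0x3ff;	/* selects one of 1024 PTEs in that PTP */
	*pg_off = va & 0xfff;		/* byte offset within the 4K page */
}
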
196 /*
197  * A pmap describes a process' 4GB virtual address space.  This
198  * virtual address space can be broken up into 1024 4MB regions which
199  * are described by PDEs in the PDP.  The PDEs are defined as follows:
200  *
201  * Ranges are inclusive -> exclusive, just like vm_map_entry start/end.
202  * The following assumes that KERNBASE is 0xd0000000.
203  *
204  * PDE#s	VA range		Usage
205  * 0->831	0x0 -> 0xcfc00000	user address space, note that the
206  *					max user address is 0xcfbfe000;
207  *					the final two pages in the last 4MB
208  *					were once reserved for the UAREA
209  *					but are no longer used.
210  * 831		0xcfc00000->		recursive mapping of PDP (used for
211  *			0xd0000000	linear mapping of PTPs).
212  * 832->1023	0xd0000000->		kernel address space (constant
213  *			0xffc00000	across all pmaps/processes).
214  * 1023		0xffc00000->		"alternate" recursive PDP mapping
215  *			<end>		(for other pmaps).
216  *
217  *
218  * Note: A recursive PDP mapping provides a way to map all the PTEs for
219  * a 4GB address space into a linear chunk of virtual memory.  In other
220  * words, the PTE for page 0 is the first int mapped into the 4MB recursive
221  * area.  The PTE for page 1 is the second int.  The very last int in the
222  * 4MB range is the PTE that maps VA 0xffffe000 (the last page in a 4GB
223  * address).
224  *
225  * All pmaps' PDs must have the same values in slots 832->1023 so that
226  * the kernel is always mapped in every process.  These values are loaded
227  * into the PD at pmap creation time.
228  *
229  * At any one time only one pmap can be active on a processor.  This is
230  * the pmap whose PDP is pointed to by processor register %cr3.  This pmap
231  * will have all its PTEs mapped into memory at the recursive mapping
232  * point (slot #831 as shown above).  When the pmap code wants to find the
233  * PTE for a virtual address, all it has to do is the following:
234  *
235  * Address of PTE = (831 * 4MB) + (VA / PAGE_SIZE) * sizeof(pt_entry_t)
236  *                = 0xcfc00000 + (VA / 4096) * 4
237  *
238  * What happens if the pmap layer is asked to perform an operation
239  * on a pmap that is not the one which is currently active?  In that
240  * case we take the PA of the PDP of the non-active pmap and put it in
241  * slot 1023 of the active pmap.  This causes the non-active pmap's
242  * PTEs to get mapped in the final 4MB of the 4GB address space
243  * (e.g. starting at 0xffc00000).
244  *
245  * The following figure shows the effects of the recursive PDP mapping:
246  *
247  *   PDP (%cr3)
248  *   +----+
249  *   |   0| -> PTP#0 that maps VA 0x0 -> 0x400000
250  *   |    |
251  *   |    |
252  *   | 831| -> points back to PDP (%cr3) mapping VA 0xcfc00000 -> 0xd0000000
253  *   | 832| -> first kernel PTP (maps 0xd0000000 -> 0xd0400000)
254  *   |    |
255  *   |1023| -> points to alternate pmap's PDP (maps 0xffc00000 -> end)
256  *   +----+
257  *
258  * Note that the PDE#831 VA (0xcfc00000) is defined as "PTE_BASE".
259  * Note that the PDE#1023 VA (0xffc00000) is defined as "APTE_BASE".
260  *
261  * Starting at VA 0xcfc00000 the current active PDP (%cr3) acts as a
262  * PTP:
263  *
264  * PTP#831 == PDP(%cr3) => maps VA 0xcfc00000 -> 0xd0000000
265  *   +----+
266  *   |   0| -> maps the contents of PTP#0 at VA 0xcfc00000->0xcfc01000
267  *   |    |
268  *   |    |
269  *   | 831| -> maps the contents of PTP#831 (the PDP) at VA 0xcff3f000
270  *   | 832| -> maps the contents of first kernel PTP
271  *   |    |
272  *   |1023|
273  *   +----+
274  *
275  * Note that mapping of the PDP at PTP#831's VA (0xcff3f000) is
276  * defined as "PDP_BASE".... within that mapping there are two
277  * defines:
278  *   "PDP_PDE" (0xcff3fcfc) is the VA of the PDE in the PDP
279  *      which points back to itself.
280  *   "APDP_PDE" (0xcff3fffc) is the VA of the PDE in the PDP which
281  *      establishes the recursive mapping of the alternate pmap.
282  *      To set the alternate PDP, one just has to put the correct
283  *	PA info in *APDP_PDE.
284  *
285  * Note that in the APTE_BASE space, the APDP appears at VA
286  * "APDP_BASE" (0xfffff000).
287  */
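
/*
 * Illustrative sketch (editorial, not part of the original source): the
 * "Address of PTE" formula above, assuming KERNBASE is 0xd0000000 so the
 * recursive slot is 831 and the linear PTE area starts at 0xcfc00000.
 * The vtopte() macro below computes the same address as PTE_BASE + atop(va).
 */
static __inline u_int32_t
pmap_pte_va_sketch(u_int32_t va)
{
	return (0xcfc00000 + (va / 4096) * 4);	/* 4 == sizeof(pt_entry_t) */
}
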
288 #define PG_FRAME	0xfffff000	/* page frame mask */
289 #define PG_LGFRAME	0xffc00000	/* large (4M) page frame mask */
290 
291 /*
292  * The following defines give the virtual addresses of various MMU
293  * data structures:
294  * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
295  * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
296  */
297 #define PTE_BASE	((pt_entry_t *) (PDSLOT_PTE * NBPD))
298 #define APTE_BASE	((pt_entry_t *) (PDSLOT_APTE * NBPD))
299 #define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * NBPG)))
300 #define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * NBPG)))
301 #define PDP_PDE		(PDP_BASE + PDSLOT_PTE)
302 #define APDP_PDE	(PDP_BASE + PDSLOT_APTE)
303 
304 /*
305  * pdei/ptei: generate index into PDP/PTP from a VA
306  */
307 #define PD_MASK		0xffc00000	/* page directory address bits */
308 #define PT_MASK		0x003ff000	/* page table address bits */
309 #define pdei(VA)	(((VA) & PD_MASK) >> PDSHIFT)
310 #define ptei(VA)	(((VA) & PT_MASK) >> PGSHIFT)
311 
312 /*
313  * Mach derived conversion macros
314  */
315 #define i386_round_pdr(x)	((((unsigned)(x)) + ~PD_MASK) & PD_MASK)
316 
317 /*
318  * various address macros
319  *
320  *  vtopte: return a pointer to the PTE mapping a VA
321  */
322 #define vtopte(VA)	(PTE_BASE + atop((vaddr_t)VA))
323 
324 /*
325  * PTP macros:
326  *   A PTP's index is the PD index of the PDE that points to it.
327  *   A PTP's offset is the byte-offset in the PTE space that this PTP is at.
328  *   A PTP's VA is the first VA mapped by that PTP.
329  *
330  * Note that NBPG == number of bytes in a PTP (4096 bytes == 1024 entries)
331  *           NBPD == number of bytes a PTP can map (4MB)
332  */
333 
334 #define ptp_i2o(I)	((I) * NBPG)	/* index => offset */
335 #define ptp_o2i(O)	((O) / NBPG)	/* offset => index */
336 #define ptp_i2v(I)	((I) * NBPD)	/* index => VA */
337 #define ptp_v2i(V)	((V) / NBPD)	/* VA => index (same as pdei) */
338 
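/*
 * Worked example (editorial, assuming KERNBASE == 0xd0000000 and NBPD ==
 * 4MB): the first kernel PTP has index ptp_v2i(0xd0000000) = 0xd0000000 /
 * 0x400000 = 832, sits at offset ptp_i2o(832) = 832 * 4096 = 0x340000 in
 * the linear PTE space, and maps VAs ptp_i2v(832) = 0xd0000000 through
 * 0xd03fffff.
 */
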
339 /*
340  * Access PD and PT
341  */
342 #define PDE(pm,i)	(((pd_entry_t *)(pm)->pm_pdir)[(i)])
343 
344 /*
345  * here we define the data types for PDEs and PTEs
346  */
347 typedef u_int32_t pd_entry_t;		/* PDE */
348 typedef u_int32_t pt_entry_t;		/* PTE */
349 
350 /*
351  * Number of PTEs per cache line. 4 byte pte, 64-byte cache line
352  * Used to avoid false sharing of cache lines.
353  */
354 #define NPTECL			16
355 
356 /*
357  * global data structures
358  */
359 
360 /* The kernel's pmap (proc0), 32 byte aligned in case we are using PAE */
361 struct pmap __attribute__ ((aligned (32))) kernel_pmap_store;
362 
363 /*
364  * nkpde is the number of kernel PTPs allocated for the kernel at
365  * boot time (NKPTP is a compile time override).   this number can
366  * grow dynamically as needed (but once allocated, we never free
367  * kernel PTPs).
368  */
369 
370 int nkpde = NKPTP;
371 int nkptp_max = 1024 - (KERNBASE / NBPD) - 1;
372 
373 /*
374  * pg_g_kern:  if CPU is affected by Meltdown pg_g_kern is 0,
375  * otherwise it is set to PG_G.  pmap_pg_g will be derived
376  * from pg_g_kern, see pmap_bootstrap().
377  */
378 extern int pg_g_kern;
379 
380 /*
381  * pmap_pg_g: if our processor supports PG_G in the PTE then we
382  * set pmap_pg_g to PG_G (otherwise it is zero).
383  */
384 
385 int pmap_pg_g = 0;
386 
387 /*
388  * pmap_pg_wc: if our processor supports PAT then we set this
389  * to be the pte bits for Write Combining. Else we fall back to
390  * UC- so mtrrs can override the cacheability
391  */
392 int pmap_pg_wc = PG_UCMINUS;
393 
394 /*
395  * other data structures
396  */
397 
398 uint32_t protection_codes[8];		/* maps MI prot to i386 prot code */
399 int pmap_initialized = 0;	/* pmap_init done yet? */
400 
401 /*
402  * MULTIPROCESSOR: special VAs/ PTEs are actually allocated inside a
403  * MAXCPUS*NPTECL array of PTEs, to avoid cache line thrashing
404  * due to false sharing.
405  */
406 
407 #ifdef MULTIPROCESSOR
408 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
409 #define VASLEW(va,id) ((va)+(id)*NPTECL*NBPG)
410 #else
411 #define PTESLEW(pte, id) (pte)
412 #define VASLEW(va,id) (va)
413 #endif
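
/*
 * Worked example (editorial): with NPTECL == 16 and 4-byte PTEs, CPU 1's
 * slot for e.g. zero_pte is PTESLEW(zero_pte, 1) = zero_pte + 16, i.e. 64
 * bytes past CPU 0's slot, so each CPU's temporary PTEs land on their own
 * cache line; VASLEW() spaces the matching VAs 16 pages apart.
 */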
414 
415 /*
416  * pv management structures.
417  */
418 struct pool pmap_pv_pool;
419 
420 #define PVE_LOWAT (PVE_PER_PVPAGE / 2)	/* free pv_entry low water mark */
421 #define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
422 					/* high water mark */
423 
424 /*
425  * the following two vaddr_t's are used during system startup
426  * to keep track of how much of the kernel's VM space we have used.
427  * once the system is started, the management of the remaining kernel
428  * VM space is turned over to the kernel_map vm_map.
429  */
430 
431 static vaddr_t virtual_avail;	/* VA of first free KVA */
432 static vaddr_t virtual_end;	/* VA of last free KVA */
433 
434 /*
435  * linked list of all non-kernel pmaps
436  */
437 
438 struct pmap_head pmaps;
439 struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM);
440 
441 /*
442  * pool that pmap structures are allocated from
443  */
444 
445 struct pool pmap_pmap_pool;
446 
447 /*
448  * special VAs and the PTEs that map them
449  */
450 
451 pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *flsh_pte;
452 caddr_t pmap_csrcp, pmap_cdstp, pmap_zerop, pmap_ptpp, pmap_flshp;
453 caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
454 
455 extern uint32_t cpu_meltdown;
456 
457 /*
458  * local prototypes
459  */
460 struct vm_page	*pmap_alloc_ptp_86(struct pmap *, int, pt_entry_t);
461 struct vm_page	*pmap_get_ptp_86(struct pmap *, int);
462 pt_entry_t	*pmap_map_ptes_86(struct pmap *);
463 void		 pmap_unmap_ptes_86(struct pmap *);
464 void		 pmap_do_remove_86(struct pmap *, vaddr_t, vaddr_t, int);
465 void		 pmap_remove_ptes_86(struct pmap *, struct vm_page *, vaddr_t,
466 		    vaddr_t, vaddr_t, int, struct pv_entry **);
467 void		*pmap_pv_page_alloc(struct pool *, int, int *);
468 void		pmap_pv_page_free(struct pool *, void *);
469 
470 struct pool_allocator pmap_pv_page_allocator = {
471 	pmap_pv_page_alloc, pmap_pv_page_free,
472 };
473 
474 void		 pmap_sync_flags_pte_86(struct vm_page *, pt_entry_t);
475 
476 void		 pmap_drop_ptp_86(struct pmap *, vaddr_t, struct vm_page *,
477     pt_entry_t *);
478 
479 void		 setcslimit(struct pmap *, struct trapframe *, struct pcb *,
480 		     vaddr_t);
481 void		 pmap_pinit_pd_86(struct pmap *);
482 
483 static __inline u_int
484 pmap_pte2flags(pt_entry_t pte)
485 {
486 	return (((pte & PG_U) ? PG_PMAP_REF : 0) |
487 	    ((pte & PG_M) ? PG_PMAP_MOD : 0));
488 }
489 
490 void
491 pmap_sync_flags_pte_86(struct vm_page *pg, pt_entry_t pte)
492 {
493 	if (pte & (PG_U|PG_M)) {
494 		atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
495 	}
496 }
497 
498 void
499 pmap_apte_flush(void)
500 {
501 	pmap_tlb_shoottlb();
502 	pmap_tlb_shootwait();
503 }
504 
505 /*
506  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
507  *
508  * => we lock enough pmaps to keep things locked in
509  * => must be undone with pmap_unmap_ptes before returning
510  */
511 
512 pt_entry_t *
513 pmap_map_ptes_86(struct pmap *pmap)
514 {
515 	pd_entry_t opde;
516 
517 	/* the kernel's pmap is always accessible */
518 	if (pmap == pmap_kernel()) {
519 		return(PTE_BASE);
520 	}
521 
522 	mtx_enter(&pmap->pm_mtx);
523 
524 	/* if curpmap then we are always mapped */
525 	if (pmap_is_curpmap(pmap)) {
526 		return(PTE_BASE);
527 	}
528 
529 	mtx_enter(&curcpu()->ci_curpmap->pm_apte_mtx);
530 
531 	/* need to load a new alternate pt space into curpmap? */
532 	opde = *APDP_PDE;
533 #if defined(MULTIPROCESSOR) && defined(DIAGNOSTIC)
534 	if (pmap_valid_entry(opde))
535 		panic("pmap_map_ptes_86: APTE valid");
536 #endif
537 	if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
538 		*APDP_PDE = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V |
539 		    PG_U | PG_M);
540 		if (pmap_valid_entry(opde))
541 			pmap_apte_flush();
542 	}
543 	return(APTE_BASE);
544 }
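
/*
 * Usage sketch (editorial): callers bracket their PTE accesses with the
 * map/unmap pair, e.g. the pattern used by pmap_extract_86() below:
 *
 *	pt_entry_t *ptes = pmap_map_ptes_86(pmap);
 *	... read or modify ptes[atop(va)] ...
 *	pmap_unmap_ptes_86(pmap);
 */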
545 
546 /*
547  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
548  */
549 
550 void
551 pmap_unmap_ptes_86(struct pmap *pmap)
552 {
553 	if (pmap == pmap_kernel())
554 		return;
555 
556 	if (!pmap_is_curpmap(pmap)) {
557 #if defined(MULTIPROCESSOR)
558 		*APDP_PDE = 0;
559 		pmap_apte_flush();
560 #endif
561 		mtx_leave(&curcpu()->ci_curpmap->pm_apte_mtx);
562 	}
563 
564 	mtx_leave(&pmap->pm_mtx);
565 }
566 
567 void
568 pmap_exec_account(struct pmap *pm, vaddr_t va,
569     uint32_t opte, uint32_t npte)
570 {
571 	if (pm == pmap_kernel())
572 		return;
573 
574 	if (curproc->p_vmspace == NULL ||
575 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
576 		return;
577 
578 	if ((opte ^ npte) & PG_X)
579 		pmap_tlb_shootpage(pm, va);
580 
581 	if (cpu_pae)
582 		return;
583 
584 	/*
585 	 * Executability was removed on the last executable change.
586 	 * Reset the code segment to something conservative and
587 	 * let the trap handler deal with setting the right limit.
588 	 * We can't do that because of locking constraints on the vm map.
589 	 *
590 	 * XXX - floating cs - set this _really_ low.
591 	 */
592 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
593 		struct trapframe *tf = curproc->p_md.md_regs;
594 		struct pcb *pcb = &curproc->p_addr->u_pcb;
595 
596 		KERNEL_LOCK();
597 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
598 		setcslimit(pm, tf, pcb, I386_MAX_EXE_ADDR);
599 		KERNEL_UNLOCK();
600 	}
601 }
602 
603 /*
604  * Fixup the code segment to cover all potential executable mappings.
605  * Called by kernel SEGV trap handler.
606  * returns 0 if no changes to the code segment were made.
607  */
608 int
609 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, vaddr_t gdt_cs,
610     struct pcb *pcb)
611 {
612 	struct vm_map_entry *ent;
613 	struct pmap *pm = vm_map_pmap(map);
614 	vaddr_t va = 0;
615 	vaddr_t pm_cs;
616 
617 	KERNEL_LOCK();
618 
619 	vm_map_lock(map);
620 	RBT_FOREACH_REVERSE(ent, uvm_map_addr, &map->addr) {
621 		if (ent->protection & PROT_EXEC)
622 			break;
623 	}
624 	/*
625 	 * This entry has greater va than the entries before.
626 	 * We need to make it point to the last page, not past it.
627 	 */
628 	if (ent)
629 		va = trunc_page(ent->end - 1);
630 	vm_map_unlock(map);
631 
632 	KERNEL_ASSERT_LOCKED();
633 
634 	pm_cs = SEGDESC_LIMIT(pm->pm_codeseg);
635 
636 	/*
637 	 * Another thread running on another cpu can change
638 	 * pm_hiexec and pm_codeseg. If this has happened
639 	 * during our timeslice, our gdt code segment will
640 	 * be stale. So only allow the fault through if the
641 	 * faulting address is less than pm_hiexec and our
642 	 * gdt code segment is not stale.
643 	 */
644 	if (va <= pm->pm_hiexec && pm_cs == pm->pm_hiexec &&
645 	    gdt_cs == pm->pm_hiexec) {
646 		KERNEL_UNLOCK();
647 		return (0);
648 	}
649 
650 	pm->pm_hiexec = va;
651 
652 	/*
653 	 * We have a new 'highest executable' va, so we need to update
654 	 * the value for the code segment limit, which is stored in the
655 	 * PCB.
656 	 */
657 	setcslimit(pm, tf, pcb, va);
658 
659 	KERNEL_UNLOCK();
660 	return (1);
661 }
662 
663 u_int32_t
664 pmap_pte_set_86(vaddr_t va, paddr_t pa, u_int32_t bits)
665 {
666 	pt_entry_t pte, *ptep = vtopte(va);
667 
668 	pa &= PMAP_PA_MASK;
669 
670 	pte = i386_atomic_testset_ul(ptep, pa | bits);  /* zap! */
671 	return (pte & ~PG_FRAME);
672 }
673 
674 u_int32_t
675 pmap_pte_setbits_86(vaddr_t va, u_int32_t set, u_int32_t clr)
676 {
677 	pt_entry_t *ptep = vtopte(va);
678 	pt_entry_t pte = *ptep;
679 
680 	*ptep = (pte | set) & ~clr;
681 	return (pte & ~PG_FRAME);
682 }
683 
684 u_int32_t
685 pmap_pte_bits_86(vaddr_t va)
686 {
687 	pt_entry_t *ptep = vtopte(va);
688 
689 	return (*ptep & ~PG_FRAME);
690 }
691 
692 paddr_t
693 pmap_pte_paddr_86(vaddr_t va)
694 {
695 	pt_entry_t *ptep = vtopte(va);
696 
697 	return (*ptep & PG_FRAME);
698 }
699 
700 /*
701  * pmap_tmpmap_pa: map a page in for tmp usage
702  */
703 
704 vaddr_t
705 pmap_tmpmap_pa_86(paddr_t pa)
706 {
707 #ifdef MULTIPROCESSOR
708 	int id = cpu_number();
709 #endif
710 	pt_entry_t *ptpte;
711 	caddr_t ptpva;
712 
713 	ptpte = PTESLEW(ptp_pte, id);
714 	ptpva = VASLEW(pmap_ptpp, id);
715 
716 #if defined(DIAGNOSTIC)
717 	if (*ptpte)
718 		panic("pmap_tmpmap_pa: ptp_pte in use?");
719 #endif
720 	*ptpte = PG_V | PG_RW | pa;	/* always a new mapping */
721 	return((vaddr_t)ptpva);
722 }
723 
724 
725 vaddr_t
726 pmap_tmpmap_pa(paddr_t pa)
727 {
728 	if (cpu_pae)
729 		return pmap_tmpmap_pa_pae(pa);
730 
731 	return pmap_tmpmap_pa_86(pa);
732 }
733 
734 /*
735  * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa)
736  */
737 
738 void
739 pmap_tmpunmap_pa_86(void)
740 {
741 #ifdef MULTIPROCESSOR
742 	int id = cpu_number();
743 #endif
744 	pt_entry_t *ptpte;
745 	caddr_t ptpva;
746 
747 	ptpte = PTESLEW(ptp_pte, id);
748 	ptpva = VASLEW(pmap_ptpp, id);
749 
750 #if defined(DIAGNOSTIC)
751 	if (!pmap_valid_entry(*ptpte))
752 		panic("pmap_tmpunmap_pa: our pte invalid?");
753 #endif
754 
755 	*ptpte = 0;
756 	pmap_update_pg((vaddr_t)ptpva);
757 #ifdef MULTIPROCESSOR
758 	/*
759 	 * No need for tlb shootdown here, since ptp_pte is per-CPU.
760 	 */
761 #endif
762 }
763 
764 void
765 pmap_tmpunmap_pa(void)
766 {
767 	if (cpu_pae) {
768 		pmap_tmpunmap_pa_pae();
769 		return;
770 	}
771 
772 	pmap_tmpunmap_pa_86();
773 }
774 
775 paddr_t
776 vtophys(vaddr_t va)
777 {
778 	if (cpu_pae)
779 		return vtophys_pae(va);
780 	else
781 		return ((*vtopte(va) & PG_FRAME) | (va & ~PG_FRAME));
782 }
783 
784 void
785 setcslimit(struct pmap *pm, struct trapframe *tf, struct pcb *pcb,
786     vaddr_t limit)
787 {
788 	/*
789 	 * Called when we have a new 'highest executable' va, so we need
790 	 * to update the value for the code segment limit, which is stored
791 	 * in the PCB.
792 	 *
793 	 * There are no caching issues to be concerned with: the
794 	 * processor reads the whole descriptor from the GDT when the
795 	 * appropriate selector is loaded into a segment register, and
796 	 * this only happens on the return to userland.
797 	 *
798 	 * This also works in the MP case, since whichever CPU gets to
799 	 * run the process will pick up the right descriptor value from
800 	 * the PCB.
801 	 */
802 	limit = min(limit, VM_MAXUSER_ADDRESS - 1);
803 
804 	setsegment(&pm->pm_codeseg, 0, atop(limit),
805 	    SDT_MEMERA, SEL_UPL, 1, 1);
806 
807 	/* And update the GDT since we may be called by the
808 	 * trap handler (cpu_switch won't get a chance).
809 	 */
810 	curcpu()->ci_gdt[GUCODE_SEL].sd = pm->pm_codeseg;
811 
812 	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
813 }
814 
815 /*
816  * p m a p   k e n t e r   f u n c t i o n s
817  *
818  * functions to quickly enter/remove pages from the kernel address
819  * space.   pmap_kremove is exported to MI kernel.  we make use of
820  * the recursive PTE mappings.
821  */
822 
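/*
 * Usage sketch (editorial): these two functions are typically paired to
 * create and tear down a temporary kernel mapping; the VA must already be
 * allocated and len must be a multiple of PAGE_SIZE:
 *
 *	pmap_kenter_pa(va, pa, PROT_READ | PROT_WRITE);
 *	... access the page through va ...
 *	pmap_kremove(va, PAGE_SIZE);
 */
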
823 /*
824  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
825  *
826  * => no need to lock anything, assume va is already allocated
827  * => should be faster than normal pmap enter function
828  */
829 
830 void
831 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
832 {
833 	uint32_t bits;
834 	uint32_t global = 0;
835 
836 	/* special 1:1 mappings in the first large page must not be global */
837 	if (!cpu_pae) {
838 		if (va >= (vaddr_t)NBPD)	/* 4MB pages on non-PAE */
839 			global = pmap_pg_g;
840 	} else {
841 		if (va >= (vaddr_t)NBPD / 2)	/* 2MB pages on PAE */
842 			global = pmap_pg_g;
843 	}
844 
845 	bits = pmap_pte_set(va, pa, ((prot & PROT_WRITE) ? PG_RW : PG_RO) |
846 		PG_V | global | PG_U | PG_M |
847 		((prot & PROT_EXEC) ? PG_X : 0) |
848 		((pa & PMAP_NOCACHE) ? PG_N : 0) |
849 		((pa & PMAP_WC) ? pmap_pg_wc : 0));
850 	if (pmap_valid_entry(bits)) {
851 		if (pa & PMAP_NOCACHE && (bits & PG_N) == 0)
852 			wbinvd_on_all_cpus();
853 		/* NB. - this should not happen. */
854 		pmap_tlb_shootpage(pmap_kernel(), va);
855 		pmap_tlb_shootwait();
856 	}
857 }
858 
859 /*
860  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
861  *
862  * => no need to lock anything
863  * => caller must dispose of any vm_page mapped in the va range
864  * => note: not an inline function
865  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
866  */
867 
868 void
869 pmap_kremove(vaddr_t sva, vsize_t len)
870 {
871 	uint32_t bits;
872 	vaddr_t va, eva;
873 
874 	eva = sva + len;
875 
876 	for (va = sva; va != eva; va += PAGE_SIZE) {
877 		bits = pmap_pte_set(va, 0, 0);
878 #ifdef DIAGNOSTIC
879 		if (bits & PG_PVLIST)
880 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", va);
881 #endif
882 	}
883 	pmap_tlb_shootrange(pmap_kernel(), sva, eva);
884 	pmap_tlb_shootwait();
885 }
886 
887 /*
888  * Allocate a new PD for Intel's U-K.
889  */
890 void
891 pmap_alloc_pdir_intel_x86(struct pmap *pmap)
892 {
893 	vaddr_t va;
894 
895 	KASSERT(pmap->pm_pdir_intel == 0);
896 
897 	va = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_zero, &kd_waitok);
898 	if (va == 0)
899 		panic("kernel_map out of virtual space");
900 	pmap->pm_pdir_intel = va;
901 	if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
902 	    &pmap->pm_pdirpa_intel))
903 		panic("can't locate PD page");
904 }
905 
906 /*
907  * p m a p   i n i t   f u n c t i o n s
908  *
909  * pmap_bootstrap and pmap_init are called during system startup
910  * to init the pmap module.   pmap_bootstrap() does a low level
911  * init just to get things rolling.   pmap_init() finishes the job.
912  */
913 
914 /*
915  * pmap_bootstrap: get the system in a state where it can run with VM
916  *	properly enabled (called before main()).   the VM system is
917  *      fully init'd later...
918  *
919  * => on i386, locore.s has already enabled the MMU by allocating
920  *	a PDP for the kernel, and nkpde PTPs for the kernel.
921  * => kva_start is the first free virtual address in kernel space
922  */
923 
924 void
925 pmap_bootstrap(vaddr_t kva_start)
926 {
927 	struct pmap *kpm;
928 	vaddr_t kva;
929 	pt_entry_t *pte;
930 
931 	/*
932 	 * set the page size (default value is 4K which is ok)
933 	 */
934 
935 	uvm_setpagesize();
936 
937 	/*
938 	 * a quick sanity check
939 	 */
940 
941 	if (PAGE_SIZE != NBPG)
942 		panic("pmap_bootstrap: PAGE_SIZE != NBPG");
943 
944 	/*
945 	 * set up our local static global vars that keep track of the
946 	 * usage of KVM before kernel_map is set up
947 	 */
948 
949 	virtual_avail = kva_start;		/* first free KVA */
950 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
951 
952 	/*
953 	 * set up protection_codes: we need to be able to convert from
954 	 * a MI protection code (some combo of VM_PROT...) to something
955 	 * we can jam into a i386 PTE.
956 	 */
957 
958 	protection_codes[PROT_NONE] = 0;  			/* --- */
959 	protection_codes[PROT_EXEC] = PG_X;			/* --x */
960 	protection_codes[PROT_READ] = PG_RO;			/* -r- */
961 	protection_codes[PROT_READ | PROT_EXEC] = PG_X;		/* -rx */
962 	protection_codes[PROT_WRITE] = PG_RW;			/* w-- */
963 	protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW|PG_X;	/* w-x */
964 	protection_codes[PROT_READ | PROT_WRITE] = PG_RW;	/* wr- */
965 	protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW|PG_X; /* wrx */
966 
967 	/*
968 	 * now we init the kernel's pmap
969 	 *
970 	 * the kernel pmap's pm_obj is not used for much.   however, in
971 	 * user pmaps the pm_obj contains the list of active PTPs.
972 	 * the pm_obj currently does not have a pager.   it might be possible
973 	 * to add a pager that would allow a process to read-only mmap its
974 	 * own page tables (fast user level vtophys?).   this may or may not
975 	 * be useful.
976 	 */
977 
978 	kpm = pmap_kernel();
979 	mtx_init(&kpm->pm_mtx, -1); /* must not be used */
980 	mtx_init(&kpm->pm_apte_mtx, IPL_VM);
981 	uvm_obj_init(&kpm->pm_obj, &pmap_pager, 1);
982 	bzero(&kpm->pm_list, sizeof(kpm->pm_list));  /* pm_list not used */
983 	kpm->pm_pdir = (vaddr_t)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
984 	kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
985 	kpm->pm_pdir_intel = 0;
986 	kpm->pm_pdirpa_intel = 0;
987 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
988 		atop(kva_start - VM_MIN_KERNEL_ADDRESS);
989 
990 	/*
991 	 * the above is just a rough estimate and not critical to the proper
992 	 * operation of the system.
993 	 */
994 
995 	/*
996 	 * enable global TLB entries if they are supported and the
997 	 * CPU is not affected by Meltdown.
998 	 */
999 
1000 	if (cpu_feature & CPUID_PGE) {
1001 		lcr4(rcr4() | CR4_PGE);	/* enable hardware (via %cr4) */
1002 		pmap_pg_g = pg_g_kern;	/* if safe to use, enable software */
1003 
1004 		/* add PG_G attribute to already mapped kernel pages */
1005 		for (kva = VM_MIN_KERNEL_ADDRESS; kva < virtual_avail;
1006 		     kva += PAGE_SIZE)
1007 			if (pmap_valid_entry(PTE_BASE[atop(kva)]))
1008 				PTE_BASE[atop(kva)] |= pmap_pg_g;
1009 	}
1010 
1011 	/*
1012 	 * now we allocate the "special" VAs which are used for tmp mappings
1013 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1014 	 * virtual_avail (note that there are no pages mapped at these VAs).
1015 	 * we find the PTE that maps the allocated VA via the linear PTE
1016 	 * mapping.
1017 	 */
1018 
1019 	pte = PTE_BASE + atop(virtual_avail);
1020 
1021 #ifdef MULTIPROCESSOR
1022 	/*
1023 	 * Waste some VA space to avoid false sharing of cache lines
1024 	 * for page table pages: Give each possible CPU a cache line
1025 	 * of PTEs (16) to play with, though we only need 4.  We could
1026 	 * recycle some of this waste by putting the idle stacks here
1027 	 * as well; we could waste less space if we knew the largest
1028 	 * CPU ID beforehand.
1029 	 */
1030 	pmap_csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;
1031 
1032 	pmap_cdstp = (caddr_t) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1033 
1034 	pmap_zerop = (caddr_t) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1035 
1036 	pmap_ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1037 
1038 	pmap_flshp = (caddr_t) virtual_avail+PAGE_SIZE*4;  flsh_pte = pte+4;
1039 
1040 	virtual_avail += PAGE_SIZE * MAXCPUS * NPTECL;
1041 	pte += MAXCPUS * NPTECL;
1042 #else
1043 	pmap_csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;	/* allocate */
1044 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1045 
1046 	pmap_cdstp = (caddr_t) virtual_avail;  cdst_pte = pte;
1047 	virtual_avail += PAGE_SIZE; pte++;
1048 
1049 	pmap_zerop = (caddr_t) virtual_avail;  zero_pte = pte;
1050 	virtual_avail += PAGE_SIZE; pte++;
1051 
1052 	pmap_ptpp = (caddr_t) virtual_avail;  ptp_pte = pte;
1053 	virtual_avail += PAGE_SIZE; pte++;
1054 
1055 	pmap_flshp = (caddr_t) virtual_avail;  flsh_pte = pte;
1056 	virtual_avail += PAGE_SIZE; pte++;
1057 #endif
1058 
1059 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1060 	vmmap = (char *)virtual_avail;			/* don't need pte */
1061 	virtual_avail += PAGE_SIZE;
1062 
1063 	msgbufp = (struct msgbuf *)virtual_avail;	/* don't need pte */
1064 	virtual_avail += round_page(MSGBUFSIZE); pte++;
1065 
1066 	bootargp = (bootarg_t *)virtual_avail;
1067 	virtual_avail += round_page(bootargc); pte++;
1068 
1069 	/*
1070 	 * now we reserve some VM for mapping pages when doing a crash dump
1071 	 */
1072 
1073 	virtual_avail = reserve_dumppages(virtual_avail);
1074 
1075 	/*
1076 	 * init the static-global locks and global lists.
1077 	 */
1078 
1079 	LIST_INIT(&pmaps);
1080 
1081 	/*
1082 	 * initialize the pmap pool.
1083 	 */
1084 
1085 	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 32, IPL_NONE, 0,
1086 	    "pmappl", NULL);
1087 	pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0,
1088 	    "pvpl", &pmap_pv_page_allocator);
1089 
1090 	/*
1091 	 * ensure the TLB is sync'd with reality by flushing it...
1092 	 */
1093 
1094 	tlbflush();
1095 }
1096 
1097 /*
1098  * Pre-allocate PTP 0 for low memory, so that 1:1 mappings for various
1099  * trampoline code can be entered.
1100  */
1101 void
1102 pmap_prealloc_lowmem_ptp(void)
1103 {
1104 	pt_entry_t *pte, npte;
1105 	vaddr_t ptpva = (vaddr_t)vtopte(0);
1106 
1107 	/* If PAE, use the PAE-specific preallocator */
1108 	if (cpu_pae) {
1109 		pmap_prealloc_lowmem_ptp_pae();
1110 		return;
1111 	}
1112 
1113 	/* enter pa for pte 0 into recursive map */
1114 	pte = vtopte(ptpva);
1115 	npte = PTP0_PA | PG_RW | PG_V | PG_U | PG_M;
1116 
1117 	i386_atomic_testset_ul(pte, npte);
1118 
1119 	/* make sure it is clean before using */
1120 	memset((void *)ptpva, 0, NBPG);
1121 }
1122 
1123 /*
1124  * pmap_init: called from uvm_init, our job is to get the pmap
1125  * system ready to manage mappings... this mainly means initing
1126  * the pv_entry stuff.
1127  */
1128 
1129 void
1130 pmap_init(void)
1131 {
1132 	/*
1133 	 * prime the pool with pv_entry structures to allow us to get
1134 	 * the kmem_map allocated and inited (done after this function
1135 	 * is finished).  we do this by setting a low water mark such
1136 	 * that we are more likely to have these around in extreme
1137 	 * memory starvation.
1138 	 */
1139 
1140 	pool_setlowat(&pmap_pv_pool, PVE_LOWAT);
1141 	pool_sethiwat(&pmap_pv_pool, PVE_HIWAT);
1142 
1143 	/*
1144 	 * done: pmap module is up (and ready for business)
1145 	 */
1146 
1147 	pmap_initialized = 1;
1148 }
1149 
1150 /*
1151  * p v _ e n t r y   f u n c t i o n s
1152  */
1153 
1154 void *
1155 pmap_pv_page_alloc(struct pool *pp, int flags, int *slowdown)
1156 {
1157 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1158 
1159 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
1160 	kd.kd_slowdown = slowdown;
1161 
1162 	return (km_alloc(pp->pr_pgsize,
1163 	    pmap_initialized ? &kv_page : &kv_any, pp->pr_crange, &kd));
1164 }
1165 
1166 void
1167 pmap_pv_page_free(struct pool *pp, void *v)
1168 {
1169 	km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange);
1170 }
1171 
1172 /*
1173  * main pv_entry manipulation functions:
1174  *   pmap_enter_pv: enter a mapping onto a pv list
1175  *   pmap_remove_pv: remove a mapping from a pv list
1176  */
1177 
1178 /*
1179  * pmap_enter_pv: enter a mapping onto a pv list
1180  *
1181  * => caller should have pmap locked
1182  * => we will gain the lock on the pv and allocate the new pv_entry
1183  * => caller should adjust ptp's wire_count before calling
1184  *
1185  * pve: preallocated pve for us to use
1186  * ptp: PTP in pmap that maps this VA
1187  */
1188 
1189 void
1190 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
1191     vaddr_t va, struct vm_page *ptp)
1192 {
1193 	pve->pv_pmap = pmap;
1194 	pve->pv_va = va;
1195 	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
1196 	mtx_enter(&pg->mdpage.pv_mtx);
1197 	pve->pv_next = pg->mdpage.pv_list;	/* add to ... */
1198 	pg->mdpage.pv_list = pve;		/* ... locked list */
1199 	mtx_leave(&pg->mdpage.pv_mtx);
1200 }
1201 
1202 /*
1203  * pmap_remove_pv: try to remove a mapping from a pv_list
1204  *
1205  * => pmap should be locked
1206  * => caller should hold lock on pv [so that attrs can be adjusted]
1207  * => caller should adjust ptp's wire_count and free PTP if needed
1208  * => we return the removed pve
1209  */
1210 
1211 struct pv_entry *
1212 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
1213 {
1214 	struct pv_entry *pve, **prevptr;
1215 
1216 	mtx_enter(&pg->mdpage.pv_mtx);
1217 	prevptr = &pg->mdpage.pv_list;		/* previous pv_entry pointer */
1218 	while ((pve = *prevptr) != NULL) {
1219 		if (pve->pv_pmap == pmap && pve->pv_va == va) {	/* match? */
1220 			*prevptr = pve->pv_next;		/* remove it! */
1221 			break;
1222 		}
1223 		prevptr = &pve->pv_next;		/* previous pointer */
1224 	}
1225 	mtx_leave(&pg->mdpage.pv_mtx);
1226 	return(pve);				/* return removed pve */
1227 }
1228 
1229 /*
1230  * p t p   f u n c t i o n s
1231  */
1232 
1233 /*
1234  * pmap_alloc_ptp: allocate a PTP for a PMAP
1235  *
1236  * => pmap should already be locked by caller
1237  * => we use the ptp's wire_count to count the number of active mappings
1238  *	in the PTP (we start it at one to prevent any chance this PTP
1239  *	will ever leak onto the active/inactive queues)
1240  */
1241 
1242 struct vm_page *
1243 pmap_alloc_ptp_86(struct pmap *pmap, int pde_index, pt_entry_t pde_flags)
1244 {
1245 	struct vm_page *ptp;
1246 	pd_entry_t *pva_intel;
1247 
1248 	ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
1249 			    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1250 	if (ptp == NULL)
1251 		return (NULL);
1252 
1253 	/* got one! */
1254 	atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
1255 	ptp->wire_count = 1;	/* no mappings yet */
1256 	PDE(pmap, pde_index) = (pd_entry_t)(VM_PAGE_TO_PHYS(ptp) |
1257 	    PG_RW | PG_V | PG_M | PG_U | pde_flags);
1258 
1259 	/*
1260 	 * Meltdown special case - if we are adding a new PDE for
1261 	 * usermode addresses, just copy the PDE to the U-K page
1262 	 * table.
1263 	 */
1264 	if (pmap->pm_pdir_intel && ptp_i2v(pde_index) < VM_MAXUSER_ADDRESS) {
1265 		pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
1266 		pva_intel[pde_index] = PDE(pmap, pde_index);
1267 		DPRINTF("%s: copying usermode PDE (content=0x%x) pde_index %d "
1268 		    "from 0x%x -> 0x%x\n", __func__, PDE(pmap, pde_index),
1269 		    pde_index, (uint32_t)&PDE(pmap, pde_index),
1270 		    (uint32_t)&(pva_intel[pde_index]));
1271 	}
1272 
1273 	pmap->pm_stats.resident_count++;	/* count PTP as resident */
1274 	pmap->pm_ptphint = ptp;
1275 	return(ptp);
1276 }
1277 
1278 /*
1279  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1280  *
1281  * => pmap should NOT be pmap_kernel()
1282  * => pmap should be locked
1283  */
1284 
1285 struct vm_page *
1286 pmap_get_ptp_86(struct pmap *pmap, int pde_index)
1287 {
1288 	struct vm_page *ptp;
1289 
1290 	if (pmap_valid_entry(PDE(pmap, pde_index))) {
1291 		/* valid... check hint (saves us a PA->PG lookup) */
1292 		if (pmap->pm_ptphint &&
1293 		    (PDE(pmap, pde_index) & PG_FRAME) ==
1294 		    VM_PAGE_TO_PHYS(pmap->pm_ptphint))
1295 			return(pmap->pm_ptphint);
1296 
1297 		ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
1298 #ifdef DIAGNOSTIC
1299 		if (ptp == NULL)
1300 			panic("pmap_get_ptp_86: unmanaged user PTP");
1301 #endif
1302 		pmap->pm_ptphint = ptp;
1303 		return(ptp);
1304 	}
1305 
1306 	/* allocate a new PTP (updates ptphint) */
1307 	return (pmap_alloc_ptp_86(pmap, pde_index, PG_u));
1308 }
1309 
1310 void
1311 pmap_drop_ptp_86(struct pmap *pm, vaddr_t va, struct vm_page *ptp,
1312     pt_entry_t *ptes)
1313 {
1314 	pd_entry_t *pva_intel;
1315 
1316 	i386_atomic_testset_ul(&PDE(pm, pdei(va)), 0);
1317 	pmap_tlb_shootpage(curcpu()->ci_curpmap, ((vaddr_t)ptes) + ptp->offset);
1318 #ifdef MULTIPROCESSOR
1319 	/*
1320 	 * Always shoot down the other pmap's
1321 	 * self-mapping of the PTP.
1322 	 */
1323 	pmap_tlb_shootpage(pm, ((vaddr_t)PTE_BASE) + ptp->offset);
1324 #endif
1325 	pm->pm_stats.resident_count--;
1326 	/* update hint */
1327 	if (pm->pm_ptphint == ptp)
1328 		pm->pm_ptphint = RBT_ROOT(uvm_objtree, &pm->pm_obj.memt);
1329 	ptp->wire_count = 0;
1330 	/* Postpone free to after shootdown. */
1331 	uvm_pagerealloc(ptp, NULL, 0);
1332 
1333 	if (pm->pm_pdir_intel) {
1334 		KASSERT(va < VM_MAXUSER_ADDRESS);
1335 		/* Zap special meltdown PDE */
1336 		pva_intel = (pd_entry_t *)pm->pm_pdir_intel;
1337 		i386_atomic_testset_ul(&pva_intel[pdei(va)], 0);
1338 		DPRINTF("%s: cleared meltdown PDE @ index %lu "
1339 		    "(va range start 0x%x)\n", __func__, pdei(va),
1340 		    (uint32_t)va);
1341 	}
1342 }
1343 
1344 /*
1345  * p m a p  l i f e c y c l e   f u n c t i o n s
1346  */
1347 
1348 /*
1349  * pmap_create: create a pmap
1350  *
1351  * => note: old pmap interface took a "size" args which allowed for
1352  *	the creation of "software only" pmaps (not in bsd).
1353  */
1354 
1355 struct pmap *
1356 pmap_create(void)
1357 {
1358 	struct pmap *pmap;
1359 
1360 	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
1361 
1362 	mtx_init(&pmap->pm_mtx, IPL_VM);
1363 	mtx_init(&pmap->pm_apte_mtx, IPL_VM);
1364 
1365 	/* init uvm_object */
1366 	uvm_obj_init(&pmap->pm_obj, &pmap_pager, 1);
1367 	pmap->pm_stats.wired_count = 0;
1368 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
1369 	pmap->pm_ptphint = NULL;
1370 	pmap->pm_hiexec = 0;
1371 	pmap->pm_flags = 0;
1372 	pmap->pm_pdir_intel = 0;
1373 	pmap->pm_pdirpa_intel = 0;
1374 
1375 	initcodesegment(&pmap->pm_codeseg);
1376 
1377 	pmap_pinit_pd(pmap);
1378 	return (pmap);
1379 }
1380 
1381 void
1382 pmap_pinit_pd_86(struct pmap *pmap)
1383 {
1384 	/* allocate PDP */
1385 	pmap->pm_pdir = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_dirty, &kd_waitok);
1386 	if (pmap->pm_pdir == 0)
1387 		panic("kernel_map out of virtual space");
1388 	pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir,
1389 			    &pmap->pm_pdirpa);
1390 	pmap->pm_pdirsize = NBPG;
1391 
1392 	/* init PDP */
1393 	/* zero init area */
1394 	bzero((void *)pmap->pm_pdir, PDSLOT_PTE * sizeof(pd_entry_t));
1395 	/* put in recursive PDE to map the PTEs */
1396 	PDE(pmap, PDSLOT_PTE) = pmap->pm_pdirpa | PG_V | PG_KW | PG_U | PG_M;
1397 	PDE(pmap, PDSLOT_PTE + 1) = 0;
1398 
1399 	/*
1400 	 * we need to lock pmaps_lock to prevent nkpde from changing on
1401 	 * us.   note that there is no need to splvm to protect us from
1402 	 * malloc since malloc allocates out of a submap and we should have
1403 	 * already allocated kernel PTPs to cover the range...
1404 	 */
1405 	/* put in kernel VM PDEs */
1406 	bcopy(&PDP_BASE[PDSLOT_KERN], &PDE(pmap, PDSLOT_KERN),
1407 	       nkpde * sizeof(pd_entry_t));
1408 	/* zero the rest */
1409 	bzero(&PDE(pmap, PDSLOT_KERN + nkpde),
1410 	       NBPG - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
1411 
1412 	/*
1413 	 * Intel CPUs need a special page table to be used during usermode
1414 	 * execution, one that lacks all kernel mappings.
1415 	 */
1416 	if (cpu_meltdown) {
1417 		pmap_alloc_pdir_intel_x86(pmap);
1418 
1419 		/* Copy PDEs from pmap_kernel's U-K view */
1420 		bcopy((void *)pmap_kernel()->pm_pdir_intel,
1421 		    (void *)pmap->pm_pdir_intel, NBPG);
1422 
1423 		DPRINTF("%s: pmap %p pm_pdir 0x%lx pm_pdirpa 0x%lx "
1424 		    "pdir_intel 0x%lx pdirpa_intel 0x%lx\n",
1425 		    __func__, pmap, pmap->pm_pdir, pmap->pm_pdirpa,
1426 		    pmap->pm_pdir_intel, pmap->pm_pdirpa_intel);
1427 	}
1428 
1429 	mtx_enter(&pmaps_lock);
1430 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1431 	mtx_leave(&pmaps_lock);
1432 }
1433 
1434 /*
1435  * pmap_destroy: drop reference count on pmap.   free pmap if
1436  *	reference count goes to zero.
1437  */
1438 
1439 void
1440 pmap_destroy(struct pmap *pmap)
1441 {
1442 	struct vm_page *pg;
1443 	int refs;
1444 
1445 	refs = atomic_dec_int_nv(&pmap->pm_obj.uo_refs);
1446 	if (refs > 0)
1447 		return;
1448 
1449 #ifdef MULTIPROCESSOR
1450 	pmap_tlb_droppmap(pmap);
1451 #endif
1452 
1453 	mtx_enter(&pmaps_lock);
1454 	LIST_REMOVE(pmap, pm_list);
1455 	mtx_leave(&pmaps_lock);
1456 
1457 	/* Free any remaining PTPs. */
1458 	while ((pg = RBT_ROOT(uvm_objtree, &pmap->pm_obj.memt)) != NULL) {
1459 		pg->wire_count = 0;
1460 		uvm_pagefree(pg);
1461 	}
1462 
1463 	km_free((void *)pmap->pm_pdir, pmap->pm_pdirsize, &kv_any, &kp_dirty);
1464 	pmap->pm_pdir = 0;
1465 
1466 	if (pmap->pm_pdir_intel) {
1467 		km_free((void *)pmap->pm_pdir_intel, pmap->pm_pdirsize,
1468 		    &kv_any, &kp_dirty);
1469 		pmap->pm_pdir_intel = 0;
1470 	}
1471 
1472 	pool_put(&pmap_pmap_pool, pmap);
1473 }
1474 
1475 
1476 /*
1477  *	Add a reference to the specified pmap.
1478  */
1479 
1480 void
1481 pmap_reference(struct pmap *pmap)
1482 {
1483 	atomic_inc_int(&pmap->pm_obj.uo_refs);
1484 }
1485 
1486 void
1487 pmap_activate(struct proc *p)
1488 {
1489 	KASSERT(curproc == p);
1490 	KASSERT(&p->p_addr->u_pcb == curpcb);
1491 	pmap_switch(NULL, p);
1492 }
1493 
1494 int nlazy_cr3_hit;
1495 int nlazy_cr3;
1496 
1497 void
1498 pmap_switch(struct proc *o, struct proc *p)
1499 {
1500 	struct pcb *pcb = &p->p_addr->u_pcb;
1501 	struct pmap *pmap, *opmap;
1502 	struct cpu_info *self = curcpu();
1503 
1504 	pmap = p->p_vmspace->vm_map.pmap;
1505 	opmap = self->ci_curpmap;
1506 
1507 	pcb->pcb_pmap = pmap;
1508 	pcb->pcb_cr3 = pmap->pm_pdirpa;
1509 
1510 	if (opmap == pmap) {
1511 		if (pmap != pmap_kernel())
1512 			nlazy_cr3_hit++;
1513 	} else if (o != NULL && pmap == pmap_kernel()) {
1514 		nlazy_cr3++;
1515 	} else {
1516 		self->ci_curpmap = pmap;
1517 		lcr3(pmap->pm_pdirpa);
1518 	}
1519 
1520 	/*
1521 	 * Meltdown: iff we're doing separate U+K and U-K page tables,
1522 	 * then record them in cpu_info for easy access in syscall and
1523 	 * interrupt trampolines.
1524 	 */
1525 	if (pmap->pm_pdirpa_intel) {
1526 		self->ci_kern_cr3 = pmap->pm_pdirpa;
1527 		self->ci_user_cr3 = pmap->pm_pdirpa_intel;
1528 	}
1529 
1530 	/*
1531 	 * Set the correct descriptor value (i.e. with the
1532 	 * correct code segment X limit) in the GDT.
1533 	 */
1534 	self->ci_gdt[GUCODE_SEL].sd = pmap->pm_codeseg;
1535 	self->ci_gdt[GUFS_SEL].sd = pcb->pcb_threadsegs[TSEG_FS];
1536 	self->ci_gdt[GUGS_SEL].sd = pcb->pcb_threadsegs[TSEG_GS];
1537 }
1538 
1539 void
1540 pmap_deactivate(struct proc *p)
1541 {
1542 }
1543 
1544 /*
1545  * pmap_extract: extract a PA for the given VA
1546  */
1547 
1548 int
1549 pmap_extract_86(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1550 {
1551 	pt_entry_t *ptes, pte;
1552 
1553 	ptes = pmap_map_ptes_86(pmap);
1554 	if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1555 		pte = ptes[atop(va)];
1556 		pmap_unmap_ptes_86(pmap);
1557 		if (!pmap_valid_entry(pte))
1558 			return 0;
1559 		if (pap != NULL)
1560 			*pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
1561 		return 1;
1562 	}
1563 	pmap_unmap_ptes_86(pmap);
1564 	return 0;
1565 }
1566 
1567 /*
1568  * pmap_virtual_space: used during bootup [uvm_pageboot_alloc] to
1569  *	determine the bounds of the kernel virtual address space.
1570  */
1571 
1572 void
1573 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
1574 {
1575 	*startp = virtual_avail;
1576 	*endp = virtual_end;
1577 }
1578 
1579 /*
1580  * pmap_zero_page: zero a page
1581  */
1582 void (*pagezero)(void *, size_t) = bzero;
1583 
1584 void
1585 pmap_zero_page(struct vm_page *pg)
1586 {
1587 	pmap_zero_phys(VM_PAGE_TO_PHYS(pg));
1588 }
1589 
1590 /*
1591  * pmap_zero_phys: same as pmap_zero_page, but for use before vm_pages are
1592  * initialized.
1593  */
1594 void
1595 pmap_zero_phys_86(paddr_t pa)
1596 {
1597 #ifdef MULTIPROCESSOR
1598 	int id = cpu_number();
1599 #endif
1600 	pt_entry_t *zpte = PTESLEW(zero_pte, id);
1601 	caddr_t zerova = VASLEW(pmap_zerop, id);
1602 
1603 #ifdef DIAGNOSTIC
1604 	if (*zpte)
1605 		panic("pmap_zero_phys_86: lock botch");
1606 #endif
1607 
1608 	*zpte = (pa & PG_FRAME) | PG_V | PG_RW;	/* map in */
1609 	pmap_update_pg((vaddr_t)zerova);	/* flush TLB */
1610 	pagezero(zerova, PAGE_SIZE);		/* zero */
1611 	*zpte = 0;
1612 }
1613 
1614 /*
1615  * pmap_flush_cache: flush the cache for a virtual address.
1616  */
1617 void
1618 pmap_flush_cache(vaddr_t addr, vsize_t len)
1619 {
1620 	vaddr_t i;
1621 
1622 	if (curcpu()->ci_cflushsz == 0) {
1623 		wbinvd_on_all_cpus();
1624 		return;
1625 	}
1626 
1627 	mfence();
1628 	for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
1629 		clflush(i);
1630 	mfence();
1631 }
1632 
1633 void
1634 pmap_flush_page(paddr_t pa)
1635 {
1636 #ifdef MULTIPROCESSOR
1637 	int id = cpu_number();
1638 #endif
1639 	pt_entry_t *pte;
1640 	caddr_t va;
1641 
1642 	KDASSERT(PHYS_TO_VM_PAGE(pa) != NULL);
1643 
1644 	if (cpu_pae) {
1645 		pmap_flush_page_pae(pa);
1646 		return;
1647 	}
1648 
1649 	pte = PTESLEW(flsh_pte, id);
1650 	va = VASLEW(pmap_flshp, id);
1651 
1652 #ifdef DIAGNOSTIC
1653 	if (*pte)
1654 		panic("pmap_flush_page: lock botch");
1655 #endif
1656 
1657 	*pte = (pa & PG_FRAME) | PG_V | PG_RW;
1658 	pmap_update_pg(va);
1659 	pmap_flush_cache((vaddr_t)va, PAGE_SIZE);
1660 	*pte = 0;
1661 	pmap_update_pg(va);
1662 }
1663 
1664 /*
1665  * pmap_copy_page: copy a page
1666  */
1667 
1668 void
1669 pmap_copy_page_86(struct vm_page *srcpg, struct vm_page *dstpg)
1670 {
1671 	paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg);
1672 	paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg);
1673 #ifdef MULTIPROCESSOR
1674 	int id = cpu_number();
1675 #endif
1676 	pt_entry_t *spte = PTESLEW(csrc_pte, id);
1677 	pt_entry_t *dpte = PTESLEW(cdst_pte, id);
1678 	caddr_t csrcva = VASLEW(pmap_csrcp, id);
1679 	caddr_t cdstva = VASLEW(pmap_cdstp, id);
1680 
1681 #ifdef DIAGNOSTIC
1682 	if (*spte || *dpte)
1683 		panic("pmap_copy_page_86: lock botch");
1684 #endif
1685 
1686 	*spte = (srcpa & PG_FRAME) | PG_V | PG_RW;
1687 	*dpte = (dstpa & PG_FRAME) | PG_V | PG_RW;
1688 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1689 	bcopy(csrcva, cdstva, PAGE_SIZE);
1690 	*spte = *dpte = 0;
1691 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1692 }
1693 
1694 /*
1695  * p m a p   r e m o v e   f u n c t i o n s
1696  *
1697  * functions that remove mappings
1698  */
1699 
1700 /*
1701  * pmap_remove_ptes: remove PTEs from a PTP
1702  *
1703  * => caller must hold pmap's lock
1704  * => PTP must be mapped into KVA
1705  * => PTP should be null if pmap == pmap_kernel()
1706  */
1707 
1708 void
1709 pmap_remove_ptes_86(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1710     vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1711 {
1712 	struct pv_entry *pve;
1713 	pt_entry_t *pte = (pt_entry_t *) ptpva;
1714 	struct vm_page *pg;
1715 	pt_entry_t opte;
1716 
1717 	/*
1718 	 * note that ptpva points to the PTE that maps startva.   this may
1719 	 * or may not be the first PTE in the PTP.
1720 	 *
1721 	 * we loop through the PTP while there are still PTEs to look at
1722 	 * and the wire_count is greater than 1 (because we use the wire_count
1723 	 * to keep track of the number of real PTEs in the PTP).
1724 	 */
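	/*
	 * (the wire_count is biased by one: a PTP with no valid PTEs
	 * still has wire_count == 1, which is why the loop below stops
	 * at wire_count > 1 and why pmap_do_remove_86() frees a PTP
	 * once its wire_count drops back to 1.)
	 */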
1725 
1726 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1727 			     ; pte++, startva += NBPG) {
1728 		if (!pmap_valid_entry(*pte))
1729 			continue;			/* VA not mapped */
1730 
1731 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W))
1732 			continue;
1733 
1734 		/* atomically save the old PTE and zero it */
1735 		opte = i386_atomic_testset_ul(pte, 0);
1736 
1737 		if (opte & PG_W)
1738 			pmap->pm_stats.wired_count--;
1739 		pmap->pm_stats.resident_count--;
1740 
1741 		if (ptp)
1742 			ptp->wire_count--;		/* dropping a PTE */
1743 
1744 		/*
1745 		 * Unnecessary work if not PG_PVLIST.
1746 		 */
1747 		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1748 
1749 		/*
1750 		 * if we are not on a pv list we are done.
1751 		 */
1752 		if ((opte & PG_PVLIST) == 0) {
1753 #ifdef DIAGNOSTIC
1754 			if (pg != NULL)
1755 				panic("pmap_remove_ptes_86: managed page "
1756 				     "without PG_PVLIST for 0x%lx", startva);
1757 #endif
1758 			continue;
1759 		}
1760 
1761 #ifdef DIAGNOSTIC
1762 		if (pg == NULL)
1763 			panic("pmap_remove_ptes_86: unmanaged page marked "
1764 			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
1765 			      startva, (u_long)(opte & PG_FRAME));
1766 #endif
1767 
1768 		/* sync R/M bits */
1769 		pmap_sync_flags_pte_86(pg, opte);
1770 		pve = pmap_remove_pv(pg, pmap, startva);
1771 		if (pve) {
1772 			pve->pv_next = *free_pvs;
1773 			*free_pvs = pve;
1774 		}
1775 
1776 		/* end of "for" loop: time for next pte */
1777 	}
1778 }
1779 
1780 /*
1781  * pmap_remove: top level mapping removal function
1782  *
1783  * => caller should not be holding any pmap locks
1784  */
1785 
1786 void
1787 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1788 {
1789 	pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
1790 }
1791 
1792 void
1793 pmap_do_remove_86(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1794 {
1795 	pt_entry_t *ptes;
1796 	paddr_t ptppa;
1797 	vaddr_t blkendva;
1798 	struct vm_page *ptp;
1799 	struct pv_entry *pve;
1800 	struct pv_entry *free_pvs = NULL;
1801 	TAILQ_HEAD(, vm_page) empty_ptps;
1802 	int shootall;
1803 	vaddr_t va;
1804 
1805 	TAILQ_INIT(&empty_ptps);
1806 
1807 	ptes = pmap_map_ptes_86(pmap);	/* locks pmap */
1808 
1809 	/*
1810 	 * Decide if we want to shoot the whole tlb or just the range.
1811 	 * Right now, we simply shoot everything when we remove more
1812 	 * than 32 pages, but never in the kernel pmap. XXX - tune.
1813 	 */
1814 	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1815 		shootall = 1;
1816 	else
1817 		shootall = 0;
1818 
1819 	for (va = sva ; va < eva ; va = blkendva) {
1820 		/* determine range of block */
1821 		blkendva = i386_round_pdr(va + 1);
1822 		if (blkendva > eva)
1823 			blkendva = eva;
1824 
1825 		/*
1826 		 * XXXCDC: our PTE mappings should never be removed
1827 		 * with pmap_remove!  if we allow this (and why would
1828 		 * we?) then we end up freeing the pmap's page
1829 		 * directory page (PDP) before we are finished using
1830 		 * it when we hit it in the recursive mapping.  this
1831 		 * is BAD.
1832 		 *
1833 	 * the long term solution is to move the PTEs out of user
1834 	 * address space and into kernel address space (up
1835 	 * with APTE); then we can set VM_MAXUSER_ADDRESS to
1836 	 * be VM_MAX_ADDRESS.
1837 		 */
1838 
1839 		if (pdei(va) == PDSLOT_PTE)
1840 			/* XXXCDC: ugly hack to avoid freeing PDP here */
1841 			continue;
1842 
1843 		if (!pmap_valid_entry(PDE(pmap, pdei(va))))
1844 			/* valid block? */
1845 			continue;
1846 
1847 		/* PA of the PTP */
1848 		ptppa = PDE(pmap, pdei(va)) & PG_FRAME;
1849 
1850 		/* get PTP if non-kernel mapping */
1851 		if (pmap == pmap_kernel()) {
1852 			/* we never free kernel PTPs */
1853 			ptp = NULL;
1854 		} else {
1855 			if (pmap->pm_ptphint &&
1856 			    VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
1857 				ptp = pmap->pm_ptphint;
1858 			} else {
1859 				ptp = PHYS_TO_VM_PAGE(ptppa);
1860 #ifdef DIAGNOSTIC
1861 				if (ptp == NULL)
1862 					panic("pmap_do_remove_86: unmanaged "
1863 					      "PTP detected");
1864 #endif
1865 			}
1866 		}
1867 		pmap_remove_ptes_86(pmap, ptp, (vaddr_t)&ptes[atop(va)],
1868 		    va, blkendva, flags, &free_pvs);
1869 
1870 		/* If PTP is no longer being used, free it. */
1871 		if (ptp && ptp->wire_count <= 1) {
1872 			pmap_drop_ptp_86(pmap, va, ptp, ptes);
1873 			TAILQ_INSERT_TAIL(&empty_ptps, ptp, pageq);
1874 		}
1875 
1876 		if (!shootall)
1877 			pmap_tlb_shootrange(pmap, va, blkendva);
1878 	}
1879 
1880 	if (shootall)
1881 		pmap_tlb_shoottlb();
1882 
1883 	pmap_unmap_ptes_86(pmap);
1884 	pmap_tlb_shootwait();
1885 
1886 	while ((pve = free_pvs) != NULL) {
1887 		free_pvs = pve->pv_next;
1888 		pool_put(&pmap_pv_pool, pve);
1889 	}
1890 
1891 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1892 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1893 		uvm_pagefree(ptp);
1894 	}
1895 }
1896 
1897 /*
1898  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1899  *
1900  * => R/M bits are sync'd back to attrs
1901  */
1902 
1903 void
1904 pmap_page_remove_86(struct vm_page *pg)
1905 {
1906 	struct pv_entry *pve;
1907 	struct pmap *pm;
1908 	pt_entry_t *ptes, opte;
1909 	TAILQ_HEAD(, vm_page) empty_ptps;
1910 	struct vm_page *ptp;
1911 
1912 	if (pg->mdpage.pv_list == NULL)
1913 		return;
1914 
1915 	TAILQ_INIT(&empty_ptps);
1916 
1917 	mtx_enter(&pg->mdpage.pv_mtx);
1918 	while ((pve = pg->mdpage.pv_list) != NULL) {
1919 		pmap_reference(pve->pv_pmap);
1920 		pm = pve->pv_pmap;
1921 		mtx_leave(&pg->mdpage.pv_mtx);
1922 
1923 		ptes = pmap_map_ptes_86(pm);		/* locks pmap */
1924 
1925 		/*
1926 		 * We dropped the pvlist lock before grabbing the pmap
1927 		 * lock to avoid lock ordering problems.  This means
1928 		 * we have to check the pvlist again since somebody
1929 		 * else might have modified it.  All we care about is
1930 		 * that the pvlist entry matches the pmap we just
1931 		 * locked.  If it doesn't, unlock the pmap and try
1932 		 * again.
1933 		 */
1934 		mtx_enter(&pg->mdpage.pv_mtx);
1935 		if ((pve = pg->mdpage.pv_list) == NULL ||
1936 		    pve->pv_pmap != pm) {
1937 			mtx_leave(&pg->mdpage.pv_mtx);
1938 			pmap_unmap_ptes_86(pm);		/* unlocks pmap */
1939 			pmap_destroy(pm);
1940 			mtx_enter(&pg->mdpage.pv_mtx);
1941 			continue;
1942 		}
1943 
1944 		pg->mdpage.pv_list = pve->pv_next;
1945 		mtx_leave(&pg->mdpage.pv_mtx);
1946 
1947 #ifdef DIAGNOSTIC
1948 		if (pve->pv_ptp && (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1949 				    PG_FRAME)
1950 		    != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1951 			printf("pmap_page_remove_86: pg=%p: va=%lx, "
1952 				"pv_ptp=%p\n",
1953 				pg, pve->pv_va, pve->pv_ptp);
1954 			printf("pmap_page_remove_86: PTP's phys addr: "
1955 				"actual=%x, recorded=%lx\n",
1956 				(PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1957 				PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
1958 			panic("pmap_page_remove_86: mapped managed page has "
1959 				"invalid pv_ptp field");
1960 		}
1961 #endif
1962 		opte = i386_atomic_testset_ul(&ptes[atop(pve->pv_va)], 0);
1963 
1964 		if (opte & PG_W)
1965 			pve->pv_pmap->pm_stats.wired_count--;
1966 		pve->pv_pmap->pm_stats.resident_count--;
1967 
1968 		/* sync R/M bits */
1969 		pmap_sync_flags_pte_86(pg, opte);
1970 
1971 		/* update the PTP reference count.  free if last reference. */
1972 		if (pve->pv_ptp && --pve->pv_ptp->wire_count <= 1) {
1973 			pmap_drop_ptp_86(pve->pv_pmap, pve->pv_va,
1974 			    pve->pv_ptp, ptes);
1975 			TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, pageq);
1976 		}
1977 
1978 		pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1979 
1980 		pmap_unmap_ptes_86(pve->pv_pmap);	/* unlocks pmap */
1981 		pmap_destroy(pve->pv_pmap);
1982 		pool_put(&pmap_pv_pool, pve);
1983 		mtx_enter(&pg->mdpage.pv_mtx);
1984 	}
1985 	mtx_leave(&pg->mdpage.pv_mtx);
1986 
1987 	pmap_tlb_shootwait();
1988 
1989 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1990 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1991 		uvm_pagefree(ptp);
1992 	}
1993 }
1994 
1995 /*
1996  * p m a p   a t t r i b u t e  f u n c t i o n s
1997  * functions that test/change managed page's attributes
1998  * since a page can be mapped multiple times we must check each PTE that
1999  * maps it by going down the pv lists.
2000  */
2001 
2002 /*
2003  * pmap_test_attrs: test a page's attributes
2004  */
2005 
2006 int
2007 pmap_test_attrs_86(struct vm_page *pg, int testbits)
2008 {
2009 	struct pv_entry *pve;
2010 	pt_entry_t *ptes, pte;
2011 	u_long mybits, testflags;
2012 	paddr_t ptppa;
2013 
2014 	testflags = pmap_pte2flags(testbits);
2015 
2016 	if (pg->pg_flags & testflags)
2017 		return 1;
2018 
2019 	mybits = 0;
2020 	mtx_enter(&pg->mdpage.pv_mtx);
2021 	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
2022 	    pve = pve->pv_next) {
2023 		ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
2024 		ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
2025 		pte = ptes[ptei(pve->pv_va)];
2026 		pmap_tmpunmap_pa();
2027 		mybits |= (pte & testbits);
2028 	}
2029 	mtx_leave(&pg->mdpage.pv_mtx);
2030 
2031 	if (mybits == 0)
2032 		return 0;
2033 
2034 	atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
2035 
2036 	return 1;
2037 }
2038 
2039 /*
2040  * pmap_clear_attrs: change a page's attributes
2041  *
2042  * => we return 1 if we cleared one of the bits we were asked to
2043  */
2044 
2045 int
2046 pmap_clear_attrs_86(struct vm_page *pg, int clearbits)
2047 {
2048 	struct pv_entry *pve;
2049 	pt_entry_t *ptes, opte;
2050 	u_long clearflags;
2051 	paddr_t ptppa;
2052 	int result;
2053 
2054 	clearflags = pmap_pte2flags(clearbits);
2055 
2056 	result = pg->pg_flags & clearflags;
2057 	if (result)
2058 		atomic_clearbits_int(&pg->pg_flags, clearflags);
2059 
2060 	mtx_enter(&pg->mdpage.pv_mtx);
2061 	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
2062 		ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
2063 		ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
2064 #ifdef DIAGNOSTIC
2065 		if (!pmap_valid_entry(PDE(pve->pv_pmap, pdei(pve->pv_va))))
2066 			panic("pmap_clear_attrs_86: mapping without PTP "
2067 			      "detected");
2068 #endif
2069 
2070 		opte = ptes[ptei(pve->pv_va)];
2071 		if (opte & clearbits) {
2072 			result = 1;
2073 			i386_atomic_clearbits_l(&ptes[ptei(pve->pv_va)],
2074 			    (opte & clearbits));
2075 			pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
2076 		}
2077 		pmap_tmpunmap_pa();
2078 	}
2079 	mtx_leave(&pg->mdpage.pv_mtx);
2080 
2081 	pmap_tlb_shootwait();
2082 
2083 	return (result != 0);
2084 }
2085 
2086 /*
2087  * p m a p   p r o t e c t i o n   f u n c t i o n s
2088  */
2089 
2090 /*
2091  * pmap_page_protect: change the protection of all recorded mappings
2092  *	of a managed page
2093  *
2094  * => NOTE: this is an inline function in pmap.h
2095  */
2096 
2097 /* see pmap.h */
2098 
2099 /*
2100  * pmap_protect: set the protection of the pages in a pmap
2101  *
2102  * => NOTE: this is an inline function in pmap.h
2103  */
2104 
2105 /* see pmap.h */
2106 
2107 /*
2108  * pmap_write_protect: write-protect pages in a pmap
2109  */
2110 
2111 void
2112 pmap_write_protect_86(struct pmap *pmap, vaddr_t sva, vaddr_t eva,
2113     vm_prot_t prot)
2114 {
2115 	pt_entry_t *ptes, *spte, *epte, npte, opte;
2116 	vaddr_t blockend;
2117 	u_int32_t md_prot;
2118 	vaddr_t va;
2119 	int shootall = 0;
2120 
2121 	ptes = pmap_map_ptes_86(pmap);		/* locks pmap */
2122 
2123 	/* should be ok, but just in case ... */
2124 	sva &= PG_FRAME;
2125 	eva &= PG_FRAME;
2126 
2127 	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
2128 		shootall = 1;
2129 
2130 	for (va = sva; va < eva; va = blockend) {
2131 		blockend = (va & PD_MASK) + NBPD;
2132 		if (blockend > eva)
2133 			blockend = eva;
2134 
2135 		/*
2136 		 * XXXCDC: our PTE mappings should never be write-protected!
2137 		 *
2138 	 * the long term solution is to move the PTEs out of user
2139 	 * address space and into kernel address space (up
2140 	 * with APTE); then we can set VM_MAXUSER_ADDRESS to
2141 	 * be VM_MAX_ADDRESS.
2142 		 */
2143 
2144 		/* XXXCDC: ugly hack to avoid freeing PDP here */
2145 		if (pdei(va) == PDSLOT_PTE)
2146 			continue;
2147 
2148 		/* empty block? */
2149 		if (!pmap_valid_entry(PDE(pmap, pdei(va))))
2150 			continue;
2151 
2152 		md_prot = protection_codes[prot];
2153 		if (va < VM_MAXUSER_ADDRESS)
2154 			md_prot |= PG_u;
2155 		else if (va < VM_MAX_ADDRESS)
2156 			/* XXX: write-prot our PTES? never! */
2157 			md_prot |= PG_RW;
2158 
2159 		spte = &ptes[atop(va)];
2160 		epte = &ptes[atop(blockend)];
2161 
2162 		for (/*null */; spte < epte ; spte++, va += PAGE_SIZE) {
2163 
2164 			if (!pmap_valid_entry(*spte))	/* no mapping? */
2165 				continue;
2166 
2167 			opte = *spte;
2168 			npte = (opte & ~PG_PROT) | md_prot;
2169 
2170 			if (npte != opte) {
2171 				pmap_exec_account(pmap, va, *spte, npte);
2172 				i386_atomic_clearbits_l(spte,
2173 				    (~md_prot & opte) & PG_PROT);
2174 				i386_atomic_setbits_l(spte, md_prot);
2175 			}
2176 		}
2177 	}
2178 	if (shootall)
2179 		pmap_tlb_shoottlb();
2180 	else
2181 		pmap_tlb_shootrange(pmap, sva, eva);
2182 
2183 	pmap_unmap_ptes_86(pmap);		/* unlocks pmap */
2184 	pmap_tlb_shootwait();
2185 }
2186 
2187 /*
2188  * end of protection functions
2189  */
2190 
2191 /*
2192  * pmap_unwire: clear the wired bit in the PTE
2193  *
2194  * => mapping should already be in map
2195  */
2196 
2197 void
2198 pmap_unwire_86(struct pmap *pmap, vaddr_t va)
2199 {
2200 	pt_entry_t *ptes;
2201 
2202 	if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
2203 		ptes = pmap_map_ptes_86(pmap);		/* locks pmap */
2204 
2205 #ifdef DIAGNOSTIC
2206 		if (!pmap_valid_entry(ptes[atop(va)]))
2207 			panic("pmap_unwire_86: invalid (unmapped) va "
2208 			      "0x%lx", va);
2209 #endif
2210 
2211 		if ((ptes[atop(va)] & PG_W) != 0) {
2212 			i386_atomic_clearbits_l(&ptes[atop(va)], PG_W);
2213 			pmap->pm_stats.wired_count--;
2214 		}
2215 #ifdef DIAGNOSTIC
2216 		else {
2217 			printf("pmap_unwire_86: wiring for pmap %p va 0x%lx "
2218 			       "didn't change!\n", pmap, va);
2219 		}
2220 #endif
2221 		pmap_unmap_ptes_86(pmap);		/* unlocks map */
2222 	}
2223 #ifdef DIAGNOSTIC
2224 	else {
2225 		panic("pmap_unwire_86: invalid PDE");
2226 	}
2227 #endif
2228 }
2229 
2230 /*
2231  * pmap_enter: enter a mapping into a pmap
2232  *
2233  * => must be done "now" ... no lazy-evaluation
2234  */
2235 
2236 int
2237 pmap_enter_86(struct pmap *pmap, vaddr_t va, paddr_t pa,
2238     vm_prot_t prot, int flags)
2239 {
2240 	pt_entry_t *ptes, opte, npte;
2241 	struct vm_page *ptp;
2242 	struct pv_entry *pve, *opve = NULL;
2243 	int wired = (flags & PMAP_WIRED) != 0;
2244 	int nocache = (pa & PMAP_NOCACHE) != 0;
2245 	int wc = (pa & PMAP_WC) != 0;
2246 	struct vm_page *pg = NULL;
2247 	int error, wired_count, resident_count, ptp_count;
2248 
2249 	KASSERT(!(wc && nocache));
2250 	pa &= PMAP_PA_MASK;	/* nuke flags from pa */
2251 
2252 #ifdef DIAGNOSTIC
2253 	/* sanity check: totally out of range? */
2254 	if (va >= VM_MAX_KERNEL_ADDRESS)
2255 		panic("pmap_enter_86: too big");
2256 
2257 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
2258 		panic("pmap_enter_86: trying to map over PDP/APDP!");
2259 
2260 	/* sanity check: kernel PTPs should already have been pre-allocated */
2261 	if (va >= VM_MIN_KERNEL_ADDRESS &&
2262 	    !pmap_valid_entry(PDE(pmap, pdei(va))))
2263 		panic("pmap_enter: missing kernel PTP!");
2264 #endif
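	/*
	 * Preallocate a pv_entry before mapping the PTEs and locking
	 * the pmap; it may be consumed below when the new mapping is
	 * put on the page's pv list.
	 */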
2265 	if (pmap_initialized)
2266 		pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
2267 	else
2268 		pve = NULL;
2269 	wired_count = resident_count = ptp_count = 0;
2270 
2271 	/*
2272 	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
2273 	 */
2274 
2275 	ptes = pmap_map_ptes_86(pmap);		/* locks pmap */
2276 	if (pmap == pmap_kernel()) {
2277 		ptp = NULL;
2278 	} else {
2279 		ptp = pmap_get_ptp_86(pmap, pdei(va));
2280 		if (ptp == NULL) {
2281 			if (flags & PMAP_CANFAIL) {
2282 				pmap_unmap_ptes_86(pmap);
2283 				error = ENOMEM;
2284 				goto out;
2285 			}
2286 			panic("pmap_enter_86: get ptp failed");
2287 		}
2288 	}
2289 	/*
2290 	 * not allowed to sleep after here!
2291 	 */
2292 	opte = ptes[atop(va)];			/* old PTE */
2293 
2294 	/*
2295 	 * is there currently a valid mapping at our VA?
2296 	 */
2297 
2298 	if (pmap_valid_entry(opte)) {
2299 
2300 		/*
2301 		 * first, calculate pm_stats updates.  resident count will not
2302 		 * change since we are replacing/changing a valid
2303 		 * mapping.  wired count might change...
2304 		 */
2305 
2306 		if (wired && (opte & PG_W) == 0)
2307 			wired_count++;
2308 		else if (!wired && (opte & PG_W) != 0)
2309 			wired_count--;
2310 
2311 		/*
2312 		 * is the currently mapped PA the same as the one we
2313 		 * want to map?
2314 		 */
2315 
2316 		if ((opte & PG_FRAME) == pa) {
2317 
2318 			/* if this is on the PVLIST, sync R/M bit */
2319 			if (opte & PG_PVLIST) {
2320 				pg = PHYS_TO_VM_PAGE(pa);
2321 #ifdef DIAGNOSTIC
2322 				if (pg == NULL)
2323 					panic("pmap_enter_86: same pa "
2324 					     "PG_PVLIST mapping with "
2325 					     "unmanaged page "
2326 					     "pa = 0x%lx (0x%lx)", pa,
2327 					     atop(pa));
2328 #endif
2329 				pmap_sync_flags_pte_86(pg, opte);
2330 			}
2331 			goto enter_now;
2332 		}
2333 
2334 		/*
2335 		 * changing PAs: we must remove the old one first
2336 		 */
2337 
2338 		/*
2339 		 * if current mapping is on a pvlist,
2340 		 * remove it (sync R/M bits)
2341 		 */
2342 
2343 		if (opte & PG_PVLIST) {
2344 			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2345 #ifdef DIAGNOSTIC
2346 			if (pg == NULL)
2347 				panic("pmap_enter_86: PG_PVLIST mapping with "
2348 				      "unmanaged page "
2349 				      "pa = 0x%lx (0x%lx)", pa, atop(pa));
2350 #endif
2351 			pmap_sync_flags_pte_86(pg, opte);
2352 			opve = pmap_remove_pv(pg, pmap, va);
2353 			pg = NULL; /* This is not the page we are looking for */
2354 		}
2355 	} else {	/* opte not valid */
2356 		resident_count++;
2357 		if (wired)
2358 			wired_count++;
2359 		if (ptp)
2360 			ptp_count++;	/* count # of valid entries */
2361 	}
2362 
2363 	/*
2364 	 * pve is either NULL or points to a now-free pv_entry structure
2365 	 * (the latter case is if we called pmap_remove_pv above).
2366 	 *
2367 	 * if this entry is to be on a pvlist, enter it now.
2368 	 */
2369 
2370 	if (pmap_initialized && pg == NULL)
2371 		pg = PHYS_TO_VM_PAGE(pa);
2372 
2373 	if (pg != NULL) {
2374 		if (pve == NULL) {
2375 			pve = opve;
2376 			opve = NULL;
2377 		}
2378 		if (pve == NULL) {
2379 			if (flags & PMAP_CANFAIL) {
2380 				pmap_unmap_ptes_86(pmap);
2381 				error = ENOMEM;
2382 				goto out;
2383 			}
2384 			panic("pmap_enter_86: no pv entries available");
2385 		}
2386 		/* lock pg when adding */
2387 		pmap_enter_pv(pg, pve, pmap, va, ptp);
2388 		pve = NULL;
2389 	}
2390 
2391 enter_now:
2392 	/*
2393 	 * at this point pg is !NULL if we want the PG_PVLIST bit set
2394 	 */
2395 
2396 	npte = pa | protection_codes[prot] | PG_V;
2397 	pmap_exec_account(pmap, va, opte, npte);
2398 	if (wired)
2399 		npte |= PG_W;
2400 	if (nocache)
2401 		npte |= PG_N;
2402 	if (va < VM_MAXUSER_ADDRESS)
2403 		npte |= PG_u;
2404 	else if (va < VM_MAX_ADDRESS)
2405 		npte |= PG_RW;	/* XXXCDC: no longer needed? */
2406 	if (pmap == pmap_kernel())
2407 		npte |= pmap_pg_g;
2408 	if (flags & PROT_READ)
2409 		npte |= PG_U;
2410 	if (flags & PROT_WRITE)
2411 		npte |= PG_M;
2412 	if (pg) {
2413 		npte |= PG_PVLIST;
2414 		if (pg->pg_flags & PG_PMAP_WC) {
2415 			KASSERT(nocache == 0);
2416 			wc = 1;
2417 		}
2418 		pmap_sync_flags_pte_86(pg, npte);
2419 	}
2420 	if (wc)
2421 		npte |= pmap_pg_wc;
2422 
2423 	opte = i386_atomic_testset_ul(&ptes[atop(va)], npte);
2424 	if (ptp)
2425 		ptp->wire_count += ptp_count;
2426 	pmap->pm_stats.resident_count += resident_count;
2427 	pmap->pm_stats.wired_count += wired_count;
2428 
2429 	if (pmap_valid_entry(opte)) {
2430 		if (nocache && (opte & PG_N) == 0)
2431 			wbinvd_on_all_cpus(); /* XXX clflush before we enter? */
2432 		pmap_tlb_shootpage(pmap, va);
2433 	}
2434 
2435 	pmap_unmap_ptes_86(pmap);
2436 	pmap_tlb_shootwait();
2437 
2438 	error = 0;
2439 
2440 out:
2441 	if (pve)
2442 		pool_put(&pmap_pv_pool, pve);
2443 	if (opve)
2444 		pool_put(&pmap_pv_pool, opve);
2445 
2446 	return error;
2447 }
2448 
2449 /*
2450  * Allocate an extra PD page and PT pages as needed to map kernel
2451  * pages used for the U-K mappings.  These special mappings are set
2452  * up during bootstrap, are never removed, and are part of
2453  * pmap_kernel.
2454  *
2455  * New pmaps inherit the kernel portion of pmap_kernel including
2456  * the special mappings (see pmap_pinit_pd_86()).
2457  *
2458  * To be able to release PT pages when migrating to PAE paging, use
2459  * wire_count for number of PTEs in the PT page.
2460  */
2461 void
2462 pmap_enter_special_86(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int32_t flags)
2463 {
2464 	struct pmap	*pmap = pmap_kernel();
2465 	struct vm_page	*ptppg = NULL;
2466 	pd_entry_t	*pd, *ptp;
2467 	pt_entry_t	*ptes;
2468 	uint32_t	 l2idx, l1idx;
2469 	paddr_t		 npa;
2470 
2471 	/* If CPU is secure, no need to do anything */
2472 	if (!cpu_meltdown)
2473 		return;
2474 
2475 	/* Must be kernel VA */
2476 	if (va < VM_MIN_KERNEL_ADDRESS)
2477 		panic("invalid special mapping va 0x%lx requested", va);
2478 
2479 	if (!pmap->pm_pdir_intel)
2480 		pmap_alloc_pdir_intel_x86(pmap);
2481 
2482 	DPRINTF("%s: pm_pdir_intel 0x%x pm_pdirpa_intel 0x%x\n", __func__,
2483 	    (uint32_t)pmap->pm_pdir_intel, (uint32_t)pmap->pm_pdirpa_intel);
2484 
2485 	l2idx = pdei(va);
2486 	l1idx = ptei(va);
2487 
2488 	DPRINTF("%s: va 0x%08lx pa 0x%08lx prot 0x%08lx flags 0x%08x "
2489 	    "l2idx %u l1idx %u\n", __func__, va, pa, (unsigned long)prot,
2490 	    flags, l2idx, l1idx);
2491 
2492 	if ((pd = (pd_entry_t *)pmap->pm_pdir_intel) == NULL)
2493 		panic("%s: PD not initialized for pmap @ %p", __func__, pmap);
2494 
2495 	/* npa = physaddr of PT page */
2496 	npa = pd[l2idx] & PMAP_PA_MASK;
2497 
2498 	/* Valid PDE for the 4MB region containing va? */
2499 	if (!npa) {
2500 		/*
2501 		 * No valid PDE - allocate PT page and set PDE.  We
2502 		 * get it from pm_obj, which is used for PT pages.
2503 		 * We calculate the offset from l2idx+1024, so we are
2504 		 * beyond the regular PT pages, for which
2505 		 * 0 <= l2idx < 1024 holds.
2506 		 */
2507 		ptppg = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(l2idx + 1024),
2508 		    NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2509 		if (ptppg == NULL)
2510 			panic("%s: failed to allocate PT page", __func__);
2511 
2512 		atomic_clearbits_int(&ptppg->pg_flags, PG_BUSY);
2513 		ptppg->wire_count = 1;	/* no mappings yet */
2514 
2515 		npa = VM_PAGE_TO_PHYS(ptppg);
2516 		pd[l2idx] = (npa | PG_RW | PG_V | PG_M | PG_U);
2517 
2518 		DPRINTF("%s: allocated new PT page at phys 0x%x, "
2519 		    "setting PDE[%d] = 0x%x\n", __func__, (uint32_t)npa,
2520 		    l2idx, pd[l2idx]);
2521 	}
2522 
2523 	/* temporarily map PT page and set PTE for U-K mapping */
2524 	if (ptppg == NULL && (ptppg = PHYS_TO_VM_PAGE(npa)) == NULL)
2525 		panic("%s: no vm_page for PT page", __func__);
2526 	mtx_enter(&ptppg->mdpage.pv_mtx);
2527 	ptp = (pd_entry_t *)pmap_tmpmap_pa(npa);
2528 	ptp[l1idx] = (pa | protection_codes[prot] | PG_V | PG_M | PG_U | flags);
2529 	ptppg->wire_count++;
2530 	DPRINTF("%s: setting PTE[%d] = 0x%x (wire_count %d)\n", __func__,
2531 	    l1idx, ptp[l1idx], ptppg->wire_count);
2532 	pmap_tmpunmap_pa();
2533 	mtx_leave(&ptppg->mdpage.pv_mtx);
2534 
2535 	/*
2536 	 * if supported, set the PG_G flag on the corresponding U+K
2537 	 * entry.  U+K mappings can use PG_G, as they are mapped
2538 	 * along with user land anyway.
2539 	 */
2540 	if (!(cpu_feature & CPUID_PGE))
2541 		return;
2542 	ptes = pmap_map_ptes_86(pmap);	/* pmap_kernel -> PTE_BASE */
2543 	if (pmap_valid_entry(ptes[atop(va)]))
2544 		ptes[atop(va)] |= PG_G;
2545 	else
2546 		DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
2547 	pmap_unmap_ptes_86(pmap);	/* pmap_kernel -> nothing */
2548 }
2549 
2550 /*
2551  * pmap_growkernel: increase usage of KVM space
2552  *
2553  * => we allocate new PTPs for the kernel and install them in all
2554  *	the pmaps on the system.
2555  */
2556 
2557 vaddr_t
2558 pmap_growkernel_86(vaddr_t maxkvaddr)
2559 {
2560 	struct pmap *kpm = pmap_kernel(), *pm;
2561 	int needed_kpde;   /* needed number of kernel PTPs */
2562 	int s;
2563 	paddr_t ptaddr;
2564 
2565 	needed_kpde = (int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
2566 		/ NBPD;
2567 	if (needed_kpde <= nkpde)
2568 		goto out;		/* we are OK */
2569 
2570 	/*
2571 	 * whoops!   we need to add kernel PTPs
2572 	 */
2573 
2574 	s = splhigh();	/* to be safe */
2575 
2576 	for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {
2577 
2578 		if (uvm.page_init_done == 0) {
2579 
2580 			/*
2581 			 * we're growing the kernel pmap early (from
2582 			 * uvm_pageboot_alloc()).  this case must be
2583 			 * handled a little differently.
2584 			 */
2585 
2586 			if (uvm_page_physget(&ptaddr) == 0)
2587 				panic("pmap_growkernel: out of memory");
2588 			pmap_zero_phys_86(ptaddr);
2589 
2590 			PDE(kpm, PDSLOT_KERN + nkpde) =
2591 				ptaddr | PG_RW | PG_V | PG_U | PG_M;
2592 
2593 			/* count PTP as resident */
2594 			kpm->pm_stats.resident_count++;
2595 			continue;
2596 		}
2597 
2598 		/*
2599 		 * THIS *MUST* BE CODED SO AS TO WORK IN THE
2600 		 * pmap_initialized == 0 CASE!  WE MAY BE
2601 		 * INVOKED WHILE pmap_init() IS RUNNING!
2602 		 */
2603 
2604 		while (!pmap_alloc_ptp_86(kpm, PDSLOT_KERN + nkpde, 0))
2605 			uvm_wait("pmap_growkernel");
2606 
2607 		/* distribute new kernel PTP to all active pmaps */
2608 		mtx_enter(&pmaps_lock);
2609 		LIST_FOREACH(pm, &pmaps, pm_list) {
2610 			PDE(pm, PDSLOT_KERN + nkpde) =
2611 				PDE(kpm, PDSLOT_KERN + nkpde);
2612 		}
2613 		mtx_leave(&pmaps_lock);
2614 	}
2615 
2616 	splx(s);
2617 
2618 out:
2619 	return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
2620 }
2621 
2622 #ifdef MULTIPROCESSOR
2623 /*
2624  * Locking for tlb shootdown.
2625  *
2626  * We lock by setting tlb_shoot_wait to the number of cpus that will
2627  * receive our tlb shootdown. After sending the IPIs, we don't need to
2628  * worry about locking order or interrupts spinning for the lock because
2629  * the call that grabs the "lock" isn't the one that releases it. And
2630  * there is nothing that can block the IPI that releases the lock.
2631  *
2632  * The functions are organized so that we first count the number of
2633  * cpus we need to send the IPI to, then we grab the counter, then
2634  * we send the IPIs, then we finally do our own shootdown.
2635  *
2636  * Our shootdown is last to make it parallel with the other cpus
2637  * to shorten the spin time.
2638  *
2639  * Notice that we depend on failures to send IPIs only being able to
2640  * happen during boot. If they happen later, the above assumption
2641  * doesn't hold since we can end up in situations where no one will
2642  * release the lock if we get an interrupt in a bad moment.
2643  */
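/*
 * A minimal sketch of the initiator side described above, as it appears
 * in the functions below (the remote side is assumed to be the IPI
 * handler, which decrements tlb_shoot_wait once it has invalidated):
 *
 *	wait = number of other running cpus that must be notified;
 *	while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0)
 *		spin until the previous shootdown has drained;
 *	tlb_shoot_addr1 = va;		(publish the target address)
 *	send the LAPIC IPI to every cpu in the mask;
 *	pmap_update_pg(va);		(local invalidation, done last)
 */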
2644 
2645 volatile int tlb_shoot_wait __attribute__((section(".kudata")));
2646 
2647 volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
2648 volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
2649 
2650 void
2651 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
2652 {
2653 	struct cpu_info *ci, *self = curcpu();
2654 	CPU_INFO_ITERATOR cii;
2655 	int wait = 0;
2656 	u_int64_t mask = 0;
2657 
2658 	CPU_INFO_FOREACH(cii, ci) {
2659 		if (ci == self || !pmap_is_active(pm, ci) ||
2660 		    !(ci->ci_flags & CPUF_RUNNING))
2661 			continue;
2662 		mask |= (1ULL << ci->ci_cpuid);
2663 		wait++;
2664 	}
2665 
2666 	if (wait > 0) {
2667 		int s = splvm();
2668 
2669 		while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) {
2670 			while (tlb_shoot_wait != 0)
2671 				CPU_BUSY_CYCLE();
2672 		}
2673 		tlb_shoot_addr1 = va;
2674 		CPU_INFO_FOREACH(cii, ci) {
2675 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2676 				continue;
2677 			if (i386_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
2678 				panic("pmap_tlb_shootpage: ipi failed");
2679 		}
2680 		splx(s);
2681 	}
2682 
2683 	if (pmap_is_curpmap(pm))
2684 		pmap_update_pg(va);
2685 }
2686 
2687 void
2688 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
2689 {
2690 	struct cpu_info *ci, *self = curcpu();
2691 	CPU_INFO_ITERATOR cii;
2692 	int wait = 0;
2693 	u_int64_t mask = 0;
2694 	vaddr_t va;
2695 
2696 	CPU_INFO_FOREACH(cii, ci) {
2697 		if (ci == self || !pmap_is_active(pm, ci) ||
2698 		    !(ci->ci_flags & CPUF_RUNNING))
2699 			continue;
2700 		mask |= (1ULL << ci->ci_cpuid);
2701 		wait++;
2702 	}
2703 
2704 	if (wait > 0) {
2705 		int s = splvm();
2706 
2707 		while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) {
2708 			while (tlb_shoot_wait != 0)
2709 				CPU_BUSY_CYCLE();
2710 		}
2711 		tlb_shoot_addr1 = sva;
2712 		tlb_shoot_addr2 = eva;
2713 		CPU_INFO_FOREACH(cii, ci) {
2714 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2715 				continue;
2716 			if (i386_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
2717 				panic("pmap_tlb_shootrange: ipi failed");
2718 		}
2719 		splx(s);
2720 	}
2721 
2722 	if (pmap_is_curpmap(pm))
2723 		for (va = sva; va < eva; va += PAGE_SIZE)
2724 			pmap_update_pg(va);
2725 }
2726 
2727 void
2728 pmap_tlb_shoottlb(void)
2729 {
2730 	struct cpu_info *ci, *self = curcpu();
2731 	CPU_INFO_ITERATOR cii;
2732 	int wait = 0;
2733 	u_int64_t mask = 0;
2734 
2735 	CPU_INFO_FOREACH(cii, ci) {
2736 		if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
2737 			continue;
2738 		mask |= (1ULL << ci->ci_cpuid);
2739 		wait++;
2740 	}
2741 
2742 	if (wait) {
2743 		int s = splvm();
2744 
2745 		while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) {
2746 			while (tlb_shoot_wait != 0)
2747 				CPU_BUSY_CYCLE();
2748 		}
2749 
2750 		CPU_INFO_FOREACH(cii, ci) {
2751 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2752 				continue;
2753 			if (i386_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
2754 				panic("pmap_tlb_shoottlb: ipi failed");
2755 		}
2756 		splx(s);
2757 	}
2758 
2759 	tlbflush();
2760 }
2761 
2762 void
2763 pmap_tlb_droppmap(struct pmap *pm)
2764 {
2765 	struct cpu_info *ci, *self = curcpu();
2766 	CPU_INFO_ITERATOR cii;
2767 	int wait = 0;
2768 	u_int64_t mask = 0;
2769 
2770 	CPU_INFO_FOREACH(cii, ci) {
2771 		if (ci == self || !(ci->ci_flags & CPUF_RUNNING) ||
2772 		    ci->ci_curpmap != pm)
2773 			continue;
2774 		mask |= (1ULL << ci->ci_cpuid);
2775 		wait++;
2776 	}
2777 
2778 	if (wait) {
2779 		int s = splvm();
2780 
2781 		while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) {
2782 			while (tlb_shoot_wait != 0)
2783 				CPU_BUSY_CYCLE();
2784 		}
2785 
2786 		CPU_INFO_FOREACH(cii, ci) {
2787 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2788 				continue;
2789 			if (i386_fast_ipi(ci, LAPIC_IPI_RELOADCR3) != 0)
2790 				panic("pmap_tlb_droppmap: ipi failed");
2791 		}
2792 		splx(s);
2793 	}
2794 
2795 	if (self->ci_curpmap == pm)
2796 		pmap_activate(curproc);
2797 
2798 	pmap_tlb_shootwait();
2799 }
2800 
2801 void
2802 pmap_tlb_shootwait(void)
2803 {
2804 	while (tlb_shoot_wait != 0)
2805 		CPU_BUSY_CYCLE();
2806 }
2807 
2808 #else
2809 
2810 void
2811 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
2812 {
2813 	if (pmap_is_curpmap(pm))
2814 		pmap_update_pg(va);
2815 
2816 }
2817 
2818 void
2819 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
2820 {
2821 	vaddr_t va;
2822 
2823 	for (va = sva; va < eva; va += PAGE_SIZE)
2824 		pmap_update_pg(va);
2825 }
2826 
2827 void
2828 pmap_tlb_shoottlb(void)
2829 {
2830 	tlbflush();
2831 }
2832 #endif /* MULTIPROCESSOR */
2833 
2834 u_int32_t	(*pmap_pte_set_p)(vaddr_t, paddr_t, u_int32_t) =
2835     pmap_pte_set_86;
2836 u_int32_t	(*pmap_pte_setbits_p)(vaddr_t, u_int32_t, u_int32_t) =
2837     pmap_pte_setbits_86;
2838 u_int32_t	(*pmap_pte_bits_p)(vaddr_t) = pmap_pte_bits_86;
2839 paddr_t		(*pmap_pte_paddr_p)(vaddr_t) = pmap_pte_paddr_86;
2840 int		(*pmap_clear_attrs_p)(struct vm_page *, int) =
2841     pmap_clear_attrs_86;
2842 int		(*pmap_enter_p)(pmap_t, vaddr_t, paddr_t, vm_prot_t, int) =
2843     pmap_enter_86;
2844 void		(*pmap_enter_special_p)(vaddr_t, paddr_t, vm_prot_t,
2845     u_int32_t) = pmap_enter_special_86;
2846 int		(*pmap_extract_p)(pmap_t, vaddr_t, paddr_t *) =
2847     pmap_extract_86;
2848 vaddr_t		(*pmap_growkernel_p)(vaddr_t) = pmap_growkernel_86;
2849 void		(*pmap_page_remove_p)(struct vm_page *) = pmap_page_remove_86;
2850 void		(*pmap_do_remove_p)(struct pmap *, vaddr_t, vaddr_t, int) =
2851     pmap_do_remove_86;
2852 int		 (*pmap_test_attrs_p)(struct vm_page *, int) =
2853     pmap_test_attrs_86;
2854 void		(*pmap_unwire_p)(struct pmap *, vaddr_t) = pmap_unwire_86;
2855 void		(*pmap_write_protect_p)(struct pmap *, vaddr_t, vaddr_t,
2856     vm_prot_t) = pmap_write_protect_86;
2857 void		(*pmap_pinit_pd_p)(pmap_t) = pmap_pinit_pd_86;
2858 void		(*pmap_zero_phys_p)(paddr_t) = pmap_zero_phys_86;
2859 void		(*pmap_copy_page_p)(struct vm_page *, struct vm_page *) =
2860     pmap_copy_page_86;
2861