1 /*	$OpenBSD: pmapae.c,v 1.75 2024/11/16 10:09:08 mpi Exp $	*/
2 
3 /*
4  * Copyright (c) 2006-2008 Michael Shalayeff
5  * All rights reserved.
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER IN
16  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
17  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 /*
20  * Copyright (c) 1997 Charles D. Cranor and Washington University.
21  * All rights reserved.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  * 1. Redistributions of source code must retain the above copyright
27  *    notice, this list of conditions and the following disclaimer.
28  * 2. Redistributions in binary form must reproduce the above copyright
29  *    notice, this list of conditions and the following disclaimer in the
30  *    documentation and/or other materials provided with the distribution.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
33  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
34  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
35  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
36  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
38  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
39  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
41  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42  *
43  *	from OpenBSD: pmap.c,v 1.85 2005/11/18 17:05:04 brad Exp
44  */
45 
46 /*
47  * pmap.c: i386 pmap module rewrite
48  * Chuck Cranor <chuck@ccrc.wustl.edu>
49  * 11-Aug-97
50  *
51  * history of this pmap module: in addition to my own input, i used
52  *    the following references for this rewrite of the i386 pmap:
53  *
54  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
55  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
56  *     it was then ported to the i386 by William Jolitz of UUNET
57  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
58  *     project fixed some bugs and provided some speed ups.
59  *
60  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
61  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
62  *     and David Greenman.
63  *
64  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
65  *     between several processors.   the VAX version was done by
66  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
67  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
68  *     David Golub, and Richard Draves.    the alpha version was
69  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
70  *     (NetBSD/alpha).
71  */
72 /*
73  * PAE support
74  * Michael Shalayeff <mickey@lucifier.net>
75  *
76  * This module implements PAE mode for i386.
77  *
78  */
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/atomic.h>
83 #include <sys/pool.h>
84 #include <sys/user.h>
85 #include <sys/mutex.h>
86 
87 #include <uvm/uvm.h>
88 
89 #include <machine/specialreg.h>
90 
91 #include <dev/isa/isareg.h>
92 #include <i386/isa/isa_machdep.h>
93 
94 #include "ksyms.h"
95 
96 /* #define PMAPAE_DEBUG */
97 
98 #ifdef PMAPAE_DEBUG
99 #define DPRINTF(x...)	do { printf(x); } while(0)
100 #else
101 #define DPRINTF(x...)
102 #endif	/* PMAPAE_DEBUG */
103 
104 /*
105  * this file contains the code for the "pmap module."   the module's
106  * job is to manage the hardware's virtual to physical address mappings.
107  * note that there are two levels of mapping in the VM system:
108  *
109  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
110  *      to map ranges of virtual address space to objects/files.  for
111  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
112  *      to the file /bin/ls starting at offset zero."   note that
113  *      the upper layer mapping is not concerned with how individual
114  *      vm_pages are mapped.
115  *
116  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
117  *      from virtual addresses.   it is concerned with which vm_page is
118  *      mapped where.   for example, when you run /bin/ls and start
119  *      at page 0x1000 the fault routine may lookup the correct page
120  *      of the /bin/ls file and then ask the pmap layer to establish
121  *      a mapping for it.
122  *
123  * note that information in the lower layer of the VM system can be
124  * thrown away since it can easily be reconstructed from the info
125  * in the upper layer.
126  *
127  * data structures we use include:
128  *
129  *  - struct pmap: describes the address space of one thread
130  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
131  *  - struct pv_head: there is one pv_head per managed page of
132  *	physical memory.   the pv_head points to a list of pv_entry
133  *	structures which describe all the <PMAP,VA> pairs that this
134  *      page is mapped in.    this is critical for page based operations
135  *      such as pmap_page_protect() [change protection on _all_ mappings
136  *      of a page]
137  */
138 /*
139  * i386 PAE hardware Page Tables structure:
140  *
141  * the i386 PAE Page Table is a three-level PT which maps 4GB of VA.
142  * the pagesize is 4K (4096 [0x1000] bytes) or 2MB.
143  *
144  * the first level table is the "page directory pointer table" (PDPT); it
145  * consists of 4 page directory pointer table entries (PDPTEs), each 64 bits.
146  *
147  * the second level table is called a "page directory" and it contains
148  * 512 page directory entries (PDEs).   each PDE is
149  * 8 bytes (a long long), so a PD fits in a single 4K page.   this page is
150  * the page directory page (PDP).  each PDP maps 1GB of space
151  * (512 PDEs * 2MB = 1GB).   a PDE contains the physical address of the
152  * next level table: the page table.   or, if 2MB pages are being used,
153  * then the PDE contains the PA of the 2MB page being mapped.
154  *
155  * a page table consists of 512 page table entries (PTEs).  each PTE is
156  * 8 bytes (a long long), so a page table also fits in a single 4K page.
157  * a 4K page being used as a page table is called a page table page (PTP).
158  * each PTE in a PTP maps one 4K page (512 * 4K = 2MB).   a PTE contains
159  * the physical address of the page it maps and some flag bits (described
160  * below).
161  *
162  * the processor has a special register, "cr3", which points to the
163  * the PDP which is currently controlling the mappings of the virtual
164  * address space.
165  *
166  * the following picture shows the translation process for a 4K page:
167  *
168  * %cr3 register [PA of PDPT]
169  *  |
170  *  |  bits <31-30> of VA
171  *  |  index the PDPTE (0-3)
172  *  |        |
173  *  v        v
174  *  +-----------+
175  *  |  PDP Ptr  |
176  *  | 4 entries |
177  *  +-----------+
178  *       |
179  *    PA of PDP
180  *       |
181  *       |
182  *       |  bits <29-21> of VA       bits <20-12> of VA   bits <11-0>
183  *       |  index the PDP (0 - 511)  index the PTP        are the page offset
184  *       |        |                         |                    |
185  *       |        v                         |                    |
186  *       +-->+---------+                    |                    |
187  *           | PD Page |    PA of           v                    |
188  *           |         |-----PTP----->+------------+             |
189  *           | 512 PDE |              | page table |--PTE--+     |
190  *           | entries |              | (aka PTP)  |       |     |
191  *           +---------+              |  512 PTE   |       |     |
192  *                                    |  entries   |       |     |
193  *                                    +------------+       |     |
194  *                                                         |     |
195  *                                              bits <35-12>   bits <11-0>
196  *                                               p h y s i c a l  a d d r
197  *
198  * the i386 caches PTEs in a TLB.   it is important to flush out old
199  * TLB mappings when making a change to a mapping.   writing to the
200  * %cr3 will flush the entire TLB.    newer processors also have an
201  * instruction that will invalidate the mapping of a single page (which
202  * is useful if you are changing a single mapping because it preserves
203  * all the cached TLB entries).
204  *
205  * as shown above, bits 35-12 of the PTE contain the PA of the page being mapped.
206  * the rest of the PTE is defined as follows:
207  *   bit#	name	use
208  *   63		NX	no-execute bit (1=no instruction fetch), optional
209  *   11		n/a	available for OS use, hardware ignores it
210  *   10		n/a	available for OS use, hardware ignores it
211  *   9		n/a	available for OS use, hardware ignores it
212  *   8		G	global bit (see discussion below)
213  *   7		PS	page size [for PDEs] (0=4k, 1=2M <if supported>)
214  *   6		D	dirty (modified) page
215  *   5		A	accessed (referenced) page
216  *   4		PCD	cache disable
217  *   3		PWT	prevent write through (cache)
218  *   2		U/S	user/supervisor bit (0=supervisor only, 1=both u&s)
219  *   1		R/W	read/write bit (0=read only, 1=read-write)
220  *   0		P	present (valid)
221  *
222  * notes:
223  *  - on the i386 the R/W bit is ignored if processor is in supervisor
224  *    state (bug!)
225  *  - PS is only supported on newer processors
226  *  - PTEs with the G bit are global in the sense that they are not
227  *    flushed from the TLB when %cr3 is written (to flush, use the
228  *    "flush single page" instruction).   this is only supported on
229  *    newer processors.    this bit can be used to keep the kernel's
230  *    TLB entries around while context switching.   since the kernel
231  *    is mapped into all processes at the same place it does not make
232  *    sense to flush these entries when switching from one process'
233  *    pmap to another.
234  */
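/*
 * Worked example of the 2-9-9-12 translation described above (illustrative
 * only; the VA is arbitrary and not taken from this file):
 *
 *	VA 0xd0201234 = 1101 0000 0010 0000 0001 0010 0011 0100
 *	  bits <31-30> = 3     -> PDPTE index 3
 *	  bits <29-21> = 0x081 -> PDE index 129 within that page directory
 *	  bits <20-12> = 0x001 -> PTE index 1 within that PTP
 *	  bits <11-0>  = 0x234 -> byte offset within the 4K page
 */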
235 /*
236  * A pmap describes a process' 4GB virtual address space.  This
237  * virtual address space can be broken up into 2048 2MB regions which
238  * are described by PDEs in the PDP.  The PDEs are defined as follows:
239  *
240  * Ranges are inclusive -> exclusive, just like vm_map_entry start/end.
241  * The following assumes that KERNBASE is 0xd0000000.
242  *
243  * PDE#s	VA range		Usage
244  * 0->1660	0x0 -> 0xcf800000	user address space, note that the
245  *					max user address is 0xcfbfe000
246  *					the final two pages in the last 4MB
247  *					used to be reserved for the UAREA
248  *					but now are no longer used.
249  * 1660->1664	0xcf800000->		recursive mapping of PDP (used for
250  *			0xd0000000	linear mapping of PTPs).
251  * 1664->2044	0xd0000000->		kernel address space (constant
252  *			0xff800000	across all pmaps/processes).
253  * 2044->2048	0xff800000->		"alternate" recursive PDP mapping
254  *			<end>		(for other pmaps).
255  *
256  *
257  * Note: A recursive PDP mapping provides a way to map all the PTEs for
258  * a 4GB address space into a linear chunk of virtual memory.  In other
259  * words, the PTE for page 0 is the first 8b mapped into the 2MB recursive
260  * area.  The PTE for page 1 is the second 8b.  The very last 8b in the
261  * 2MB range is the PTE that maps VA 0xffffe000 (the last page in a 4GB
262  * address).
263  *
264  * All pmaps' PDs must have the same values in slots 1664->2043 so that
265  * the kernel is always mapped in every process.  These values are loaded
266  * into the PD at pmap creation time.
267  *
268  * At any one time only one pmap can be active on a processor.  This is
269  * the pmap whose PDP is pointed to by processor register %cr3.  This pmap
270  * will have all its PTEs mapped into memory at the recursive mapping
271  * point (slots #1660-3 as shown above).  When the pmap code wants to find the
272  * PTE for a virtual address, all it has to do is the following:
273  *
274  * Address of PTE = (1660 * 2MB) + (VA / NBPG) * sizeof(pt_entry_t)
275  *                = 0xcf800000 + (VA / 4096) * 8
276  *
277  * What happens if the pmap layer is asked to perform an operation
278  * on a pmap that is not the one which is currently active?  In that
279  * case we take the PA of the PDP of the non-active pmap and put it in
280  * slots 2044-7 of the active pmap.  This causes the non-active pmap's
281  * PTEs to get mapped in the final 8MB of the 4GB address space
282  * (i.e. starting at 0xff800000).
283  *
284  * The following figure shows the effects of the recursive PDP mapping:
285  *
286  *   PDP (%cr3->PDPTP)
287  *   +----+
288  *   |   0| -> PTP#0 that maps VA 0x0 -> 0x200000
289  *   |    |
290  *   |    |
291  *   |1660| -> points back to PDP (%cr3) mapping VA 0xcf800000 -> 0xd0000000
292  *   |1661|    (PDP is 4 pages)
293  *   |1662|
294  *   |1663|
295  *   |1664| -> first kernel PTP (maps 0xd0000000 -> 0xd0200000)
296  *   |    |
297  *   |2044| -> points to alternate pmap's PDP (maps 0xff800000 -> end)
298  *   |2045|
299  *   |2046|
300  *   |2047|
301  *   +----+
302  *
303  * Note that the VA mapped by PDE#1660 (0xcf800000) is defined as "PTE_BASE".
304  * Note that the VA mapped by PDE#2044 (0xff800000) is defined as "APTE_BASE".
305  *
306  * Starting at VA 0xcf800000 the currently active PDP (%cr3) acts as a
307  * set of four consecutively mapped PTPs:
308  *
309  * PTP#1660-3 == PDP(%cr3) => maps VA 0xcf800000 -> 0xd0000000
310  *   +----+
311  *   |   0| -> maps the contents of PTP#0 at VA 0xcf800000->0xcf801000
312  *   |    |
313  *   |    |
314  *   |1660| -> maps the contents of PTP#1660 (the PDP) at VA 0xcfe7c000
315  *   |1661|
316  *   |1662|
317  *   |1663|
318  *   |1664| -> maps the contents of first kernel PTP
319  *   |    |
320  *   |2047|
321  *   +----+
322  *
323  * Note that mapping of the PDP at PTP#1660's VA (0xcfe7c000) is
324  * defined as "PDP_BASE".... within that mapping there are two
325  * defines:
326  *   "PDP_PDE" (0xcfe7f3e0) is the VA of the PDE in the PDP
327  *      which points back to itself.
328  *   "APDP_PDE" (0xfff02fe0) is the VA of the PDE in the PDP which
329  *      establishes the recursive mapping of the alternate pmap.
330  *      To set the alternate PDP, one just has to put the correct
331  *	PA info in *APDP_PDE.
332  *
333  * Note that in the APTE_BASE space, the APDP appears at VA
334  * "APDP_BASE" (0xffffc000).
335  *
336  * unfortunately, we cannot use recursive PDPT from the page tables
337  * because cr3 is only 32 bits wide.
338  *
339  */
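/*
 * Worked example of the "Address of PTE" formula above (illustrative only,
 * for an arbitrary kernel VA):
 *
 *	VA = 0xd0000000 (KERNBASE)
 *	Address of PTE = 0xcf800000 + (0xd0000000 / 4096) * 8
 *	               = 0xcf800000 + 0xd0000 * 8 = 0xcfe80000
 *
 * which is exactly what the vtopte() macro defined below computes:
 *	vtopte(0xd0000000) == PTE_BASE + atop(0xd0000000)
 */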
340 #define PG_FRAME	0xffffff000ULL	/* page frame mask */
341 #define PG_LGFRAME	0xfffe00000ULL	/* large (2M) page frame mask */
342 
343 /*
344  * Redefine the PDSHIFT and NBPD macros for PAE
345  */
346 #undef PDSHIFT
347 #define PDSHIFT		21		/* page directory address shift */
348 #undef NBPD
349 #define NBPD		(1U << PDSHIFT)	/* # bytes mapped by PD (2MB) */
350 
351 #define PDSHIFT86	22		/* for pmap86 transfer */
352 
353 #undef PDSLOT_PTE
354 #define PDSLOT_PTE	(1660U)	/* 1660: for recursive PDP map */
355 #undef PDSLOT_KERN
356 #define PDSLOT_KERN	(1664U)	/* 1664: start of kernel space */
357 #undef PDSLOT_APTE
358 #define PDSLOT_APTE	(2044U)	/* 2044: alternative recursive slot */
359 
360 /*
361  * The following defines give the virtual addresses of various MMU
362  * data structures:
363  * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
364  * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
365  */
366 #define PTE_BASE	((pt_entry_t *) (PDSLOT_PTE * NBPD))
367 #define APTE_BASE	((pt_entry_t *) (PDSLOT_APTE * NBPD))
368 #define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * NBPG)))
369 #define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * NBPG)))
370 #define PDP_PDE		(PDP_BASE + PDSLOT_PTE)
371 #define APDP_PDE	(PDP_BASE + PDSLOT_APTE)
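/*
 * With the slot numbers above, these macros work out to the values quoted
 * in the big comment block earlier in this file (illustrative arithmetic):
 *
 *	PTE_BASE  == (pt_entry_t *)(1660 * 2MB)       == 0xcf800000
 *	APTE_BASE == (pt_entry_t *)(2044 * 2MB)       == 0xff800000
 *	PDP_BASE  == 0xcf800000 + 1660 * 4096         == 0xcfe7c000
 *	APDP_BASE == 0xff800000 + 2044 * 4096         == 0xffffc000
 *	PDP_PDE   == (pd_entry_t *)0xcfe7c000 + 1660  == 0xcfe7f3e0
 *	APDP_PDE  == (pd_entry_t *)0xcfe7c000 + 2044  == 0xcfe7ffe0
 */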
372 
373 /*
374  * pdei/ptei: generate index into PDP/PTP from a VA
375  */
376 #define PD_MASK		0xffe00000	/* page directory address bits */
377 #define PT_MASK		0x001ff000	/* page table address bits */
378 #define pdei(VA)	(((VA) & PD_MASK) >> PDSHIFT)
379 #define ptei(VA)	(((VA) & PT_MASK) >> PGSHIFT)
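/*
 * Note that PD_MASK covers VA bits <31-21>, so pdei() yields a combined
 * index in the range 0-2047 that spans all four page directory pages; that
 * is how PDEs are addressed throughout this file (e.g. PDSLOT_KERN == 1664).
 * Example (illustrative VA):
 *
 *	pdei(0xd0201234) == 0xd0200000 >> 21 == 1665
 *	ptei(0xd0201234) == 0x00001000 >> 12 == 1
 */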
380 
381 #define PD_MASK86	0xffc00000	/* for pmap86 transfer */
382 #define PT_MASK86	0x003ff000	/* for pmap86 transfer */
383 
384 /*
385  * Mach derived conversion macros
386  */
387 #define i386_round_pdr(x)	((((unsigned)(x)) + ~PD_MASK) & PD_MASK)
388 
389 /*
390  * various address macros
391  *
392  *  vtopte: return a pointer to the PTE mapping a VA
393  */
394 #define vtopte(VA)	(PTE_BASE + atop((vaddr_t)VA))
395 
396 /*
397  * PTP macros:
398  *   A PTP's index is the PD index of the PDE that points to it.
399  *   A PTP's offset is the byte-offset in the PTE space that this PTP is at.
400  *   A PTP's VA is the first VA mapped by that PTP.
401  *
402  * Note that NBPG == number of bytes in a PTP (4096 bytes == 512 entries)
403  *           NBPD == number of bytes a PTP can map (2MB)
404  */
405 
406 #define ptp_i2o(I)	((I) * NBPG)	/* index => offset */
407 #define ptp_o2i(O)	((O) / NBPG)	/* offset => index */
408 #define ptp_i2v(I)	((I) * NBPD)	/* index => VA */
409 #define ptp_v2i(V)	((V) / NBPD)	/* VA => index (same as pdei) */
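/*
 * Example (illustrative, assuming KERNBASE == 0xd0000000 as in the layout
 * comment above): the first kernel PTP lives at PD index PDSLOT_KERN, so
 *
 *	ptp_i2v(1664) == 1664 * NBPD == 0xd0000000
 *	ptp_i2o(1664) == 1664 * NBPG == 0x680000
 *	ptp_v2i(0xd0000000) == 1664
 */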
410 
411 /*
412  * Access PD and PT
413  */
414 #define PDE(pm,i)	(((pd_entry_t *)(pm)->pm_pdir)[(i)])
415 
416 /*
417  * here we define the data types for PDEs and PTEs for PAE
418  */
419 typedef u_int64_t pd_entry_t;		/* PDE */
420 typedef u_int64_t pt_entry_t;		/* PTE */
421 
422 #define PG_NX	0x8000000000000000ULL	/* execute-disable */
423 
424 /*
425  * Number of PTEs per cache line. 8 byte pte, 64-byte cache line
426  * Used to avoid false sharing of cache lines.
427  */
428 #define NPTECL			8
429 
430 /*
431  * other data structures
432  */
433 
434 extern u_int32_t protection_codes[];	/* maps MI prot to i386 prot code */
435 extern int pmap_initialized;	/* pmap_init done yet? */
436 
437 /* Segment boundaries */
438 extern vaddr_t kernel_text, etext, __rodata_start, erodata, __data_start;
439 extern vaddr_t edata, __bss_start, end, ssym, esym, PTmap;
440 
441 /*
442  * MULTIPROCESSOR: special VAs/ PTEs are actually allocated inside a
443  * MAXCPUS*NPTECL array of PTEs, to avoid cache line thrashing
444  * due to false sharing.
445  */
446 
447 #ifdef MULTIPROCESSOR
448 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
449 #define VASLEW(va,id) ((va)+(id)*NPTECL*NBPG)
450 #else
451 #define PTESLEW(pte, id) (pte)
452 #define VASLEW(va,id) (va)
453 #endif
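/*
 * Example (illustrative): with NPTECL == 8 and 8-byte PAE PTEs, CPU n's
 * slot for e.g. zero_pte is PTESLEW(zero_pte, n) == zero_pte + n*8, so each
 * CPU's group of special PTEs begins on its own 64-byte cache line, and
 * VASLEW(pmap_zerop, n) advances the corresponding VA by n*8 pages.
 */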
454 
455 /*
456  * special VAs and the PTEs that map them
457  */
458 
459 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *flsh_pte;
460 extern caddr_t pmap_csrcp, pmap_cdstp, pmap_zerop, pmap_ptpp, pmap_flshp;
461 
462 extern int pmap_pg_g;
463 extern int pmap_pg_wc;
464 extern struct pmap_head pmaps;
465 extern struct mutex pmaps_lock;
466 
467 extern uint32_t	cpu_meltdown;
468 
469 /*
470  * local prototypes
471  */
472 struct vm_page	*pmap_alloc_ptp_pae(struct pmap *, int, pt_entry_t);
473 struct vm_page	*pmap_get_ptp_pae(struct pmap *, int);
474 void		 pmap_drop_ptp_pae(struct pmap *, vaddr_t, struct vm_page *,
475     pt_entry_t *);
476 pt_entry_t	*pmap_map_ptes_pae(struct pmap *);
477 void		 pmap_unmap_ptes_pae(struct pmap *);
478 void		 pmap_do_remove_pae(struct pmap *, vaddr_t, vaddr_t, int);
479 void		 pmap_remove_ptes_pae(struct pmap *, struct vm_page *,
480 		     vaddr_t, vaddr_t, vaddr_t, int, struct pv_entry **);
481 void		 pmap_sync_flags_pte_pae(struct vm_page *, pt_entry_t);
482 
483 static __inline u_int
484 pmap_pte2flags(pt_entry_t pte)
485 {
486 	return (((pte & PG_U) ? PG_PMAP_REF : 0) |
487 	    ((pte & PG_M) ? PG_PMAP_MOD : 0));
488 }
489 
490 void
491 pmap_sync_flags_pte_pae(struct vm_page *pg, pt_entry_t pte)
492 {
493 	if (pte & (PG_U|PG_M)) {
494 		atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
495 	}
496 }
497 
498 /*
499  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
500  *
501  * => we lock enough pmaps to keep things locked in
502  * => must be undone with pmap_unmap_ptes before returning
503  */
504 
505 pt_entry_t *
506 pmap_map_ptes_pae(struct pmap *pmap)
507 {
508 	pd_entry_t opde;
509 
510 	/* the kernel's pmap is always accessible */
511 	if (pmap == pmap_kernel()) {
512 		return(PTE_BASE);
513 	}
514 
515 	mtx_enter(&pmap->pm_mtx);
516 
517 	/* if curpmap then we are always mapped */
518 	if (pmap_is_curpmap(pmap)) {
519 		return(PTE_BASE);
520 	}
521 
522 	mtx_enter(&curcpu()->ci_curpmap->pm_apte_mtx);
523 
524 	/* need to load a new alternate pt space into curpmap? */
525 	opde = *APDP_PDE;
526 #if defined(MULTIPROCESSOR) && defined(DIAGNOSTIC)
527 	if (pmap_valid_entry(opde))
528 		panic("pmap_map_ptes_pae: APTE valid");
529 #endif
530 	if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdidx[0]) {
531 		APDP_PDE[0] = pmap->pm_pdidx[0] | PG_RW | PG_V | PG_U | PG_M;
532 		APDP_PDE[1] = pmap->pm_pdidx[1] | PG_RW | PG_V | PG_U | PG_M;
533 		APDP_PDE[2] = pmap->pm_pdidx[2] | PG_RW | PG_V | PG_U | PG_M;
534 		APDP_PDE[3] = pmap->pm_pdidx[3] | PG_RW | PG_V | PG_U | PG_M;
535 		if (pmap_valid_entry(opde))
536 			pmap_apte_flush();
537 	}
538 	return(APTE_BASE);
539 }
540 
541 /*
542  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
543  */
544 
545 void
546 pmap_unmap_ptes_pae(struct pmap *pmap)
547 {
548 	if (pmap == pmap_kernel())
549 		return;
550 
551 	if (!pmap_is_curpmap(pmap)) {
552 #if defined(MULTIPROCESSOR)
553 		APDP_PDE[0] = 0;
554 		APDP_PDE[1] = 0;
555 		APDP_PDE[2] = 0;
556 		APDP_PDE[3] = 0;
557 		pmap_apte_flush();
558 #endif
559 		mtx_leave(&curcpu()->ci_curpmap->pm_apte_mtx);
560 	}
561 
562 	mtx_leave(&pmap->pm_mtx);
563 }
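
/*
 * Illustrative sketch (kept under #if 0, never compiled) of the calling
 * pattern the two functions above expect.  "example_va_is_mapped" is a
 * hypothetical helper, not part of the pmap API; pmap_extract_pae() below
 * is the real in-tree user of this pattern.
 */
#if 0
static int
example_va_is_mapped(struct pmap *pmap, vaddr_t va)
{
	pt_entry_t *ptes;
	int valid = 0;

	ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */
	if (pmap_valid_entry(PDE(pmap, pdei(va))))
		valid = pmap_valid_entry(ptes[atop(va)]);
	pmap_unmap_ptes_pae(pmap);		/* unlocks pmap */
	return (valid);
}
#endif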
564 
565 u_int32_t
566 pmap_pte_set_pae(vaddr_t va, paddr_t pa, u_int32_t bits)
567 {
568 	pt_entry_t pte, *ptep = vtopte(va);
569 	uint64_t nx;
570 
571 	pa &= PMAP_PA_MASK;
572 
573 	if (bits & PG_X)
574 		nx = 0;
575 	else
576 		nx = PG_NX;
577 
578 	pte = i386_atomic_testset_uq(ptep, pa | bits | nx);  /* zap! */
579 	return (pte & ~PG_FRAME);
580 }
581 
582 u_int32_t
583 pmap_pte_setbits_pae(vaddr_t va, u_int32_t set, u_int32_t clr)
584 {
585 	pt_entry_t *ptep = vtopte(va);
586 	pt_entry_t pte = *ptep;
587 
588 	i386_atomic_testset_uq(ptep, (pte | set) & ~(pt_entry_t)clr);
589 	return (pte & ~PG_FRAME);
590 }
591 
592 u_int32_t
593 pmap_pte_bits_pae(vaddr_t va)
594 {
595 	pt_entry_t *ptep = vtopte(va);
596 
597 	return (*ptep & ~PG_FRAME);
598 }
599 
600 paddr_t
601 pmap_pte_paddr_pae(vaddr_t va)
602 {
603 	pt_entry_t *ptep = vtopte(va);
604 
605 	return (*ptep & PG_FRAME);
606 }
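
/*
 * Taken together the accessors above split a PTE along PG_FRAME, so for any
 * mapped kernel va the following identity holds (illustrative, not compiled):
 *
 *	*vtopte(va) == pmap_pte_paddr_pae(va) | pmap_pte_bits_pae(va)
 */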
607 
608 /*
609  * Allocate a new PD for Intel's U-K.
610  */
611 void
612 pmap_alloc_pdir_intel_pae(struct pmap *pmap)
613 {
614 	vaddr_t		 va;
615 	int		 i;
616 
617 	KASSERT(pmap->pm_pdir_intel == 0);
618 
619 	va = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_zero, &kd_waitok);
620 	if (va == 0)
621 		panic("kernel_map out of virtual space");
622 	pmap->pm_pdir_intel = va;
623 	if (!pmap_extract(pmap_kernel(), (vaddr_t)&pmap->pm_pdidx_intel,
624 	    &pmap->pm_pdirpa_intel))
625 		panic("can't locate PDPT");
626 
627 	for (i = 0; i < 4; i++) {
628 		pmap->pm_pdidx_intel[i] = 0;
629 		if (!pmap_extract(pmap, va + i * NBPG,
630 		    (paddr_t *)&pmap->pm_pdidx_intel[i]))
631 			panic("can't locate PD page");
632 
633 		pmap->pm_pdidx_intel[i] |= PG_V;
634 
635 		DPRINTF("%s: pm_pdidx_intel[%d] = 0x%llx\n", __func__,
636 		    i, pmap->pm_pdidx_intel[i]);
637 	}
638 }
639 
640 /*
641  * Switch over to PAE page tables
642  */
643 void
644 pmap_bootstrap_pae(void)
645 {
646 	extern int nkpde;
647 	struct pmap *kpm = pmap_kernel();
648 	struct vm_page *ptp;
649 	paddr_t ptaddr;
650 	u_int32_t bits, *pd = NULL;
651 	vaddr_t va, eva;
652 	pt_entry_t pte;
653 
654 	if ((cpu_feature & CPUID_PAE) == 0 ||
655 	    (ecpu_feature & CPUID_NXE) == 0)
656 		return;
657 
658 	cpu_pae = 1;
659 
660 	DPRINTF("%s: pm_pdir 0x%x pm_pdirpa 0x%x pm_pdirsize %d\n", __func__,
661 	    (uint32_t)kpm->pm_pdir, (uint32_t)kpm->pm_pdirpa,
662 	    kpm->pm_pdirsize);
663 
664 	va = (vaddr_t)kpm->pm_pdir;
665 	kpm->pm_pdidx[0] = (va + 0*NBPG - KERNBASE) | PG_V;
666 	kpm->pm_pdidx[1] = (va + 1*NBPG - KERNBASE) | PG_V;
667 	kpm->pm_pdidx[2] = (va + 2*NBPG - KERNBASE) | PG_V;
668 	kpm->pm_pdidx[3] = (va + 3*NBPG - KERNBASE) | PG_V;
669 	/* map pde recursively into itself */
670 	PDE(kpm, PDSLOT_PTE+0) = kpm->pm_pdidx[0] | PG_KW | PG_M | PG_U;
671 	PDE(kpm, PDSLOT_PTE+1) = kpm->pm_pdidx[1] | PG_KW | PG_M | PG_U;
672 	PDE(kpm, PDSLOT_PTE+2) = kpm->pm_pdidx[2] | PG_KW | PG_M | PG_U;
673 	PDE(kpm, PDSLOT_PTE+3) = kpm->pm_pdidx[3] | PG_KW | PG_M | PG_U;
674 
675 	/* allocate new special PD before transferring all mappings. */
676 	if (kpm->pm_pdir_intel) {
677 		pd = (uint32_t *)kpm->pm_pdir_intel;
678 		kpm->pm_pdir_intel = kpm->pm_pdirpa_intel = 0;
679 		pmap_alloc_pdir_intel_pae(kpm);
680 	}
681 
682 	/* transfer all kernel mappings over into pae tables */
683 	for (va = KERNBASE, eva = va + (nkpde << PDSHIFT86);
684 	    va < eva; va += PAGE_SIZE) {
685 		if (!pmap_valid_entry(PDE(kpm, pdei(va)))) {
686 			ptp = uvm_pagealloc(&kpm->pm_obj, va, NULL,
687 			    UVM_PGA_ZERO);
688 			if (ptp == NULL)
689 				panic("%s: uvm_pagealloc() failed", __func__);
690 			ptaddr = VM_PAGE_TO_PHYS(ptp);
691 			PDE(kpm, pdei(va)) = ptaddr | PG_KW | PG_V |
692 			    PG_U | PG_M;
693 			pmap_pte_set_86((vaddr_t)vtopte(va),
694 			    ptaddr, PG_KW | PG_V | PG_U | PG_M);
695 
696 			/* count PTP as resident */
697 			kpm->pm_stats.resident_count++;
698 		}
699 		bits = pmap_pte_bits_86(va) | pmap_pg_g;
700 
701 		/*
702 		 * At this point, ideally only kernel text should be executable.
703 		 * However, we need to leave the ISA hole executable to handle
704 		 * bios32, pcibios, and apmbios calls that may potentially
705 		 * happen later since we don't know (yet) which of those may be
706 		 * in use. Later (in biosattach), we will reset the permissions
707 		 * according to what we actually need.
708 		 */
709 		if ((va >= (vaddr_t)&kernel_text && va <= (vaddr_t)&etext) ||
710 		    (va >= (vaddr_t)atdevbase && va <=
711 		     (vaddr_t)(atdevbase + IOM_SIZE)))
712 			bits |= PG_X;
713 		else
714 			bits &= ~PG_X;
715 
716 		if (pmap_valid_entry(bits))
717 			pmap_pte_set_pae(va, pmap_pte_paddr_86(va), bits);
718 	}
719 
720 	/* Transfer special mappings */
721 	if (pd) {
722 		uint32_t	*ptp;
723 		uint32_t	 l1idx, l2idx;
724 		paddr_t		 npa;
725 		struct vm_page	*ptppg;
726 
727 		for (va = KERNBASE, eva = va + (nkpde << PDSHIFT86); va < eva;
728 		    va += PAGE_SIZE) {
729 			l1idx = ((va & PT_MASK86) >> PGSHIFT);
730 			l2idx = ((va & PD_MASK86) >> PDSHIFT86);
731 
732 			if (!pmap_valid_entry(pd[l2idx]))
733 				continue;
734 
735 			npa = pd[l2idx]	& PMAP_PA_MASK;
736 			ptppg = PHYS_TO_VM_PAGE(npa);
737 			mtx_enter(&ptppg->mdpage.pv_mtx);
738 
739 			/* still running on pmap86 */
740 			ptp = (uint32_t *)pmap_tmpmap_pa_86(npa);
741 
742 			if (!pmap_valid_entry(ptp[l1idx])) {
743 				mtx_leave(&ptppg->mdpage.pv_mtx);
744 				pmap_tmpunmap_pa_86();
745 				continue;
746 			}
747 			DPRINTF("%s: va 0x%x l2idx %u 0x%x l1idx %u 0x%x\n",
748 			    __func__, (uint32_t)va, l2idx, (uint32_t)pd[l2idx],
749 			    l1idx, (uint32_t)ptp[l1idx]);
750 
751 			/* protection and cacheability */
752 			bits = ptp[l1idx] & (PG_PROT|PG_N|PG_WT);
753 			npa = ptp[l1idx] & PMAP_PA_MASK;
754 
755 			/* still running on pmap86 */
756 			pmap_tmpunmap_pa_86();
757 			mtx_leave(&ptppg->mdpage.pv_mtx);
758 
759 			/* enforce use of pmap86 */
760 			cpu_pae = 0;
761 			pmap_enter_special_pae(va, npa, 0, bits);
762 			cpu_pae = 1;
763 
764 			if (--ptppg->wire_count == 1) {
765 				ptppg->wire_count = 0;
766 				uvm_pagerealloc(ptppg, NULL, 0);
767 				DPRINTF("%s: freeing PT page 0x%x\n", __func__,
768 				    (uint32_t)VM_PAGE_TO_PHYS(ptppg));
769 			}
770 		}
771 		km_free(pd, NBPG, &kv_any, &kp_dirty);
772 		DPRINTF("%s: freeing PDP 0x%x\n", __func__, (uint32_t)pd);
773 	}
774 
775 	if (!cpu_paenable(&kpm->pm_pdidx[0])) {
776 		extern struct user *proc0paddr;
777 
778 		proc0paddr->u_pcb.pcb_cr3 = kpm->pm_pdirpa =
779 		    (vaddr_t)kpm - KERNBASE;
780 		kpm->pm_pdirsize = 4 * NBPG;
781 
782 		/* Reset cr3 for NMI task switch */
783 		cpu_update_nmi_cr3(kpm->pm_pdirpa);
784 
785 		DPRINTF("%s: pm_pdir 0x%x pm_pdirpa 0x%x pm_pdirsize %d\n",
786 		    __func__, (uint32_t)kpm->pm_pdir, (uint32_t)kpm->pm_pdirpa,
787 		    kpm->pm_pdirsize);
788 
789 		csrc_pte = vtopte(pmap_csrcp);
790 		cdst_pte = vtopte(pmap_cdstp);
791 		zero_pte = vtopte(pmap_zerop);
792 		ptp_pte = vtopte(pmap_ptpp);
793 		flsh_pte = vtopte(pmap_flshp);
794 
795 		nkpde *= 2;
796 		nkptp_max = 2048 - PDSLOT_KERN - 4;
797 
798 		pmap_pte_set_p = pmap_pte_set_pae;
799 		pmap_pte_setbits_p = pmap_pte_setbits_pae;
800 		pmap_pte_bits_p = pmap_pte_bits_pae;
801 		pmap_pte_paddr_p = pmap_pte_paddr_pae;
802 		pmap_clear_attrs_p = pmap_clear_attrs_pae;
803 		pmap_enter_p = pmap_enter_pae;
804 		pmap_enter_special_p = pmap_enter_special_pae;
805 		pmap_extract_p = pmap_extract_pae;
806 		pmap_growkernel_p = pmap_growkernel_pae;
807 		pmap_page_remove_p = pmap_page_remove_pae;
808 		pmap_do_remove_p = pmap_do_remove_pae;
809 		pmap_test_attrs_p = pmap_test_attrs_pae;
810 		pmap_unwire_p = pmap_unwire_pae;
811 		pmap_write_protect_p = pmap_write_protect_pae;
812 		pmap_pinit_pd_p = pmap_pinit_pd_pae;
813 		pmap_zero_phys_p = pmap_zero_phys_pae;
814 		pmap_copy_page_p = pmap_copy_page_pae;
815 
816 		bzero((void *)kpm->pm_pdir + 8, (PDSLOT_PTE-1) * 8);
817 		/* TODO also reclaim old PDPs */
818 	}
819 
820 	/* Set region permissions */
821 	for (va = (vaddr_t)&PTmap; va < KERNBASE; va += NBPD) {
822 		pte = PDE(kpm, pdei(va));
823 		PDE(kpm, pdei(va)) = pte | PG_NX;
824 	}
825 
826 	va = (vaddr_t)APTE_BASE;
827 	pte = PDE(kpm, pdei(va));
828 	PDE(kpm, pdei(va)) = pte | PG_NX;
829 
830 	pmap_write_protect(kpm, (vaddr_t)&kernel_text, (vaddr_t)&etext,
831 	    PROT_READ | PROT_EXEC);
832 	pmap_write_protect(kpm, (vaddr_t)&__rodata_start,
833 	    (vaddr_t)&erodata, PROT_READ);
834 	pmap_write_protect(kpm, (vaddr_t)&__data_start, (vaddr_t)&edata,
835 	    PROT_READ | PROT_WRITE);
836 	pmap_write_protect(kpm, (vaddr_t)&__bss_start, (vaddr_t)&end,
837 	    PROT_READ | PROT_WRITE);
838 
839 #if defined(DDB) || NKSYMS > 0
840 	pmap_write_protect(kpm, ssym, esym, PROT_READ);
841 #endif
842 }
843 
844 /*
845  * p t p   f u n c t i o n s
846  */
847 
848 /*
849  * pmap_alloc_ptp: allocate a PTP for a PMAP
850  *
851  * => pmap should already be locked by caller
852  * => we use the ptp's wire_count to count the number of active mappings
853  *	in the PTP (we start it at one to prevent any chance this PTP
854  *	will ever leak onto the active/inactive queues)
855  * => we should not be holding any pv_head locks (in case we are forced
856  *	to call pmap_steal_ptp())
857  * => we may need to lock pv_head's if we have to steal a PTP
858  */
859 
860 struct vm_page *
861 pmap_alloc_ptp_pae(struct pmap *pmap, int pde_index, pt_entry_t pde_flags)
862 {
863 	struct vm_page *ptp;
864 	pd_entry_t *pva_intel;
865 
866 	ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
867 			    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
868 	if (ptp == NULL)
869 		return (NULL);
870 
871 	/* got one! */
872 	atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
873 	ptp->wire_count = 1;	/* no mappings yet */
874 	PDE(pmap, pde_index) = (pd_entry_t)(VM_PAGE_TO_PHYS(ptp) |
875 	    PG_RW | PG_V | PG_M | PG_U | pde_flags);
876 
877 	/*
878 	 * Meltdown special case - if we are adding a new PDE for
879 	 * usermode addresses, just copy the PDE to the U-K
880 	 * table.
881 	 */
882 	if (pmap->pm_pdir_intel && ptp_i2v(pde_index) < VM_MAXUSER_ADDRESS) {
883 		pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
884 		pva_intel[pde_index] = PDE(pmap, pde_index);
885 		DPRINTF("%s: copying usermode PDE (content=0x%llx) pde_index "
886 		    "%d from 0x%llx -> 0x%llx\n", __func__,
887 		    PDE(pmap, pde_index), pde_index,
888 		    (uint64_t)&PDE(pmap, pde_index),
889 		    (uint64_t)&(pva_intel[pde_index]));
890 	}
891 
892 	pmap->pm_stats.resident_count++;	/* count PTP as resident */
893 	pmap->pm_ptphint = ptp;
894 	return(ptp);
895 }
896 
897 /*
898  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
899  *
900  * => pmap should NOT be pmap_kernel()
901  * => pmap should be locked
902  */
903 
904 struct vm_page *
905 pmap_get_ptp_pae(struct pmap *pmap, int pde_index)
906 {
907 	struct vm_page *ptp;
908 
909 	if (pmap_valid_entry(PDE(pmap, pde_index))) {
910 		/* valid... check hint (saves us a PA->PG lookup) */
911 		if (pmap->pm_ptphint &&
912 		    (PDE(pmap, pde_index) & PG_FRAME) ==
913 		    VM_PAGE_TO_PHYS(pmap->pm_ptphint))
914 			return(pmap->pm_ptphint);
915 
916 		ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
917 #ifdef DIAGNOSTIC
918 		if (ptp == NULL)
919 			panic("pmap_get_ptp_pae: unmanaged user PTP");
920 #endif
921 		pmap->pm_ptphint = ptp;
922 		return(ptp);
923 	}
924 
925 	/* allocate a new PTP (updates ptphint) */
926 	return (pmap_alloc_ptp_pae(pmap, pde_index, PG_u));
927 }
928 
929 void
930 pmap_drop_ptp_pae(struct pmap *pm, vaddr_t va, struct vm_page *ptp,
931     pt_entry_t *ptes)
932 {
933 	pd_entry_t *pva_intel;
934 
935 	i386_atomic_testset_uq(&PDE(pm, pdei(va)), 0);
936 	pmap_tlb_shootpage(curcpu()->ci_curpmap, ((vaddr_t)ptes) + ptp->offset);
937 #ifdef MULTIPROCESSOR
938 	/*
939 	 * Always shoot down the other pmap's
940 	 * self-mapping of the PTP.
941 	 */
942 	pmap_tlb_shootpage(pm, ((vaddr_t)PTE_BASE) + ptp->offset);
943 #endif
944 	pm->pm_stats.resident_count--;
945 	/* update hint */
946 	if (pm->pm_ptphint == ptp)
947 		pm->pm_ptphint = RBT_ROOT(uvm_objtree, &pm->pm_obj.memt);
948 	ptp->wire_count = 0;
949 	/* Postpone free to after shootdown. */
950 	uvm_pagerealloc(ptp, NULL, 0);
951 
952 	if (pm->pm_pdir_intel) {
953 		KASSERT(va < VM_MAXUSER_ADDRESS);
954 		/* Zap special meltdown PDE */
955 		pva_intel = (pd_entry_t *)pm->pm_pdir_intel;
956 		i386_atomic_testset_uq(&pva_intel[pdei(va)], 0);
957 		DPRINTF("%s: cleared meltdown PDE @ index %lu "
958 		    "(va range start 0x%x)\n", __func__, pdei(va),
959 		    (uint32_t)va);
960 	}
961 }
962 
963 /*
964  * pmap_pinit_pd: given a freshly allocated pmap structure, give it a PD
965  */
966 void
967 pmap_pinit_pd_pae(struct pmap *pmap)
968 {
969 	extern int nkpde;
970 	vaddr_t va;
971 	paddr_t pdidx[4];
972 
973 	/* allocate PDP */
974 	pmap->pm_pdir = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_dirty,
975 	    &kd_waitok);
976 	if (pmap->pm_pdir == 0)
977 		panic("kernel_map out of virtual space");
978 	/* page index is in the pmap! */
979 	pmap_extract(pmap_kernel(), (vaddr_t)pmap, &pmap->pm_pdirpa);
980 	va = (vaddr_t)pmap->pm_pdir;
981 	pmap_extract(pmap_kernel(), va + 0*NBPG, &pdidx[0]);
982 	pmap_extract(pmap_kernel(), va + 1*NBPG, &pdidx[1]);
983 	pmap_extract(pmap_kernel(), va + 2*NBPG, &pdidx[2]);
984 	pmap_extract(pmap_kernel(), va + 3*NBPG, &pdidx[3]);
985 	pmap->pm_pdidx[0] = (uint64_t)pdidx[0];
986 	pmap->pm_pdidx[1] = (uint64_t)pdidx[1];
987 	pmap->pm_pdidx[2] = (uint64_t)pdidx[2];
988 	pmap->pm_pdidx[3] = (uint64_t)pdidx[3];
989 	pmap->pm_pdidx[0] |= PG_V;
990 	pmap->pm_pdidx[1] |= PG_V;
991 	pmap->pm_pdidx[2] |= PG_V;
992 	pmap->pm_pdidx[3] |= PG_V;
993 	pmap->pm_pdirsize = 4 * NBPG;
994 
995 	/* init PDP */
996 	/* zero init area */
997 	bzero((void *)pmap->pm_pdir, PDSLOT_PTE * sizeof(pd_entry_t));
998 	/* put in recursive PDE to map the PTEs */
999 	PDE(pmap, PDSLOT_PTE+0) = pmap->pm_pdidx[0] | PG_KW | PG_U |
1000 	    PG_M | PG_V | PG_NX;
1001 	PDE(pmap, PDSLOT_PTE+1) = pmap->pm_pdidx[1] | PG_KW | PG_U |
1002 	    PG_M | PG_V | PG_NX;
1003 	PDE(pmap, PDSLOT_PTE+2) = pmap->pm_pdidx[2] | PG_KW | PG_U |
1004 	    PG_M | PG_V | PG_NX;
1005 	PDE(pmap, PDSLOT_PTE+3) = pmap->pm_pdidx[3] | PG_KW | PG_U |
1006 	    PG_M | PG_V | PG_NX;
1007 
1008 	/*
1009 	 * we need to lock pmaps_lock to prevent nkpde from changing on
1010 	 * us.   note that there is no need to splvm to protect us from
1011 	 * malloc since malloc allocates out of a submap and we should have
1012 	 * already allocated kernel PTPs to cover the range...
1013 	 */
1014 	/* put in kernel VM PDEs */
1015 	bcopy(&PDP_BASE[PDSLOT_KERN], &PDE(pmap, PDSLOT_KERN),
1016 	       nkpde * sizeof(pd_entry_t));
1017 	/* zero the rest */
1018 	bzero(&PDE(pmap, PDSLOT_KERN + nkpde), pmap->pm_pdirsize -
1019 	    ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
1020 
1021 	/*
1022 	 * Intel CPUs need a special page table to be used during usermode
1023 	 * execution, one that lacks all kernel mappings.
1024 	 */
1025 	if (cpu_meltdown) {
1026 		pmap_alloc_pdir_intel_pae(pmap);
1027 
1028 		/* Copy PDEs from pmap_kernel's U-K view */
1029 		bcopy((void *)pmap_kernel()->pm_pdir_intel,
1030 		    (void *)pmap->pm_pdir_intel, 4 * NBPG);
1031 
1032 		DPRINTF("%s: pmap %p pm_pdir 0x%lx pm_pdirpa 0x%lx "
1033 		    "pdir_intel 0x%lx pdirpa_intel 0x%lx\n",
1034 		    __func__, pmap, pmap->pm_pdir, pmap->pm_pdirpa,
1035 		    pmap->pm_pdir_intel, pmap->pm_pdirpa_intel);
1036 	}
1037 
1038 	mtx_enter(&pmaps_lock);
1039 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1040 	mtx_leave(&pmaps_lock);
1041 }
1042 
1043 /*
1044  * some misc. functions
1045  */
1046 
1047 /*
1048  * pmap_extract: extract a PA for the given VA
1049  */
1050 
1051 int
1052 pmap_extract_pae(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1053 {
1054 	pt_entry_t *ptes, pte;
1055 
1056 	ptes = pmap_map_ptes_pae(pmap);
1057 	if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1058 		pte = ptes[atop(va)];
1059 		pmap_unmap_ptes_pae(pmap);
1060 		if (!pmap_valid_entry(pte))
1061 			return 0;
1062 		if (pap != NULL)
1063 			*pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
1064 		return 1;
1065 	}
1066 	pmap_unmap_ptes_pae(pmap);
1067 	return 0;
1068 }
1069 
1070 extern void (*pagezero)(void *, size_t);
1071 
1072 /*
1073  * pmap_zero_phys: same as pmap_zero_page, but for use before vm_pages are
1074  * initialized.
1075  */
1076 void
1077 pmap_zero_phys_pae(paddr_t pa)
1078 {
1079 #ifdef MULTIPROCESSOR
1080 	int id = cpu_number();
1081 #endif
1082 	pt_entry_t *zpte = PTESLEW(zero_pte, id);
1083 	caddr_t zerova = VASLEW(pmap_zerop, id);
1084 
1085 #ifdef DIAGNOSTIC
1086 	if (*zpte)
1087 		panic("pmap_zero_phys_pae: lock botch");
1088 #endif
1089 
1090 	*zpte = (pa & PG_FRAME) | PG_V | PG_RW;	/* map in */
1091 	pmap_update_pg((vaddr_t)zerova);	/* flush TLB */
1092 	pagezero(zerova, PAGE_SIZE);		/* zero */
1093 	*zpte = 0;
1094 }
1095 
1096 /*
1097  * pmap_copy_page: copy a page
1098  */
1099 
1100 void
1101 pmap_copy_page_pae(struct vm_page *srcpg, struct vm_page *dstpg)
1102 {
1103 	paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg);
1104 	paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg);
1105 #ifdef MULTIPROCESSOR
1106 	int id = cpu_number();
1107 #endif
1108 	pt_entry_t *spte = PTESLEW(csrc_pte, id);
1109 	pt_entry_t *dpte = PTESLEW(cdst_pte, id);
1110 	caddr_t csrcva = VASLEW(pmap_csrcp, id);
1111 	caddr_t cdstva = VASLEW(pmap_cdstp, id);
1112 
1113 #ifdef DIAGNOSTIC
1114 	if (*spte || *dpte)
1115 		panic("pmap_copy_page_pae: lock botch");
1116 #endif
1117 
1118 	*spte = (srcpa & PG_FRAME) | PG_V | PG_RW;
1119 	*dpte = (dstpa & PG_FRAME) | PG_V | PG_RW;
1120 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1121 	bcopy(csrcva, cdstva, PAGE_SIZE);
1122 	*spte = *dpte = 0;
1123 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1124 }
1125 
1126 /*
1127  * p m a p   r e m o v e   f u n c t i o n s
1128  *
1129  * functions that remove mappings
1130  */
1131 
1132 /*
1133  * pmap_remove_ptes: remove PTEs from a PTP
1134  *
1135  * => caller must hold pmap's lock
1136  * => PTP must be mapped into KVA
1137  * => PTP should be null if pmap == pmap_kernel()
1138 */
1139 
1140 void
1141 pmap_remove_ptes_pae(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1142     vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1143 {
1144 	struct pv_entry *pve;
1145 	pt_entry_t *pte = (pt_entry_t *) ptpva;
1146 	struct vm_page *pg;
1147 	pt_entry_t opte;
1148 
1149 	/*
1150 	 * note that ptpva points to the PTE that maps startva.   this may
1151 	 * or may not be the first PTE in the PTP.
1152 	 *
1153 	 * we loop through the PTP while there are still PTEs to look at
1154 	 * and the wire_count is greater than 1 (because we use the wire_count
1155 	 * to keep track of the number of real PTEs in the PTP).
1156 	 */
1157 
1158 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1159 			     ; pte++, startva += NBPG) {
1160 		if (!pmap_valid_entry(*pte))
1161 			continue;			/* VA not mapped */
1162 
1163 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W))
1164 			continue;
1165 
1166 		/* atomically save the old PTE and zero it */
1167 		opte = i386_atomic_testset_uq(pte, 0);
1168 
1169 		if (opte & PG_W)
1170 			pmap->pm_stats.wired_count--;
1171 		pmap->pm_stats.resident_count--;
1172 
1173 		if (ptp)
1174 			ptp->wire_count--;		/* dropping a PTE */
1175 
1176 		/*
1177 		 * Unnecessary work if not PG_PVLIST.
1178 		 */
1179 		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1180 
1181 		/*
1182 		 * if we are not on a pv list we are done.
1183 		 */
1184 		if ((opte & PG_PVLIST) == 0) {
1185 #ifdef DIAGNOSTIC
1186 			if (pg != NULL)
1187 				panic("pmap_remove_ptes_pae: managed page "
1188 				     "without PG_PVLIST for 0x%lx", startva);
1189 #endif
1190 			continue;
1191 		}
1192 
1193 #ifdef DIAGNOSTIC
1194 		if (pg == NULL)
1195 			panic("pmap_remove_ptes_pae: unmanaged page marked "
1196 			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
1197 			      startva, (u_long)(opte & PG_FRAME));
1198 #endif
1199 
1200 		/* sync R/M bits */
1201 		pmap_sync_flags_pte_pae(pg, opte);
1202 		pve = pmap_remove_pv(pg, pmap, startva);
1203 		if (pve) {
1204 			pve->pv_next = *free_pvs;
1205 			*free_pvs = pve;
1206 		}
1207 
1208 		/* end of "for" loop: time for next pte */
1209 	}
1210 }
1211 
1212 /*
1213  * pmap_remove: top level mapping removal function
1214  *
1215  * => caller should not be holding any pmap locks
1216  */
1217 
1218 void
1219 pmap_do_remove_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1220 {
1221 	pt_entry_t *ptes;
1222 	paddr_t ptppa;
1223 	vaddr_t blkendva;
1224 	struct vm_page *ptp;
1225 	struct pv_entry *pve;
1226 	struct pv_entry *free_pvs = NULL;
1227 	TAILQ_HEAD(, vm_page) empty_ptps;
1228 	int shootall;
1229 	vaddr_t va;
1230 
1231 	TAILQ_INIT(&empty_ptps);
1232 
1233 	ptes = pmap_map_ptes_pae(pmap);	/* locks pmap */
1234 
1235 	/*
1236 	 * Decide if we want to shoot the whole tlb or just the range.
1237 	 * Right now, we simply shoot everything when we remove more
1238 	 * than 32 pages, but never in the kernel pmap. XXX - tune.
1239 	 */
1240 	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1241 		shootall = 1;
1242 	else
1243 		shootall = 0;
1244 
1245 	for (va = sva ; va < eva ; va = blkendva) {
1246 		/* determine range of block */
1247 		blkendva = i386_round_pdr(va + 1);
1248 		if (blkendva > eva)
1249 			blkendva = eva;
1250 
1251 		/*
1252 		 * XXXCDC: our PTE mappings should never be removed
1253 		 * with pmap_remove!  if we allow this (and why would
1254 		 * we?) then we end up freeing the pmap's page
1255 		 * directory page (PDP) before we are finished using
1256 		 * it when we hit it in the recursive mapping.  this
1257 		 * is BAD.
1258 		 *
1259 		 * long term solution is to move the PTEs out of user
1260 		 * address space.  and into kernel address space (up
1261 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1262 		 * be VM_MAX_ADDRESS.
1263 		 */
1264 
1265 		if (pdei(va) >= PDSLOT_PTE && pdei(va) <= (PDSLOT_PTE + 3))
1266 			/* XXXCDC: ugly hack to avoid freeing PDP here */
1267 			continue;
1268 
1269 		if (!pmap_valid_entry(PDE(pmap, pdei(va))))
1270 			/* valid block? */
1271 			continue;
1272 
1273 		/* PA of the PTP */
1274 		ptppa = PDE(pmap, pdei(va)) & PG_FRAME;
1275 
1276 		/* get PTP if non-kernel mapping */
1277 		if (pmap == pmap_kernel()) {
1278 			/* we never free kernel PTPs */
1279 			ptp = NULL;
1280 		} else {
1281 			if (pmap->pm_ptphint &&
1282 			    VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
1283 				ptp = pmap->pm_ptphint;
1284 			} else {
1285 				ptp = PHYS_TO_VM_PAGE(ptppa);
1286 #ifdef DIAGNOSTIC
1287 				if (ptp == NULL)
1288 					panic("pmap_do_remove_pae: unmanaged "
1289 					      "PTP detected");
1290 #endif
1291 			}
1292 		}
1293 
1294 		pmap_remove_ptes_pae(pmap, ptp, (vaddr_t)&ptes[atop(va)],
1295 		    va, blkendva, flags, &free_pvs);
1296 
1297 		/* If PTP is no longer being used, free it. */
1298 		if (ptp && ptp->wire_count <= 1) {
1299 			pmap_drop_ptp_pae(pmap, va, ptp, ptes);
1300 			TAILQ_INSERT_TAIL(&empty_ptps, ptp, pageq);
1301 		}
1302 
1303 		if (!shootall)
1304 			pmap_tlb_shootrange(pmap, va, blkendva);
1305 	}
1306 
1307 	if (shootall)
1308 		pmap_tlb_shoottlb();
1309 
1310 	pmap_unmap_ptes_pae(pmap);
1311 	pmap_tlb_shootwait();
1312 
1313 	while ((pve = free_pvs) != NULL) {
1314 		free_pvs = pve->pv_next;
1315 		pool_put(&pmap_pv_pool, pve);
1316 	}
1317 
1318 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1319 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1320 		uvm_pagefree(ptp);
1321 	}
1322 }
1323 
1324 /*
1325  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1326  *
1327  * => R/M bits are sync'd back to attrs
1328  */
1329 
1330 void
1331 pmap_page_remove_pae(struct vm_page *pg)
1332 {
1333 	struct pv_entry *pve;
1334 	struct pmap *pm;
1335 	pt_entry_t *ptes, opte;
1336 	TAILQ_HEAD(, vm_page) empty_ptps;
1337 	struct vm_page *ptp;
1338 
1339 	if (pg->mdpage.pv_list == NULL)
1340 		return;
1341 
1342 	TAILQ_INIT(&empty_ptps);
1343 
1344 	mtx_enter(&pg->mdpage.pv_mtx);
1345 	while ((pve = pg->mdpage.pv_list) != NULL) {
1346 		pmap_reference(pve->pv_pmap);
1347 		pm = pve->pv_pmap;
1348 		mtx_leave(&pg->mdpage.pv_mtx);
1349 
1350 		ptes = pmap_map_ptes_pae(pm);	/* locks pmap */
1351 
1352 		/*
1353 		 * We dropped the pvlist lock before grabbing the pmap
1354 		 * lock to avoid lock ordering problems.  This means
1355 		 * we have to check the pvlist again since somebody
1356 		 * else might have modified it.  All we care about is
1357 		 * that the pvlist entry matches the pmap we just
1358 		 * locked.  If it doesn't, unlock the pmap and try
1359 		 * again.
1360 		 */
1361 		mtx_enter(&pg->mdpage.pv_mtx);
1362 		if ((pve = pg->mdpage.pv_list) == NULL ||
1363 		    pve->pv_pmap != pm) {
1364 			mtx_leave(&pg->mdpage.pv_mtx);
1365 			pmap_unmap_ptes_pae(pm);	/* unlocks pmap */
1366 			pmap_destroy(pm);
1367 			mtx_enter(&pg->mdpage.pv_mtx);
1368 			continue;
1369 		}
1370 
1371 		pg->mdpage.pv_list = pve->pv_next;
1372 		mtx_leave(&pg->mdpage.pv_mtx);
1373 
1374 #ifdef DIAGNOSTIC
1375 		if (pve->pv_ptp && (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1376 				    PG_FRAME)
1377 		    != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1378 			printf("pmap_page_remove_pae: pg=%p: va=%lx, "
1379 				"pv_ptp=%p\n",
1380 				pg, pve->pv_va, pve->pv_ptp);
1381 			printf("pmap_page_remove_pae: PTP's phys addr: "
1382 				"actual=%llx, recorded=%lx\n",
1383 				(PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1384 				PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
1385 			panic("pmap_page_remove_pae: mapped managed page has "
1386 				"invalid pv_ptp field");
1387 		}
1388 #endif
1389 		opte = i386_atomic_testset_uq(&ptes[atop(pve->pv_va)], 0);
1390 
1391 		if (opte & PG_W)
1392 			pve->pv_pmap->pm_stats.wired_count--;
1393 		pve->pv_pmap->pm_stats.resident_count--;
1394 
1395 		/* sync R/M bits */
1396 		pmap_sync_flags_pte_pae(pg, opte);
1397 
1398 		/* update the PTP reference count.  free if last reference. */
1399 		if (pve->pv_ptp && --pve->pv_ptp->wire_count <= 1) {
1400 			pmap_drop_ptp_pae(pve->pv_pmap, pve->pv_va,
1401 			    pve->pv_ptp, ptes);
1402 			TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, pageq);
1403 		}
1404 
1405 		pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1406 
1407 		pmap_unmap_ptes_pae(pve->pv_pmap);	/* unlocks pmap */
1408 		pmap_destroy(pve->pv_pmap);
1409 		pool_put(&pmap_pv_pool, pve);
1410 		mtx_enter(&pg->mdpage.pv_mtx);
1411 	}
1412 	mtx_leave(&pg->mdpage.pv_mtx);
1413 
1414 	pmap_tlb_shootwait();
1415 
1416 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1417 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1418 		uvm_pagefree(ptp);
1419 	}
1420 }
1421 
1422 /*
1423  * p m a p   a t t r i b u t e  f u n c t i o n s
1424  * functions that test/change managed page's attributes
1425  * since a page can be mapped multiple times we must check each PTE that
1426  * maps it by going down the pv lists.
1427  */
1428 
1429 /*
1430  * pmap_test_attrs: test a page's attributes
1431  *
1432  * => we set pv_head => pmap locking
1433  */
1434 
1435 int
1436 pmap_test_attrs_pae(struct vm_page *pg, int testbits)
1437 {
1438 	struct pv_entry *pve;
1439 	pt_entry_t *ptes, pte;
1440 	u_long mybits, testflags;
1441 	paddr_t ptppa;
1442 
1443 	testflags = pmap_pte2flags(testbits);
1444 
1445 	if (pg->pg_flags & testflags)
1446 		return 1;
1447 
1448 	mybits = 0;
1449 	mtx_enter(&pg->mdpage.pv_mtx);
1450 	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
1451 	    pve = pve->pv_next) {
1452 		ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
1453 		ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
1454 		pte = ptes[ptei(pve->pv_va)];
1455 		pmap_tmpunmap_pa();
1456 		mybits |= (pte & testbits);
1457 	}
1458 	mtx_leave(&pg->mdpage.pv_mtx);
1459 
1460 	if (mybits == 0)
1461 		return 0;
1462 
1463 	atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
1464 
1465 	return 1;
1466 }
1467 
1468 /*
1469  * pmap_clear_attrs: change a page's attributes
1470  *
1471  * => we return 1 if we cleared one of the bits we were asked to
1472  */
1473 int
1474 pmap_clear_attrs_pae(struct vm_page *pg, int clearbits)
1475 {
1476 	struct pv_entry *pve;
1477 	pt_entry_t *ptes, npte, opte;
1478 	u_long clearflags;
1479 	paddr_t ptppa;
1480 	int result;
1481 
1482 	clearflags = pmap_pte2flags(clearbits);
1483 
1484 	result = pg->pg_flags & clearflags;
1485 	if (result)
1486 		atomic_clearbits_int(&pg->pg_flags, clearflags);
1487 
1488 	mtx_enter(&pg->mdpage.pv_mtx);
1489 	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
1490 		ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
1491 		ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
1492 #ifdef DIAGNOSTIC
1493 		if (!pmap_valid_entry(PDE(pve->pv_pmap, pdei(pve->pv_va))))
1494 			panic("pmap_clear_attrs_pae: mapping without PTP "
1495 				"detected");
1496 #endif
1497 
1498 		opte = ptes[ptei(pve->pv_va)];
1499 		if (opte & clearbits) {
1500 			result = 1;
1501 			npte = opte & ~clearbits;
1502 			opte = i386_atomic_testset_uq(
1503 			   &ptes[ptei(pve->pv_va)], npte);
1504 			pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1505 		}
1506 		pmap_tmpunmap_pa();
1507 	}
1508 	mtx_leave(&pg->mdpage.pv_mtx);
1509 
1510 	pmap_tlb_shootwait();
1511 
1512 	return (result != 0);
1513 }
1514 
1515 
1516 /*
1517  * p m a p   p r o t e c t i o n   f u n c t i o n s
1518  */
1519 
1520 /*
1521  * pmap_page_protect: change the protection of all recorded mappings
1522  *	of a managed page
1523  *
1524  * => NOTE: this is an inline function in pmap.h
1525  */
1526 
1527 /* see pmap.h */
1528 
1529 /*
1530  * pmap_protect: set the protection in of the pages in a pmap
1531  *
1532  * => NOTE: this is an inline function in pmap.h
1533  */
1534 
1535 /* see pmap.h */
1536 
1537 /*
1538  * pmap_write_protect: write-protect pages in a pmap
1539  */
1540 
1541 void
1542 pmap_write_protect_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva,
1543     vm_prot_t prot)
1544 {
1545 	pt_entry_t *ptes, *spte, *epte, npte, opte;
1546 	vaddr_t blockend;
1547 	u_int64_t md_prot;
1548 	vaddr_t va;
1549 	int shootall = 0;
1550 
1551 	ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */
1552 
1553 	/* should be ok, but just in case ... */
1554 	sva &= PG_FRAME;
1555 	eva &= PG_FRAME;
1556 
1557 	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1558 		shootall = 1;
1559 
1560 	for (va = sva; va < eva; va = blockend) {
1561 		blockend = (va & PD_MASK) + NBPD;
1562 		if (blockend > eva)
1563 			blockend = eva;
1564 
1565 		/*
1566 		 * XXXCDC: our PTE mappings should never be write-protected!
1567 		 *
1568 		 * long term solution is to move the PTEs out of user
1569 		 * address space.  and into kernel address space (up
1570 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1571 		 * be VM_MAX_ADDRESS.
1572 		 */
1573 
1574 		/* XXXCDC: ugly hack to avoid freeing PDP here */
1575 		if (pdei(va) >= PDSLOT_PTE && pdei(va) <= (PDSLOT_PTE + 3))
1576 			continue;
1577 
1578 		/* empty block? */
1579 		if (!pmap_valid_entry(PDE(pmap, pdei(va))))
1580 			continue;
1581 
1582 		md_prot = protection_codes[prot];
1583 		if (!(prot & PROT_EXEC))
1584 			md_prot |= PG_NX;
1585 		if (va < VM_MAXUSER_ADDRESS)
1586 			md_prot |= PG_u;
1587 		else if (va < VM_MAX_ADDRESS)
1588 			/* XXX: write-prot our PTES? never! */
1589 			md_prot |= PG_RW;
1590 
1591 		spte = &ptes[atop(va)];
1592 		epte = &ptes[atop(blockend)];
1593 
1594 		for (/*null */; spte < epte ; spte++, va += PAGE_SIZE) {
1595 
1596 			if (!pmap_valid_entry(*spte))	/* no mapping? */
1597 				continue;
1598 
1599 			opte = *spte;
1600 			npte = (opte & ~(pt_entry_t)PG_PROT) | md_prot;
1601 
1602 			if (npte != opte) {
1603 				pmap_exec_account(pmap, va, *spte, npte);
1604 				i386_atomic_testset_uq(spte, npte);
1605 			}
1606 		}
1607 	}
1608 	if (shootall)
1609 		pmap_tlb_shoottlb();
1610 	else
1611 		pmap_tlb_shootrange(pmap, sva, eva);
1612 
1613 	pmap_unmap_ptes_pae(pmap);		/* unlocks pmap */
1614 	pmap_tlb_shootwait();
1615 }
1616 
1617 /*
1618  * end of protection functions
1619  */
1620 
1621 /*
1622  * pmap_unwire: clear the wired bit in the PTE
1623  *
1624  * => mapping should already be in map
1625  */
1626 
1627 void
1628 pmap_unwire_pae(struct pmap *pmap, vaddr_t va)
1629 {
1630 	pt_entry_t *ptes;
1631 
1632 	if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1633 		ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */
1634 
1635 #ifdef DIAGNOSTIC
1636 		if (!pmap_valid_entry(ptes[atop(va)]))
1637 			panic("pmap_unwire_pae: invalid (unmapped) va "
1638 			      "0x%lx", va);
1639 #endif
1640 		if ((ptes[atop(va)] & PG_W) != 0) {
1641 			i386_atomic_testset_uq(&ptes[atop(va)],
1642 			    ptes[atop(va)] & ~PG_W);
1643 			pmap->pm_stats.wired_count--;
1644 		}
1645 #ifdef DIAGNOSTIC
1646 		else {
1647 			printf("pmap_unwire_pae: wiring for pmap %p va 0x%lx "
1648 			       "didn't change!\n", pmap, va);
1649 		}
1650 #endif
1651 		pmap_unmap_ptes_pae(pmap);		/* unlocks map */
1652 	}
1653 #ifdef DIAGNOSTIC
1654 	else {
1655 		panic("pmap_unwire_pae: invalid PDE");
1656 	}
1657 #endif
1658 }
1659 
1660 /*
1661  * pmap_enter: enter a mapping into a pmap
1662  *
1663  * => must be done "now" ... no lazy-evaluation
1664  */
1665 
1666 int
1667 pmap_enter_pae(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
1668     int flags)
1669 {
1670 	pt_entry_t *ptes, opte, npte;
1671 	struct vm_page *ptp;
1672 	struct pv_entry *pve, *opve = NULL;
1673 	int wired = (flags & PMAP_WIRED) != 0;
1674 	int nocache = (pa & PMAP_NOCACHE) != 0;
1675 	int wc = (pa & PMAP_WC) != 0;
1676 	struct vm_page *pg = NULL;
1677 	int error, wired_count, resident_count, ptp_count;
1678 
1679 	KASSERT(!(wc && nocache));
1680 	pa &= PMAP_PA_MASK;	/* nuke flags from pa */
1681 
1682 #ifdef DIAGNOSTIC
1683 	/* sanity check: totally out of range? */
1684 	if (va >= VM_MAX_KERNEL_ADDRESS)
1685 		panic("pmap_enter_pae: too big");
1686 
1687 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
1688 		panic("pmap_enter_pae: trying to map over PDP/APDP!");
1689 
1690 	/* sanity check: kernel PTPs should already have been pre-allocated */
1691 	if (va >= VM_MIN_KERNEL_ADDRESS &&
1692 	    !pmap_valid_entry(PDE(pmap, pdei(va))))
1693 		panic("pmap_enter_pae: missing kernel PTP!");
1694 #endif
1695 
1696 	if (pmap_initialized)
1697 		pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
1698 	else
1699 		pve = NULL;
1700 	wired_count = resident_count = ptp_count = 0;
1701 
1702 	/*
1703 	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
1704 	 */
1705 
1706 	ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */
1707 	if (pmap == pmap_kernel()) {
1708 		ptp = NULL;
1709 	} else {
1710 		ptp = pmap_get_ptp_pae(pmap, pdei(va));
1711 		if (ptp == NULL) {
1712 			if (flags & PMAP_CANFAIL) {
1713 				error = ENOMEM;
1714 				pmap_unmap_ptes_pae(pmap);
1715 				goto out;
1716 			}
1717 			panic("pmap_enter_pae: get ptp failed");
1718 		}
1719 	}
1720 	/*
1721 	 * not allowed to sleep after here!
1722 	 */
1723 	opte = ptes[atop(va)];			/* old PTE */
1724 
1725 	/*
1726 	 * is there currently a valid mapping at our VA?
1727 	 */
1728 
1729 	if (pmap_valid_entry(opte)) {
1730 
1731 		/*
1732 		 * first, calculate pm_stats updates.  resident count will not
1733 		 * change since we are replacing/changing a valid
1734 		 * mapping.  wired count might change...
1735 		 */
1736 
1737 		if (wired && (opte & PG_W) == 0)
1738 			wired_count++;
1739 		else if (!wired && (opte & PG_W) != 0)
1740 			wired_count--;
1741 
1742 		/*
1743 		 * is the currently mapped PA the same as the one we
1744 		 * want to map?
1745 		 */
1746 
1747 		if ((opte & PG_FRAME) == pa) {
1748 
1749 			/* if this is on the PVLIST, sync R/M bit */
1750 			if (opte & PG_PVLIST) {
1751 				pg = PHYS_TO_VM_PAGE(pa);
1752 #ifdef DIAGNOSTIC
1753 				if (pg == NULL)
1754 					panic("pmap_enter_pae: same pa "
1755 					     "PG_PVLIST mapping with "
1756 					     "unmanaged page "
1757 					     "pa = 0x%lx (0x%lx)", pa,
1758 					     atop(pa));
1759 #endif
1760 				pmap_sync_flags_pte_pae(pg, opte);
1761 			}
1762 			goto enter_now;
1763 		}
1764 
1765 		/*
1766 		 * changing PAs: we must remove the old one first
1767 		 */
1768 
1769 		/*
1770 		 * if current mapping is on a pvlist,
1771 		 * remove it (sync R/M bits)
1772 		 */
1773 
1774 		if (opte & PG_PVLIST) {
1775 			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1776 #ifdef DIAGNOSTIC
1777 			if (pg == NULL)
1778 				panic("pmap_enter_pae: PG_PVLIST mapping with "
1779 				      "unmanaged page "
1780 				      "pa = 0x%lx (0x%lx)", pa, atop(pa));
1781 #endif
1782 			pmap_sync_flags_pte_pae(pg, opte);
1783 			opve = pmap_remove_pv(pg, pmap, va);
1784 			pg = NULL; /* This is not the page we are looking for */
1785 		}
1786 	} else {	/* opte not valid */
1787 		resident_count++;
1788 		if (wired)
1789 			wired_count++;
1790 		if (ptp)
1791 			ptp_count++;	/* count # of valid entries */
1792 	}
1793 
1794 	/*
1795 	 * pve is either NULL or points to a now-free pv_entry structure
1796 	 * (the latter case is if we called pmap_remove_pv above).
1797 	 *
1798 	 * if this entry is to be on a pvlist, enter it now.
1799 	 */
1800 
1801 	if (pmap_initialized && pg == NULL)
1802 		pg = PHYS_TO_VM_PAGE(pa);
1803 
1804 	if (pg != NULL) {
1805 		if (pve == NULL) {
1806 			pve = opve;
1807 			opve = NULL;
1808 		}
1809 		if (pve == NULL) {
1810 			if (flags & PMAP_CANFAIL) {
1811 				pmap_unmap_ptes_pae(pmap);
1812 				error = ENOMEM;
1813 				goto out;
1814 			}
1815 			panic("pmap_enter_pae: no pv entries available");
1816 		}
1817 		/* lock pg when adding */
1818 		pmap_enter_pv(pg, pve, pmap, va, ptp);
1819 		pve = NULL;
1820 	}
1821 
1822 enter_now:
1823 	/*
1824 	 * at this point pg is !NULL if we want the PG_PVLIST bit set
1825 	 */
1826 
1827 	npte = pa | protection_codes[prot] | PG_V;
1828 	if (!(prot & PROT_EXEC))
1829 		npte |= PG_NX;
1830 	pmap_exec_account(pmap, va, opte, npte);
1831 	if (wired)
1832 		npte |= PG_W;
1833 	if (nocache)
1834 		npte |= PG_N;
1835 	if (va < VM_MAXUSER_ADDRESS)
1836 		npte |= PG_u;
1837 	else if (va < VM_MAX_ADDRESS)
1838 		npte |= PG_RW;	/* XXXCDC: no longer needed? */
1839 	if (pmap == pmap_kernel())
1840 		npte |= pmap_pg_g;
1841 	if (flags & PROT_READ)
1842 		npte |= PG_U;
1843 	if (flags & PROT_WRITE)
1844 		npte |= PG_M;
1845 	if (pg) {
1846 		npte |= PG_PVLIST;
1847 		if (pg->pg_flags & PG_PMAP_WC) {
1848 			KASSERT(nocache == 0);
1849 			wc = 1;
1850 		}
1851 		pmap_sync_flags_pte_pae(pg, npte);
1852 	}
1853 	if (wc)
1854 		npte |= pmap_pg_wc;
1855 
1856 	opte = i386_atomic_testset_uq(&ptes[atop(va)], npte);
1857 	if (ptp)
1858 		ptp->wire_count += ptp_count;
1859 	pmap->pm_stats.resident_count += resident_count;
1860 	pmap->pm_stats.wired_count += wired_count;
1861 
1862 	if (pmap_valid_entry(opte)) {
1863 		if (nocache && (opte & PG_N) == 0)
1864 			wbinvd_on_all_cpus(); /* XXX clflush before we enter? */
1865 		pmap_tlb_shootpage(pmap, va);
1866 	}
1867 
1868 	pmap_unmap_ptes_pae(pmap);
1869 	pmap_tlb_shootwait();
1870 
1871 	error = 0;
1872 
1873 out:
1874 	if (pve)
1875 		pool_put(&pmap_pv_pool, pve);
1876 	if (opve)
1877 		pool_put(&pmap_pv_pool, opve);
1878 
1879 	return error;
1880 }
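
/*
 * Illustrative caller sketch (assumed context, not from this file): the MI
 * pmap_enter(9) interface ends up in this function when PAE is active.
 * With PMAP_CANFAIL the caller must be prepared for ENOMEM, e.g.:
 *
 *	error = pmap_enter(pmap, va, pa, PROT_READ | PROT_WRITE,
 *	    PROT_READ | PROT_WRITE | PMAP_WIRED | PMAP_CANFAIL);
 *	if (error)
 *		(back off, release resources, retry later)
 *
 * Without PMAP_CANFAIL, PTP or pv-entry exhaustion panics instead, as the
 * code above shows.
 */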
1881 
1882 /*
1883  * Allocate an extra PDPT and PT pages as needed to map kernel pages
1884  * used for the U-K mappings.  These special mappings are set up
1885  * during bootstrap, are never removed, and are part of pmap_kernel.
1886  *
1887  * New pmaps inherit the kernel portion of pmap_kernel including
1888  * the special mappings (see pmap_pinit_pd_pae()).
1889  */
1890 void
1891 pmap_enter_special_pae(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int32_t flags)
1892 {
1893 	struct pmap 	*pmap = pmap_kernel();
1894 	struct vm_page	*ptppg = NULL;
1895 	pd_entry_t	*pd, *ptp;
1896 	pt_entry_t	*ptes;
1897 	uint32_t	 l2idx, l1idx;
1898 	paddr_t		 npa;
1899 
1900 	/* If the CPU is not affected by Meltdown, no need to do anything */
1901 	if (!cpu_meltdown)
1902 		return;
1903 
1904 	/* Must be kernel VA */
1905 	if (va < VM_MIN_KERNEL_ADDRESS)
1906 		panic("invalid special mapping va 0x%lx requested", va);
1907 
1908 	KASSERT(pmap->pm_pdir_intel != 0);
1909 
1910 	DPRINTF("%s: pm_pdir_intel 0x%x pm_pdirpa_intel 0x%x\n", __func__,
1911 	    (uint32_t)pmap->pm_pdir_intel, (uint32_t)pmap->pm_pdirpa_intel);
1912 
1913 	/* These are the PAE versions of pdei() and ptei() */
1914 	l2idx = pdei(va);
1915 	l1idx = ptei(va);
1916 
1917 	DPRINTF("%s: va 0x%08lx pa 0x%08lx prot 0x%08lx flags 0x%08x "
1918 	    "l2idx %u l1idx %u\n", __func__, va, pa, (unsigned long)prot,
1919 	    flags, l2idx, l1idx);
1920 
1921 	if ((pd = (pd_entry_t *)pmap->pm_pdir_intel) == 0)
1922 		panic("%s: PD not initialized for pmap @ %p", __func__, pmap);
1923 
1924 	/* npa = physaddr of PT page */
1925 	npa = pd[l2idx] & PMAP_PA_MASK;
1926 
1927 	/* Valid PDE for the 2MB region containing va? */
1928 	if (!npa) {
1929 		/*
1930 		 * No valid PDE - allocate PT page and set PDE.  We
1931 		 * get it from pm_obj, which is used for PT pages.
1932 		 * We calculate the offset from l2idx+2048, so we are
1933 		 * beyond the regular PT pages, whose l2idx values
1934 		 * satisfy 0 <= l2idx < 2048.
1935 		 */
1936 		ptppg = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(l2idx + 2048),
1937 		    NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1938 		if (ptppg == NULL)
1939 			panic("%s: failed to allocate PT page", __func__);
1940 
1941 		atomic_clearbits_int(&ptppg->pg_flags, PG_BUSY);
1942 		ptppg->wire_count = 1;	/* no mappings yet */
1943 
1944 		npa = VM_PAGE_TO_PHYS(ptppg);
1945 		pd[l2idx] = (npa | PG_RW | PG_V | PG_M | PG_U);
1946 
1947 		DPRINTF("%s: allocated new PT page at phys 0x%x, "
1948 		    "setting PDE[%d] = 0x%llx\n", __func__, (uint32_t)npa,
1949 		    l2idx, pd[l2idx]);
1950 	}
1951 
1952 	/* temporarily map PT page and set PTE for U-K mapping */
1953 	if (ptppg == NULL && (ptppg = PHYS_TO_VM_PAGE(npa)) == NULL)
1954 		panic("%s: no vm_page for PT page", __func__);
1955 	mtx_enter(&ptppg->mdpage.pv_mtx);
1956 	ptp = (pd_entry_t *)pmap_tmpmap_pa(npa);
1957 	ptp[l1idx] = (pa | protection_codes[prot] | PG_V | PG_M | PG_U | flags);
1958 	DPRINTF("%s: setting PTE[%d] = 0x%llx\n", __func__, l1idx, ptp[l1idx]);
1959 	pmap_tmpunmap_pa();
1960 	mtx_leave(&ptppg->mdpage.pv_mtx);
1961 
1962 	/* if supported, set the PG_G flag on the corresponding U+K entry */
1963 	if (!(cpu_feature & CPUID_PGE))
1964 		return;
1965 	ptes = pmap_map_ptes_pae(pmap);	/* pmap_kernel -> PTE_BASE */
1966 	if (pmap_valid_entry(ptes[atop(va)]))
1967 		ptes[atop(va)] |= PG_G;
1968 	else
1969 		DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
1970 	pmap_unmap_ptes_pae(pmap);	/* pmap_kernel -> nothing */
1971 }
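
/*
 * Editor's illustrative sketch (standalone, assumed constants): how a PAE
 * virtual address splits into the l2idx/l1idx pair used above.  With 2 MB
 * per PDE and 512 PTEs per PT page, l2idx covers 0..2047 for the full 4 GB
 * address space, which is why the special PT pages are hung off pm_obj at
 * offset l2idx + 2048.
 */
#include <stdint.h>

#define X_PDSHIFT	21	/* log2(2 MB): assumed PAE PDE coverage */
#define X_PGSHIFT	12	/* log2(4 KB) */
#define X_NPTE_PG	512	/* 4096 / sizeof(uint64_t) PTEs per PT page */

static void
x_pae_indices(uint32_t va, uint32_t *l2idx, uint32_t *l1idx)
{
	*l2idx = va >> X_PDSHIFT;			/* 0 .. 2047 */
	*l1idx = (va >> X_PGSHIFT) % X_NPTE_PG;		/* 0 .. 511 */
}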
1972 
1973 /*
1974  * pmap_growkernel: increase usage of KVM space
1975  *
1976  * => we allocate new PTPs for the kernel and install them in all
1977  *	the pmaps on the system.
1978  */
1979 
1980 vaddr_t
1981 pmap_growkernel_pae(vaddr_t maxkvaddr)
1982 {
1983 	extern int nkpde;
1984 	struct pmap *kpm = pmap_kernel(), *pm;
1985 	int needed_kpde;   /* needed number of kernel PTPs */
1986 	int s;
1987 	paddr_t ptaddr;
1988 
1989 	needed_kpde = (int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
1990 		/ NBPD;
1991 	if (needed_kpde <= nkpde)
1992 		goto out;		/* we are OK */
1993 
1994 	/*
1995 	 * whoops!   we need to add kernel PTPs
1996 	 */
1997 
1998 	s = splhigh();	/* to be safe */
1999 
2000 	for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {
2001 
2002 		if (uvm.page_init_done == 0) {
2003 
2004 			/*
2005 			 * we're growing the kernel pmap early (from
2006 			 * uvm_pageboot_alloc()).  this case must be
2007 			 * handled a little differently.
2008 			 */
2009 
2010 			if (uvm_page_physget(&ptaddr) == 0)
2011 				panic("pmap_growkernel: out of memory");
2012 			pmap_zero_phys_pae(ptaddr);
2013 
2014 			PDE(kpm, PDSLOT_KERN + nkpde) =
2015 				ptaddr | PG_RW | PG_V | PG_U | PG_M;
2016 
2017 			/* count PTP as resident */
2018 			kpm->pm_stats.resident_count++;
2019 			continue;
2020 		}
2021 
2022 		/*
2023 		 * THIS *MUST* BE CODED SO AS TO WORK IN THE
2024 		 * pmap_initialized == 0 CASE!  WE MAY BE
2025 		 * INVOKED WHILE pmap_init() IS RUNNING!
2026 		 */
2027 
2028 		while (!pmap_alloc_ptp_pae(kpm, PDSLOT_KERN + nkpde, 0))
2029 			uvm_wait("pmap_growkernel");
2030 
2031 		/* distribute new kernel PTP to all active pmaps */
2032 		mtx_enter(&pmaps_lock);
2033 		LIST_FOREACH(pm, &pmaps, pm_list) {
2034 			PDE(pm, PDSLOT_KERN + nkpde) =
2035 				PDE(kpm, PDSLOT_KERN + nkpde);
2036 		}
2037 		mtx_leave(&pmaps_lock);
2038 	}
2039 
2040 	splx(s);
2041 
2042 out:
2043 	return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
2044 }
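
/*
 * Editor's illustrative sketch (standalone, assumed NBPD): the round-up
 * division above that turns a requested maximum kernel VA into a count of
 * kernel PTPs, with each PAE PDE mapping 2 MB.
 */
#include <stdint.h>

#define X_NBPD	(1U << 21)	/* assumed: 2 MB per PDE under PAE */

static unsigned int
x_needed_kpde(uint32_t maxkvaddr, uint32_t vm_min_kernel_address)
{
	/* round up so a partially covered 2 MB block still gets a PTP */
	return (maxkvaddr - vm_min_kernel_address + (X_NBPD - 1)) / X_NBPD;
}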
2045 
2046 /*
2047  * Pre-allocate PTP 0 for low memory, so that 1:1 mappings for various
2048  * trampoline code can be entered.
2049  */
2050 void
2051 pmap_prealloc_lowmem_ptp_pae(void)
2052 {
2053 	pt_entry_t *pte, npte;
2054 	vaddr_t ptpva = (vaddr_t)vtopte(0);
2055 
2056 	/* enter pa for pte 0 into recursive map */
2057 	pte = vtopte(ptpva);
2058 	npte = PTP0_PA | PG_RW | PG_V | PG_U | PG_M;
2059 
2060 	i386_atomic_testset_uq(pte, npte);
2061 
2062 	/* make sure it is clean before using */
2063 	memset((void *)ptpva, 0, NBPG);
2064 }
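
/*
 * Editor's note (assumed details): with the recursive PTE mapping, the PTE
 * for any va is itself addressable at a fixed virtual window, so vtopte()
 * is plain arithmetic, roughly:
 *
 *	pt_entry_t *vtopte(vaddr_t va) { return PTE_BASE + atop(va); }
 *
 * The function above uses it twice: vtopte(0) is where PTP 0's contents
 * become visible through that window, and vtopte() of that address is the
 * slot that must receive PTP0_PA before the new PT page can be zeroed
 * through the window.
 */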
2065 
2066 /*
2067  * pmap_tmpmap_pa_pae: map a page in for tmp usage
2068  */
2069 
2070 vaddr_t
2071 pmap_tmpmap_pa_pae(paddr_t pa)
2072 {
2073 #ifdef MULTIPROCESSOR
2074 	int id = cpu_number();
2075 #endif
2076 	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
2077 	caddr_t ptpva = VASLEW(pmap_ptpp, id);
2078 #if defined(DIAGNOSTIC)
2079 	if (*ptpte)
2080 		panic("pmap_tmpmap_pa_pae: ptp_pte in use?");
2081 #endif
2082 	*ptpte = PG_V | PG_RW | pa;	/* always a new mapping */
2083 	return((vaddr_t)ptpva);
2084 }
2085 
2086 /*
2087  * pmap_tmpunmap_pa_pae: unmap a tmp use page (undoes pmap_tmpmap_pa_pae)
2088  */
2089 
2090 void
2091 pmap_tmpunmap_pa_pae(void)
2092 {
2093 #ifdef MULTIPROCESSOR
2094 	int id = cpu_number();
2095 #endif
2096 	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
2097 	caddr_t ptpva = VASLEW(pmap_ptpp, id);
2098 #if defined(DIAGNOSTIC)
2099 	if (!pmap_valid_entry(*ptpte))
2100 		panic("pmap_tmpunmap_pa_pae: our pte invalid?");
2101 #endif
2102 	*ptpte = 0;
2103 	pmap_update_pg((vaddr_t)ptpva);
2104 #ifdef MULTIPROCESSOR
2105 	/*
2106 	 * No need for tlb shootdown here, since ptp_pte is per-CPU.
2107 	 */
2108 #endif
2109 }
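
/*
 * Illustrative usage sketch (assumed caller, not from this file): the
 * tmpmap/tmpunmap pair brackets a short, single-CPU access to an arbitrary
 * physical page, e.g. to clear it:
 *
 *	vaddr_t va = pmap_tmpmap_pa_pae(pa);
 *	memset((void *)va, 0, PAGE_SIZE);
 *	pmap_tmpunmap_pa_pae();
 *
 * The mapping uses a per-CPU slot (ptp_pte/pmap_ptpp), so the caller must
 * not sleep or migrate to another CPU between the two calls.
 */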
2110 
2111 paddr_t
2112 vtophys_pae(vaddr_t va)
2113 {
2114 	return ((*vtopte(va) & PG_FRAME) | (va & ~PG_FRAME));
2115 }
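
/*
 * Editor's illustrative sketch (standalone, assumed 4 KB pages and frame
 * mask): the frame/offset recombination performed by vtophys_pae() above.
 */
#include <stdint.h>

#define X_PG_FRAME	0x000ffffffffff000ULL	/* assumed PAE frame mask */

static uint64_t
x_vtophys(uint64_t pte, uint32_t va)
{
	/* physical frame from the PTE, byte offset within the page from va */
	return (pte & X_PG_FRAME) | (va & 0xfffU);
}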
2116 
2117 void
2118 pmap_flush_page_pae(paddr_t pa)
2119 {
2120 #ifdef MULTIPROCESSOR
2121 	int id = cpu_number();
2122 #endif
2123 	pt_entry_t *pte = PTESLEW(flsh_pte, id);
2124 	caddr_t va = VASLEW(pmap_flshp, id);
2125 
2126 	KDASSERT(PHYS_TO_VM_PAGE(pa) != NULL);
2127 #ifdef DIAGNOSTIC
2128 	if (*pte)
2129 		panic("pmap_flush_page_pae: lock botch");
2130 #endif
2131 
2132 	*pte = (pa & PG_FRAME) | PG_V | PG_RW;
2133 	pmap_update_pg(va);
2134 	pmap_flush_cache((vaddr_t)va, PAGE_SIZE);
2135 	*pte = 0;
2136 	pmap_update_pg(va);
2137 }
2138