1 /* $OpenBSD: pmapae.c,v 1.75 2024/11/16 10:09:08 mpi Exp $ */
2
3 /*
4 * Copyright (c) 2006-2008 Michael Shalayeff
5 * All rights reserved.
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER IN
16 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
17 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19 /*
20 * Copyright (c) 1997 Charles D. Cranor and Washington University.
21 * All rights reserved.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the above copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 *
32 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
33 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
34 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
35 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
36 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
38 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
39 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
41 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 *
43 * from OpenBSD: pmap.c,v 1.85 2005/11/18 17:05:04 brad Exp
44 */
45
46 /*
47 * pmap.c: i386 pmap module rewrite
48 * Chuck Cranor <chuck@ccrc.wustl.edu>
49 * 11-Aug-97
50 *
51 * history of this pmap module: in addition to my own input, i used
52 * the following references for this rewrite of the i386 pmap:
53 *
54 * [1] the NetBSD i386 pmap. this pmap appears to be based on the
55 * BSD hp300 pmap done by Mike Hibler at University of Utah.
56 * it was then ported to the i386 by William Jolitz of UUNET
57 * Technologies, Inc. Then Charles M. Hannum of the NetBSD
58 * project fixed some bugs and provided some speed ups.
59 *
60 * [2] the FreeBSD i386 pmap. this pmap seems to be the
61 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
62 * and David Greenman.
63 *
64 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
65 * between several processors. the VAX version was done by
66 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
67 * version was done by Lance Berc, Mike Kupfer, Bob Baron,
68 * David Golub, and Richard Draves. the alpha version was
69 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
70 * (NetBSD/alpha).
71 */
72 /*
73 * PAE support
74 * Michael Shalayeff <mickey@lucifier.net>
75 *
76 * This module implements PAE mode for i386.
77 *
78 */
79
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/atomic.h>
83 #include <sys/pool.h>
84 #include <sys/user.h>
85 #include <sys/mutex.h>
86
87 #include <uvm/uvm.h>
88
89 #include <machine/specialreg.h>
90
91 #include <dev/isa/isareg.h>
92 #include <i386/isa/isa_machdep.h>
93
94 #include "ksyms.h"
95
96 /* #define PMAPAE_DEBUG */
97
98 #ifdef PMAPAE_DEBUG
99 #define DPRINTF(x...) do { printf(x); } while(0)
100 #else
101 #define DPRINTF(x...)
102 #endif /* PMAPAE_DEBUG */
103
104 /*
105 * this file contains the code for the "pmap module." the module's
106 * job is to manage the hardware's virtual to physical address mappings.
107 * note that there are two levels of mapping in the VM system:
108 *
109 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
110 * to map ranges of virtual address space to objects/files. for
111 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
112 * to the file /bin/ls starting at offset zero." note that
113 * the upper layer mapping is not concerned with how individual
114 * vm_pages are mapped.
115 *
116 * [2] the lower layer of the VM system (the pmap) maintains the mappings
117 * from virtual addresses. it is concerned with which vm_page is
118 * mapped where. for example, when you run /bin/ls and start
119 * at page 0x1000 the fault routine may lookup the correct page
120 * of the /bin/ls file and then ask the pmap layer to establish
121 * a mapping for it.
122 *
123 * note that information in the lower layer of the VM system can be
124 * thrown away since it can easily be reconstructed from the info
125 * in the upper layer.
126 *
127 * data structures we use include:
128 *
129 * - struct pmap: describes the address space of one thread
130 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
131 * - struct pv_head: there is one pv_head per managed page of
132 * physical memory. the pv_head points to a list of pv_entry
133 * structures which describe all the <PMAP,VA> pairs that this
134 * page is mapped in. this is critical for page based operations
135 * such as pmap_page_protect() [change protection on _all_ mappings
136 * of a page]
137 */
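/*
 * For example, a page-centric operation such as pmap_page_protect()
 * conceptually walks that chain like this (a minimal sketch only; the
 * real functions below also handle locking, wiring and PTP reference
 * counts):
 *
 *	struct pv_entry *pve;
 *
 *	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next)
 *		operate on the mapping <pve->pv_pmap, pve->pv_va>;
 */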
138 /*
139 * i386 PAE hardware Page Tables structure:
140 *
141 * the i386 PAE Page Table is a three-level PT which maps 4GB of VA.
142 * the pagesize is 4K (4096 [0x1000] bytes) or 2MB.
143 *
144 * the first level table is the "page directory pointer table" (PDPT) and
145 * consists of 4 page directory pointer table entries (PDPTEs), each 64 bits in size.
146 *
147 * the second level table is called a "page directory" and it contains
148 * 512 page directory entries (PDEs). each PDE is
149 * 8 bytes (a long long), so a PD fits in a single 4K page. this page is
150 * the page directory page (PDP). each PDE in a PDP maps 2MB of space,
151 * so a whole PDP maps 1GB (512 * 2MB = 1GB). a PDE contains the physical
152 * address of the third level table: the page table. or, if 2MB pages are
153 * being used, then the PDE contains the PA of the 2MB page being mapped.
154 *
155 * a page table consists of 512 page table entries (PTEs). each PTE is
156 * 8 bytes (a long long), so a page table also fits in a single 4K page.
157 * a 4K page being used as a page table is called a page table page (PTP).
158 * each PTE in a PTP maps one 4K page (512 * 4K = 2MB). a PTE contains
159 * the physical address of the page it maps and some flag bits (described
160 * below).
161 *
162 * the processor has a special register, "cr3", which points to the
163 * PDPT which is currently controlling the mappings of the virtual
164 * address space.
165 *
166 * the following picture shows the translation process for a 4K page:
167 *
168 * %cr3 register [PA of PDPT]
169 * |
170 * | bits <31-30> of VA
171 * | index the PDPTE (0-3)
172 * | |
173 * v v
174 * +-----------+
175 * | PDP Ptr |
176 * | 4 entries |
177 * +-----------+
178 * |
179 * PA of PDP
180 * |
181 * |
182 * | bits <29-21> of VA bits <20-12> of VA bits <11-0>
183 * | index the PDP (0 - 511) index the PTP are the page offset
184 * | | | |
185 * | v | |
186 * +-->+---------+ | |
187 * | PD Page | PA of v |
188 * | |-----PTP----->+------------+ |
189 * | 512 PDE | | page table |--PTE--+ |
190 * | entries | | (aka PTP) | | |
191 * +---------+ | 512 PTE | | |
192 * | entries | | |
193 * +------------+ | |
194 * | |
195 * bits <35-12> bits <11-0>
196 * p h y s i c a l a d d r
197 *
198 * the i386 caches PTEs in a TLB. it is important to flush out old
199 * TLB mappings when making a change to a mapping. writing to the
200 * %cr3 will flush the entire TLB. newer processors also have an
201 * instruction that will invalidate the mapping of a single page (which
202 * is useful if you are changing a single mapping because it preserves
203 * all the cached TLB entries).
204 *
205 * as shown above, bits 35-12 of the PTE contain the PA of the page being
206 * mapped. the rest of the PTE is defined as follows:
207 * bit# name use
208 * 63 NX no-execute bit (1=no execute), optional (needs NXE)
209 * 11 n/a available for OS use, hardware ignores it
210 * 10 n/a available for OS use, hardware ignores it
211 * 9 n/a available for OS use, hardware ignores it
212 * 8 G global bit (see discussion below)
213 * 7 PS page size [for PDEs] (0=4K, 1=2M <if supported>)
214 * 6 D dirty (modified) page
215 * 5 A accessed (referenced) page
216 * 4 PCD cache disable
217 * 3 PWT write-through caching (when set)
218 * 2 U/S user/supervisor bit (0=supervisor only, 1=both u&s)
219 * 1 R/W read/write bit (0=read only, 1=read-write)
220 * 0 P present (valid)
221 *
222 * notes:
223 * - on the i386 the R/W bit is ignored if processor is in supervisor
224 * state (bug!)
225 * - PS is only supported on newer processors
226 * - PTEs with the G bit are global in the sense that they are not
227 * flushed from the TLB when %cr3 is written (to flush, use the
228 * "flush single page" instruction). this is only supported on
229 * newer processors. this bit can be used to keep the kernel's
230 * TLB entries around while context switching. since the kernel
231 * is mapped into all processes at the same place it does not make
232 * sense to flush these entries when switching from one process'
233 * pmap to another.
234 */
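/*
 * A worked example of the translation above (a sketch only, not part
 * of the pmap code; the values follow from the shift/mask constants
 * defined later in this file):
 *
 *	VA = 0xd0201234
 *	PDPT index  = VA >> 30            = 3      (bits <31-30>)
 *	PD   index  = (VA >> 21) & 0x1ff  = 0x81   (bits <29-21>)
 *	PT   index  = (VA >> 12) & 0x1ff  = 0x1    (bits <20-12>)
 *	page offset = VA & 0xfff          = 0x234  (bits <11-0>)
 *
 * i.e. the MMU reads PDPTE#3 from the PDPT, follows it to a PD and
 * reads PDE#0x81, follows that to a PTP and reads PTE#1, and finally
 * adds the 0x234 byte offset to the PA stored in that PTE.
 */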
235 /*
236 * A pmap describes a process' 4GB virtual address space. This
237 * virtual address space can be broken up into 2048 2MB regions which
238 * are described by PDEs in the PDP. The PDEs are defined as follows:
239 *
240 * Ranges are inclusive -> exclusive, just like vm_map_entry start/end.
241 * The following assumes that KERNBASE is 0xd0000000.
242 *
243 * PDE#s VA range Usage
244 * 0->1660 0x0 -> 0xcf800000 user address space, note that the
245 * max user address is 0xcf7fe000
246 * the final two pages in the last 2MB
247 * used to be reserved for the UAREA
248 * but now are no longer used.
249 * 1660->1664 0xcf800000-> recursive mapping of PDP (used for
250 * 0xd0000000 linear mapping of PTPs).
251 * 1664->2044 0xd0000000-> kernel address space (constant
252 * 0xff800000 across all pmaps/processes).
253 * 2044->2048 0xff800000-> "alternate" recursive PDP mapping
254 * <end> (for other pmaps).
255 *
256 *
257 * Note: A recursive PDP mapping provides a way to map all the PTEs for
258 * a 4GB address space into a linear chunk of virtual memory. In other
259 * words, the PTE for page 0 is the first 8 bytes mapped into the 8MB
260 * recursive area. The PTE for page 1 is the second 8 bytes. The very last
261 * 8 bytes in the 8MB range is the PTE that maps VA 0xfffff000 (the last
262 * page in a 4GB address space).
263 *
264 * All pmaps' PDs must have the same values in slots 1664->2043 so that
265 * the kernel is always mapped in every process. These values are loaded
266 * into the PD at pmap creation time.
267 *
268 * At any one time only one pmap can be active on a processor. This is
269 * the pmap whose PDP is pointed to by processor register %cr3. This pmap
270 * will have all its PTEs mapped into memory at the recursive mapping
271 * point (slots #1660-3 as shown above). When the pmap code wants to find the
272 * PTE for a virtual address, all it has to do is the following:
273 *
274 * Address of PTE = (1660 * 2MB) + (VA / NBPG) * sizeof(pt_entry_t)
275 * = 0xcf800000 + (VA / 4096) * 8
276 *
277 * What happens if the pmap layer is asked to perform an operation
278 * on a pmap that is not the one which is currently active? In that
279 * case we take the PA of the PDP of the non-active pmap and put it in
280 * slots 2044-7 of the active pmap. This causes the non-active pmap's
281 * PTEs to get mapped in the final 8MB of the 4GB address space
282 * (i.e. starting at 0xff800000).
283 *
284 * The following figure shows the effects of the recursive PDP mapping:
285 *
286 * PDP (%cr3->PDPTP)
287 * +----+
288 * | 0| -> PTP#0 that maps VA 0x0 -> 0x200000
289 * | |
290 * | |
291 * |1660| -> points back to PDP (%cr3) mapping VA 0xcf800000 -> 0xd0000000
292 * |1661| (PDP is 4 pages)
293 * |1662|
294 * |1663|
295 * |1664| -> first kernel PTP (maps 0xd0000000 -> 0xd0200000)
296 * | |
297 * |2044| -> points to alternate pmap's PDP (maps 0xff800000 -> end)
298 * |2045|
299 * |2046|
300 * |2047|
301 * +----+
302 *
303 * Note that the PDE#1660 VA (0xcf800000) is defined as "PTE_BASE".
304 * Note that the PDE#2044 VA (0xff800000) is defined as "APTE_BASE".
305 *
306 * Starting at VA 0xcf800000 the currently active PDP (%cr3) acts as a
307 * PDPTP and references four consecutively mapped pages:
308 *
309 * PTP#1660-3 == PDP(%cr3) => maps VA 0xcf800000 -> 0xd0000000
310 * +----+
311 * | 0| -> maps the contents of PTP#0 at VA 0xcf800000->0xcf801000
312 * | |
313 * | |
314 * |1660| -> maps the contents of PTP#1660 (the PDP) at VA 0xcfe7c000
315 * |1661|
316 * |1662|
317 * |1663|
318 * |1664| -> maps the contents of first kernel PTP
319 * | |
320 * |2047|
321 * +----+
322 *
323 * Note that mapping of the PDP at PTP#1660's VA (0xcfe7c000) is
324 * defined as "PDP_BASE".... within that mapping there are two
325 * defines:
326 * "PDP_PDE" (0xcfe7f3e0) is the VA of the PDE in the PDP
327 * which points back to itself.
328 * "APDP_PDE" (0xfff02fe0) is the VA of the PDE in the PDP which
329 * establishes the recursive mapping of the alternate pmap.
330 * To set the alternate PDP, one just has to put the correct
331 * PA info in *APDP_PDE.
332 *
333 * Note that in the APTE_BASE space, the APDP appears at VA
334 * "APDP_BASE" (0xffffc000).
335 *
336 * unfortunately, we cannot use recursive PDPT from the page tables
337 * because cr3 is only 32 bits wide.
338 *
339 */
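/*
 * A quick numeric check of the "Address of PTE" formula above, for
 * VA 0xd0201000 (a sketch only, not code):
 *
 *	Address of PTE = 0xcf800000 + (0xd0201000 / 4096) * 8
 *	               = 0xcf800000 + 0xd0201 * 8
 *	               = 0xcfe81008
 *
 * which is exactly what the vtopte() macro defined below computes,
 * since vtopte(VA) == PTE_BASE + atop(VA).
 */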
340 #define PG_FRAME 0xffffff000ULL /* page frame mask */
341 #define PG_LGFRAME 0xfffe00000ULL /* large (2M) page frame mask */
342
343 /*
344 * Redefine the PDSHIFT and NBPD macros for PAE
345 */
346 #undef PDSHIFT
347 #define PDSHIFT 21 /* page directory address shift */
348 #undef NBPD
349 #define NBPD (1U << PDSHIFT) /* # bytes mapped by PD (2MB) */
350
351 #define PDSHIFT86 22 /* for pmap86 transfer */
352
353 #undef PDSLOT_PTE
354 #define PDSLOT_PTE (1660U) /* 1660: for recursive PDP map */
355 #undef PDSLOT_KERN
356 #define PDSLOT_KERN (1664U) /* 1664: start of kernel space */
357 #undef PDSLOT_APTE
358 #define PDSLOT_APTE (2044U) /* 2044: alternative recursive slot */
359
360 /*
361 * The following defines give the virtual addresses of various MMU
362 * data structures:
363 * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
364 * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
365 */
366 #define PTE_BASE ((pt_entry_t *) (PDSLOT_PTE * NBPD))
367 #define APTE_BASE ((pt_entry_t *) (PDSLOT_APTE * NBPD))
368 #define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * NBPG)))
369 #define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * NBPG)))
370 #define PDP_PDE (PDP_BASE + PDSLOT_PTE)
371 #define APDP_PDE (PDP_BASE + PDSLOT_APTE)
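
/*
 * Plugging the slot numbers into the macros above gives the constants
 * quoted in the big comment (arithmetic check only, not code):
 *
 *	PTE_BASE  = 1660 * 2MB              = 0xcf800000
 *	APTE_BASE = 2044 * 2MB              = 0xff800000
 *	PDP_BASE  = PTE_BASE  + 1660 * 4096 = 0xcfe7c000
 *	APDP_BASE = APTE_BASE + 2044 * 4096 = 0xffffc000
 *	PDP_PDE   = PDP_BASE  + 1660 * 8    = 0xcfe7f3e0
 *	APDP_PDE  = PDP_BASE  + 2044 * 8    = 0xcfe7ffe0
 */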
372
373 /*
374 * pdei/ptei: generate index into PDP/PTP from a VA
375 */
376 #define PD_MASK 0xffe00000 /* page directory address bits */
377 #define PT_MASK 0x001ff000 /* page table address bits */
378 #define pdei(VA) (((VA) & PD_MASK) >> PDSHIFT)
379 #define ptei(VA) (((VA) & PT_MASK) >> PGSHIFT)
380
381 #define PD_MASK86 0xffc00000 /* for pmap86 transfer */
382 #define PT_MASK86 0x003ff000 /* for pmap86 transfer */
383
384 /*
385 * Mach derived conversion macros
386 */
387 #define i386_round_pdr(x) ((((unsigned)(x)) + ~PD_MASK) & PD_MASK)
388
389 /*
390 * various address macros
391 *
392 * vtopte: return a pointer to the PTE mapping a VA
393 */
394 #define vtopte(VA) (PTE_BASE + atop((vaddr_t)VA))
395
396 /*
397 * PTP macros:
398 * A PTP's index is the PD index of the PDE that points to it.
399 * A PTP's offset is the byte-offset in the PTE space that this PTP is at.
400 * A PTP's VA is the first VA mapped by that PTP.
401 *
402 * Note that NBPG == number of bytes in a PTP (4096 bytes == 512 entries)
403 * NBPD == number of bytes a PTP can map (2MB)
404 */
405
406 #define ptp_i2o(I) ((I) * NBPG) /* index => offset */
407 #define ptp_o2i(O) ((O) / NBPG) /* offset => index */
408 #define ptp_i2v(I) ((I) * NBPD) /* index => VA */
409 #define ptp_v2i(V) ((V) / NBPD) /* VA => index (same as pdei) */
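
/*
 * Example, using the first kernel PDE slot (PDSLOT_KERN == 1664), as a
 * sketch only:
 *
 *	ptp_i2o(1664) = 1664 * 4096 = 0x680000   (offset in the PTE space)
 *	ptp_i2v(1664) = 1664 * 2MB  = 0xd0000000 (first VA mapped, KERNBASE)
 *	ptp_v2i(0xd0000000) = 1664               (back to the PD index)
 */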
410
411 /*
412 * Access PD and PT
413 */
414 #define PDE(pm,i) (((pd_entry_t *)(pm)->pm_pdir)[(i)])
415
416 /*
417 * here we define the data types for PDEs and PTEs for PAE
418 */
419 typedef u_int64_t pd_entry_t; /* PDE */
420 typedef u_int64_t pt_entry_t; /* PTE */
421
422 #define PG_NX 0x8000000000000000ULL /* execute-disable */
423
424 /*
425 * Number of PTEs per cache line. 8 byte pte, 64-byte cache line
426 * Used to avoid false sharing of cache lines.
427 */
428 #define NPTECL 8
429
430 /*
431 * other data structures
432 */
433
434 extern u_int32_t protection_codes[]; /* maps MI prot to i386 prot code */
435 extern int pmap_initialized; /* pmap_init done yet? */
436
437 /* Segment boundaries */
438 extern vaddr_t kernel_text, etext, __rodata_start, erodata, __data_start;
439 extern vaddr_t edata, __bss_start, end, ssym, esym, PTmap;
440
441 /*
442 * MULTIPROCESSOR: special VAs/ PTEs are actually allocated inside a
443 * MAXCPUS*NPTECL array of PTEs, to avoid cache line thrashing
444 * due to false sharing.
445 */
446
447 #ifdef MULTIPROCESSOR
448 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
449 #define VASLEW(va,id) ((va)+(id)*NPTECL*NBPG)
450 #else
451 #define PTESLEW(pte, id) (pte)
452 #define VASLEW(va,id) (va)
453 #endif
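
/*
 * With 8-byte PTEs and 64-byte cache lines (NPTECL == 8), the slewing
 * keeps each CPU's special PTEs on their own cache line.  For instance,
 * assuming cpu_number() == 2 (a sketch only):
 *
 *	PTESLEW(csrc_pte, 2)  == csrc_pte + 16          (128 bytes away)
 *	VASLEW(pmap_csrcp, 2) == pmap_csrcp + 16 * NBPG (the matching VA)
 */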
454
455 /*
456 * special VAs and the PTEs that map them
457 */
458
459 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *flsh_pte;
460 extern caddr_t pmap_csrcp, pmap_cdstp, pmap_zerop, pmap_ptpp, pmap_flshp;
461
462 extern int pmap_pg_g;
463 extern int pmap_pg_wc;
464 extern struct pmap_head pmaps;
465 extern struct mutex pmaps_lock;
466
467 extern uint32_t cpu_meltdown;
468
469 /*
470 * local prototypes
471 */
472 struct vm_page *pmap_alloc_ptp_pae(struct pmap *, int, pt_entry_t);
473 struct vm_page *pmap_get_ptp_pae(struct pmap *, int);
474 void pmap_drop_ptp_pae(struct pmap *, vaddr_t, struct vm_page *,
475 pt_entry_t *);
476 pt_entry_t *pmap_map_ptes_pae(struct pmap *);
477 void pmap_unmap_ptes_pae(struct pmap *);
478 void pmap_do_remove_pae(struct pmap *, vaddr_t, vaddr_t, int);
479 void pmap_remove_ptes_pae(struct pmap *, struct vm_page *,
480 vaddr_t, vaddr_t, vaddr_t, int, struct pv_entry **);
481 void pmap_sync_flags_pte_pae(struct vm_page *, pt_entry_t);
482
483 static __inline u_int
484 pmap_pte2flags(pt_entry_t pte)
485 {
486 return (((pte & PG_U) ? PG_PMAP_REF : 0) |
487 ((pte & PG_M) ? PG_PMAP_MOD : 0));
488 }
489
490 void
491 pmap_sync_flags_pte_pae(struct vm_page *pg, pt_entry_t pte)
492 {
493 if (pte & (PG_U|PG_M)) {
494 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
495 }
496 }
497
498 /*
499 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
500 *
501 * => we lock enough pmaps to keep things locked in
502 * => must be undone with pmap_unmap_ptes before returning
503 */
504
505 pt_entry_t *
506 pmap_map_ptes_pae(struct pmap *pmap)
507 {
508 pd_entry_t opde;
509
510 /* the kernel's pmap is always accessible */
511 if (pmap == pmap_kernel()) {
512 return(PTE_BASE);
513 }
514
515 mtx_enter(&pmap->pm_mtx);
516
517 /* if curpmap then we are always mapped */
518 if (pmap_is_curpmap(pmap)) {
519 return(PTE_BASE);
520 }
521
522 mtx_enter(&curcpu()->ci_curpmap->pm_apte_mtx);
523
524 /* need to load a new alternate pt space into curpmap? */
525 opde = *APDP_PDE;
526 #if defined(MULTIPROCESSOR) && defined(DIAGNOSTIC)
527 if (pmap_valid_entry(opde))
528 panic("pmap_map_ptes_pae: APTE valid");
529 #endif
530 if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdidx[0]) {
531 APDP_PDE[0] = pmap->pm_pdidx[0] | PG_RW | PG_V | PG_U | PG_M;
532 APDP_PDE[1] = pmap->pm_pdidx[1] | PG_RW | PG_V | PG_U | PG_M;
533 APDP_PDE[2] = pmap->pm_pdidx[2] | PG_RW | PG_V | PG_U | PG_M;
534 APDP_PDE[3] = pmap->pm_pdidx[3] | PG_RW | PG_V | PG_U | PG_M;
535 if (pmap_valid_entry(opde))
536 pmap_apte_flush();
537 }
538 return(APTE_BASE);
539 }
540
541 /*
542 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
543 */
544
545 void
546 pmap_unmap_ptes_pae(struct pmap *pmap)
547 {
548 if (pmap == pmap_kernel())
549 return;
550
551 if (!pmap_is_curpmap(pmap)) {
552 #if defined(MULTIPROCESSOR)
553 APDP_PDE[0] = 0;
554 APDP_PDE[1] = 0;
555 APDP_PDE[2] = 0;
556 APDP_PDE[3] = 0;
557 pmap_apte_flush();
558 #endif
559 mtx_leave(&curcpu()->ci_curpmap->pm_apte_mtx);
560 }
561
562 mtx_leave(&pmap->pm_mtx);
563 }
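
/*
 * Typical usage of the pair above (a minimal sketch; see e.g.
 * pmap_extract_pae() below for a real caller):
 *
 *	pt_entry_t *ptes;
 *
 *	ptes = pmap_map_ptes_pae(pmap);		// locks pmap
 *	... inspect or modify ptes[atop(va)] ...
 *	pmap_unmap_ptes_pae(pmap);		// unlocks pmap
 */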
564
565 u_int32_t
566 pmap_pte_set_pae(vaddr_t va, paddr_t pa, u_int32_t bits)
567 {
568 pt_entry_t pte, *ptep = vtopte(va);
569 uint64_t nx;
570
571 pa &= PMAP_PA_MASK;
572
573 if (bits & PG_X)
574 nx = 0;
575 else
576 nx = PG_NX;
577
578 pte = i386_atomic_testset_uq(ptep, pa | bits | nx); /* zap! */
579 return (pte & ~PG_FRAME);
580 }
581
582 u_int32_t
583 pmap_pte_setbits_pae(vaddr_t va, u_int32_t set, u_int32_t clr)
584 {
585 pt_entry_t *ptep = vtopte(va);
586 pt_entry_t pte = *ptep;
587
588 i386_atomic_testset_uq(ptep, (pte | set) & ~(pt_entry_t)clr);
589 return (pte & ~PG_FRAME);
590 }
591
592 u_int32_t
593 pmap_pte_bits_pae(vaddr_t va)
594 {
595 pt_entry_t *ptep = vtopte(va);
596
597 return (*ptep & ~PG_FRAME);
598 }
599
600 paddr_t
601 pmap_pte_paddr_pae(vaddr_t va)
602 {
603 pt_entry_t *ptep = vtopte(va);
604
605 return (*ptep & PG_FRAME);
606 }
607
608 /*
609 * Allocate a new PD for Intel's U-K.
610 */
611 void
612 pmap_alloc_pdir_intel_pae(struct pmap *pmap)
613 {
614 vaddr_t va;
615 int i;
616
617 KASSERT(pmap->pm_pdir_intel == 0);
618
619 va = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_zero, &kd_waitok);
620 if (va == 0)
621 panic("kernel_map out of virtual space");
622 pmap->pm_pdir_intel = va;
623 if (!pmap_extract(pmap_kernel(), (vaddr_t)&pmap->pm_pdidx_intel,
624 &pmap->pm_pdirpa_intel))
625 panic("can't locate PDPT");
626
627 for (i = 0; i < 4; i++) {
628 pmap->pm_pdidx_intel[i] = 0;
629 if (!pmap_extract(pmap, va + i * NBPG,
630 (paddr_t *)&pmap->pm_pdidx_intel[i]))
631 panic("can't locate PD page");
632
633 pmap->pm_pdidx_intel[i] |= PG_V;
634
635 DPRINTF("%s: pm_pdidx_intel[%d] = 0x%llx\n", __func__,
636 i, pmap->pm_pdidx_intel[i]);
637 }
638 }
639
640 /*
641 * Switch over to PAE page tables
642 */
643 void
644 pmap_bootstrap_pae(void)
645 {
646 extern int nkpde;
647 struct pmap *kpm = pmap_kernel();
648 struct vm_page *ptp;
649 paddr_t ptaddr;
650 u_int32_t bits, *pd = NULL;
651 vaddr_t va, eva;
652 pt_entry_t pte;
653
654 if ((cpu_feature & CPUID_PAE) == 0 ||
655 (ecpu_feature & CPUID_NXE) == 0)
656 return;
657
658 cpu_pae = 1;
659
660 DPRINTF("%s: pm_pdir 0x%x pm_pdirpa 0x%x pm_pdirsize %d\n", __func__,
661 (uint32_t)kpm->pm_pdir, (uint32_t)kpm->pm_pdirpa,
662 kpm->pm_pdirsize);
663
664 va = (vaddr_t)kpm->pm_pdir;
665 kpm->pm_pdidx[0] = (va + 0*NBPG - KERNBASE) | PG_V;
666 kpm->pm_pdidx[1] = (va + 1*NBPG - KERNBASE) | PG_V;
667 kpm->pm_pdidx[2] = (va + 2*NBPG - KERNBASE) | PG_V;
668 kpm->pm_pdidx[3] = (va + 3*NBPG - KERNBASE) | PG_V;
669 /* map pde recursively into itself */
670 PDE(kpm, PDSLOT_PTE+0) = kpm->pm_pdidx[0] | PG_KW | PG_M | PG_U;
671 PDE(kpm, PDSLOT_PTE+1) = kpm->pm_pdidx[1] | PG_KW | PG_M | PG_U;
672 PDE(kpm, PDSLOT_PTE+2) = kpm->pm_pdidx[2] | PG_KW | PG_M | PG_U;
673 PDE(kpm, PDSLOT_PTE+3) = kpm->pm_pdidx[3] | PG_KW | PG_M | PG_U;
674
675 /* allocate new special PD before transferring all mappings. */
676 if (kpm->pm_pdir_intel) {
677 pd = (uint32_t *)kpm->pm_pdir_intel;
678 kpm->pm_pdir_intel = kpm->pm_pdirpa_intel = 0;
679 pmap_alloc_pdir_intel_pae(kpm);
680 }
681
682 /* transfer all kernel mappings over into pae tables */
683 for (va = KERNBASE, eva = va + (nkpde << PDSHIFT86);
684 va < eva; va += PAGE_SIZE) {
685 if (!pmap_valid_entry(PDE(kpm, pdei(va)))) {
686 ptp = uvm_pagealloc(&kpm->pm_obj, va, NULL,
687 UVM_PGA_ZERO);
688 if (ptp == NULL)
689 panic("%s: uvm_pagealloc() failed", __func__);
690 ptaddr = VM_PAGE_TO_PHYS(ptp);
691 PDE(kpm, pdei(va)) = ptaddr | PG_KW | PG_V |
692 PG_U | PG_M;
693 pmap_pte_set_86((vaddr_t)vtopte(va),
694 ptaddr, PG_KW | PG_V | PG_U | PG_M);
695
696 /* count PTP as resident */
697 kpm->pm_stats.resident_count++;
698 }
699 bits = pmap_pte_bits_86(va) | pmap_pg_g;
700
701 /*
702 * At this point, ideally only kernel text should be executable.
703 * However, we need to leave the ISA hole executable to handle
704 * bios32, pcibios, and apmbios calls that may potentially
705 * happen later since we don't know (yet) which of those may be
706 * in use. Later (in biosattach), we will reset the permissions
707 * according to what we actually need.
708 */
709 if ((va >= (vaddr_t)&kernel_text && va <= (vaddr_t)&etext) ||
710 (va >= (vaddr_t)atdevbase && va <=
711 (vaddr_t)(atdevbase + IOM_SIZE)))
712 bits |= PG_X;
713 else
714 bits &= ~PG_X;
715
716 if (pmap_valid_entry(bits))
717 pmap_pte_set_pae(va, pmap_pte_paddr_86(va), bits);
718 }
719
720 /* Transfer special mappings */
721 if (pd) {
722 uint32_t *ptp;
723 uint32_t l1idx, l2idx;
724 paddr_t npa;
725 struct vm_page *ptppg;
726
727 for (va = KERNBASE, eva = va + (nkpde << PDSHIFT86); va < eva;
728 va += PAGE_SIZE) {
729 l1idx = ((va & PT_MASK86) >> PGSHIFT);
730 l2idx = ((va & PD_MASK86) >> PDSHIFT86);
731
732 if (!pmap_valid_entry(pd[l2idx]))
733 continue;
734
735 npa = pd[l2idx] & PMAP_PA_MASK;
736 ptppg = PHYS_TO_VM_PAGE(npa);
737 mtx_enter(&ptppg->mdpage.pv_mtx);
738
739 /* still running on pmap86 */
740 ptp = (uint32_t *)pmap_tmpmap_pa_86(npa);
741
742 if (!pmap_valid_entry(ptp[l1idx])) {
743 mtx_leave(&ptppg->mdpage.pv_mtx);
744 pmap_tmpunmap_pa_86();
745 continue;
746 }
747 DPRINTF("%s: va 0x%x l2idx %u 0x%x lx1idx %u 0x%x\n",
748 __func__, (uint32_t)va, l2idx, (uint32_t)pd[l2idx],
749 l1idx, (uint32_t)ptp[l1idx]);
750
751 /* protection and cacheability */
752 bits = ptp[l1idx] & (PG_PROT|PG_N|PG_WT);
753 npa = ptp[l1idx] & PMAP_PA_MASK;
754
755 /* still running on pmap86 */
756 pmap_tmpunmap_pa_86();
757 mtx_leave(&ptppg->mdpage.pv_mtx);
758
759 /* enforce use of pmap86 */
760 cpu_pae = 0;
761 pmap_enter_special_pae(va, npa, 0, bits);
762 cpu_pae = 1;
763
764 if (--ptppg->wire_count == 1) {
765 ptppg->wire_count = 0;
766 uvm_pagerealloc(ptppg, NULL, 0);
767 DPRINTF("%s: freeing PT page 0x%x\n", __func__,
768 (uint32_t)VM_PAGE_TO_PHYS(ptppg));
769 }
770 }
771 km_free(pd, NBPG, &kv_any, &kp_dirty);
772 DPRINTF("%s: freeing PDP 0x%x\n", __func__, (uint32_t)pd);
773 }
774
775 if (!cpu_paenable(&kpm->pm_pdidx[0])) {
776 extern struct user *proc0paddr;
777
778 proc0paddr->u_pcb.pcb_cr3 = kpm->pm_pdirpa =
779 (vaddr_t)kpm - KERNBASE;
780 kpm->pm_pdirsize = 4 * NBPG;
781
782 /* Reset cr3 for NMI task switch */
783 cpu_update_nmi_cr3(kpm->pm_pdirpa);
784
785 DPRINTF("%s: pm_pdir 0x%x pm_pdirpa 0x%x pm_pdirsize %d\n",
786 __func__, (uint32_t)kpm->pm_pdir, (uint32_t)kpm->pm_pdirpa,
787 kpm->pm_pdirsize);
788
789 csrc_pte = vtopte(pmap_csrcp);
790 cdst_pte = vtopte(pmap_cdstp);
791 zero_pte = vtopte(pmap_zerop);
792 ptp_pte = vtopte(pmap_ptpp);
793 flsh_pte = vtopte(pmap_flshp);
794
795 nkpde *= 2;
796 nkptp_max = 2048 - PDSLOT_KERN - 4;
797
798 pmap_pte_set_p = pmap_pte_set_pae;
799 pmap_pte_setbits_p = pmap_pte_setbits_pae;
800 pmap_pte_bits_p = pmap_pte_bits_pae;
801 pmap_pte_paddr_p = pmap_pte_paddr_pae;
802 pmap_clear_attrs_p = pmap_clear_attrs_pae;
803 pmap_enter_p = pmap_enter_pae;
804 pmap_enter_special_p = pmap_enter_special_pae;
805 pmap_extract_p = pmap_extract_pae;
806 pmap_growkernel_p = pmap_growkernel_pae;
807 pmap_page_remove_p = pmap_page_remove_pae;
808 pmap_do_remove_p = pmap_do_remove_pae;
809 pmap_test_attrs_p = pmap_test_attrs_pae;
810 pmap_unwire_p = pmap_unwire_pae;
811 pmap_write_protect_p = pmap_write_protect_pae;
812 pmap_pinit_pd_p = pmap_pinit_pd_pae;
813 pmap_zero_phys_p = pmap_zero_phys_pae;
814 pmap_copy_page_p = pmap_copy_page_pae;
815
816 bzero((void *)kpm->pm_pdir + 8, (PDSLOT_PTE-1) * 8);
817 /* TODO also reclaim old PDPs */
818 }
819
820 /* Set region permissions */
821 for (va = (vaddr_t)&PTmap; va < KERNBASE; va += NBPD) {
822 pte = PDE(kpm, pdei(va));
823 PDE(kpm, pdei(va)) = pte | PG_NX;
824 }
825
826 va = (vaddr_t)APTE_BASE;
827 pte = PDE(kpm, pdei(va));
828 PDE(kpm, pdei(va)) = pte | PG_NX;
829
830 pmap_write_protect(kpm, (vaddr_t)&kernel_text, (vaddr_t)&etext,
831 PROT_READ | PROT_EXEC);
832 pmap_write_protect(kpm, (vaddr_t)&__rodata_start,
833 (vaddr_t)&erodata, PROT_READ);
834 pmap_write_protect(kpm, (vaddr_t)&__data_start, (vaddr_t)&edata,
835 PROT_READ | PROT_WRITE);
836 pmap_write_protect(kpm, (vaddr_t)&__bss_start, (vaddr_t)&end,
837 PROT_READ | PROT_WRITE);
838
839 #if defined(DDB) || NKSYMS > 0
840 pmap_write_protect(kpm, ssym, esym, PROT_READ);
841 #endif
842 }
843
844 /*
845 * p t p f u n c t i o n s
846 */
847
848 /*
849 * pmap_alloc_ptp: allocate a PTP for a PMAP
850 *
851 * => pmap should already be locked by caller
852 * => we use the ptp's wire_count to count the number of active mappings
853 * in the PTP (we start it at one to prevent any chance this PTP
854 * will ever leak onto the active/inactive queues)
855 * => we should not be holding any pv_head locks (in case we are forced
856 * to call pmap_steal_ptp())
857 * => we may need to lock pv_head's if we have to steal a PTP
858 */
859
860 struct vm_page *
861 pmap_alloc_ptp_pae(struct pmap *pmap, int pde_index, pt_entry_t pde_flags)
862 {
863 struct vm_page *ptp;
864 pd_entry_t *pva_intel;
865
866 ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
867 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
868 if (ptp == NULL)
869 return (NULL);
870
871 /* got one! */
872 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
873 ptp->wire_count = 1; /* no mappings yet */
874 PDE(pmap, pde_index) = (pd_entry_t)(VM_PAGE_TO_PHYS(ptp) |
875 PG_RW | PG_V | PG_M | PG_U | pde_flags);
876
877 /*
878 * Meltdown special case - if we are adding a new PDE for
879 * usermode addresses, just copy the PDE to the U-K
880 * table.
881 */
882 if (pmap->pm_pdir_intel && ptp_i2v(pde_index) < VM_MAXUSER_ADDRESS) {
883 pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
884 pva_intel[pde_index] = PDE(pmap, pde_index);
885 DPRINTF("%s: copying usermode PDE (content=0x%llx) pde_index "
886 "%d from 0x%llx -> 0x%llx\n", __func__,
887 PDE(pmap, pde_index), pde_index,
888 (uint64_t)&PDE(pmap, pde_index),
889 (uint64_t)&(pva_intel[pde_index]));
890 }
891
892 pmap->pm_stats.resident_count++; /* count PTP as resident */
893 pmap->pm_ptphint = ptp;
894 return(ptp);
895 }
896
897 /*
898 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
899 *
900 * => pmap should NOT be pmap_kernel()
901 * => pmap should be locked
902 */
903
904 struct vm_page *
905 pmap_get_ptp_pae(struct pmap *pmap, int pde_index)
906 {
907 struct vm_page *ptp;
908
909 if (pmap_valid_entry(PDE(pmap, pde_index))) {
910 /* valid... check hint (saves us a PA->PG lookup) */
911 if (pmap->pm_ptphint &&
912 (PDE(pmap, pde_index) & PG_FRAME) ==
913 VM_PAGE_TO_PHYS(pmap->pm_ptphint))
914 return(pmap->pm_ptphint);
915
916 ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
917 #ifdef DIAGNOSTIC
918 if (ptp == NULL)
919 panic("pmap_get_ptp_pae: unmanaged user PTP");
920 #endif
921 pmap->pm_ptphint = ptp;
922 return(ptp);
923 }
924
925 /* allocate a new PTP (updates ptphint) */
926 return (pmap_alloc_ptp_pae(pmap, pde_index, PG_u));
927 }
928
929 void
930 pmap_drop_ptp_pae(struct pmap *pm, vaddr_t va, struct vm_page *ptp,
931 pt_entry_t *ptes)
932 {
933 pd_entry_t *pva_intel;
934
935 i386_atomic_testset_uq(&PDE(pm, pdei(va)), 0);
936 pmap_tlb_shootpage(curcpu()->ci_curpmap, ((vaddr_t)ptes) + ptp->offset);
937 #ifdef MULTIPROCESSOR
938 /*
939 * Always shoot down the other pmap's
940 * self-mapping of the PTP.
941 */
942 pmap_tlb_shootpage(pm, ((vaddr_t)PTE_BASE) + ptp->offset);
943 #endif
944 pm->pm_stats.resident_count--;
945 /* update hint */
946 if (pm->pm_ptphint == ptp)
947 pm->pm_ptphint = RBT_ROOT(uvm_objtree, &pm->pm_obj.memt);
948 ptp->wire_count = 0;
949 /* Postpone free to after shootdown. */
950 uvm_pagerealloc(ptp, NULL, 0);
951
952 if (pm->pm_pdir_intel) {
953 KASSERT(va < VM_MAXUSER_ADDRESS);
954 /* Zap special meltdown PDE */
955 pva_intel = (pd_entry_t *)pm->pm_pdir_intel;
956 i386_atomic_testset_uq(&pva_intel[pdei(va)], 0);
957 DPRINTF("%s: cleared meltdown PDE @ index %lu "
958 "(va range start 0x%x)\n", __func__, pdei(va),
959 (uint32_t)va);
960 }
961 }
962
963 /*
964 * pmap_pinit_pd: given a freshly allocated pmap structure, give it a PD
965 */
966 void
967 pmap_pinit_pd_pae(struct pmap *pmap)
968 {
969 extern int nkpde;
970 vaddr_t va;
971 paddr_t pdidx[4];
972
973 /* allocate PDP */
974 pmap->pm_pdir = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_dirty,
975 &kd_waitok);
976 if (pmap->pm_pdir == 0)
977 panic("kernel_map out of virtual space");
978 /* page index is in the pmap! */
979 pmap_extract(pmap_kernel(), (vaddr_t)pmap, &pmap->pm_pdirpa);
980 va = (vaddr_t)pmap->pm_pdir;
981 pmap_extract(pmap_kernel(), va + 0*NBPG, &pdidx[0]);
982 pmap_extract(pmap_kernel(), va + 1*NBPG, &pdidx[1]);
983 pmap_extract(pmap_kernel(), va + 2*NBPG, &pdidx[2]);
984 pmap_extract(pmap_kernel(), va + 3*NBPG, &pdidx[3]);
985 pmap->pm_pdidx[0] = (uint64_t)pdidx[0];
986 pmap->pm_pdidx[1] = (uint64_t)pdidx[1];
987 pmap->pm_pdidx[2] = (uint64_t)pdidx[2];
988 pmap->pm_pdidx[3] = (uint64_t)pdidx[3];
989 pmap->pm_pdidx[0] |= PG_V;
990 pmap->pm_pdidx[1] |= PG_V;
991 pmap->pm_pdidx[2] |= PG_V;
992 pmap->pm_pdidx[3] |= PG_V;
993 pmap->pm_pdirsize = 4 * NBPG;
994
995 /* init PDP */
996 /* zero init area */
997 bzero((void *)pmap->pm_pdir, PDSLOT_PTE * sizeof(pd_entry_t));
998 /* put in recursive PDE to map the PTEs */
999 PDE(pmap, PDSLOT_PTE+0) = pmap->pm_pdidx[0] | PG_KW | PG_U |
1000 PG_M | PG_V | PG_NX;
1001 PDE(pmap, PDSLOT_PTE+1) = pmap->pm_pdidx[1] | PG_KW | PG_U |
1002 PG_M | PG_V | PG_NX;
1003 PDE(pmap, PDSLOT_PTE+2) = pmap->pm_pdidx[2] | PG_KW | PG_U |
1004 PG_M | PG_V | PG_NX;
1005 PDE(pmap, PDSLOT_PTE+3) = pmap->pm_pdidx[3] | PG_KW | PG_U |
1006 PG_M | PG_V | PG_NX;
1007
1008 /*
1009 * we need to lock pmaps_lock to prevent nkpde from changing on
1010 * us. note that there is no need to splvm to protect us from
1011 * malloc since malloc allocates out of a submap and we should have
1012 * already allocated kernel PTPs to cover the range...
1013 */
1014 /* put in kernel VM PDEs */
1015 bcopy(&PDP_BASE[PDSLOT_KERN], &PDE(pmap, PDSLOT_KERN),
1016 nkpde * sizeof(pd_entry_t));
1017 /* zero the rest */
1018 bzero(&PDE(pmap, PDSLOT_KERN + nkpde), pmap->pm_pdirsize -
1019 ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
1020
1021 /*
1022 * Intel CPUs need a special page table to be used during usermode
1023 * execution, one that lacks all kernel mappings.
1024 */
1025 if (cpu_meltdown) {
1026 pmap_alloc_pdir_intel_pae(pmap);
1027
1028 /* Copy PDEs from pmap_kernel's U-K view */
1029 bcopy((void *)pmap_kernel()->pm_pdir_intel,
1030 (void *)pmap->pm_pdir_intel, 4 * NBPG);
1031
1032 DPRINTF("%s: pmap %p pm_pdir 0x%lx pm_pdirpa 0x%lx "
1033 "pdir_intel 0x%lx pdirpa_intel 0x%lx\n",
1034 __func__, pmap, pmap->pm_pdir, pmap->pm_pdirpa,
1035 pmap->pm_pdir_intel, pmap->pm_pdirpa_intel);
1036 }
1037
1038 mtx_enter(&pmaps_lock);
1039 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1040 mtx_leave(&pmaps_lock);
1041 }
1042
1043 /*
1044 * some misc. functions
1045 */
1046
1047 /*
1048 * pmap_extract: extract a PA for the given VA
1049 */
1050
1051 int
1052 pmap_extract_pae(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1053 {
1054 pt_entry_t *ptes, pte;
1055
1056 ptes = pmap_map_ptes_pae(pmap);
1057 if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1058 pte = ptes[atop(va)];
1059 pmap_unmap_ptes_pae(pmap);
1060 if (!pmap_valid_entry(pte))
1061 return 0;
1062 if (pap != NULL)
1063 *pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
1064 return 1;
1065 }
1066 pmap_unmap_ptes_pae(pmap);
1067 return 0;
1068 }
1069
1070 extern void (*pagezero)(void *, size_t);
1071
1072 /*
1073 * pmap_zero_phys: same as pmap_zero_page, but for use before vm_pages are
1074 * initialized.
1075 */
1076 void
1077 pmap_zero_phys_pae(paddr_t pa)
1078 {
1079 #ifdef MULTIPROCESSOR
1080 int id = cpu_number();
1081 #endif
1082 pt_entry_t *zpte = PTESLEW(zero_pte, id);
1083 caddr_t zerova = VASLEW(pmap_zerop, id);
1084
1085 #ifdef DIAGNOSTIC
1086 if (*zpte)
1087 panic("pmap_zero_phys_pae: lock botch");
1088 #endif
1089
1090 *zpte = (pa & PG_FRAME) | PG_V | PG_RW; /* map in */
1091 pmap_update_pg((vaddr_t)zerova); /* flush TLB */
1092 pagezero(zerova, PAGE_SIZE); /* zero */
1093 *zpte = 0;
1094 }
1095
1096 /*
1097 * pmap_copy_page: copy a page
1098 */
1099
1100 void
1101 pmap_copy_page_pae(struct vm_page *srcpg, struct vm_page *dstpg)
1102 {
1103 paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg);
1104 paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg);
1105 #ifdef MULTIPROCESSOR
1106 int id = cpu_number();
1107 #endif
1108 pt_entry_t *spte = PTESLEW(csrc_pte, id);
1109 pt_entry_t *dpte = PTESLEW(cdst_pte, id);
1110 caddr_t csrcva = VASLEW(pmap_csrcp, id);
1111 caddr_t cdstva = VASLEW(pmap_cdstp, id);
1112
1113 #ifdef DIAGNOSTIC
1114 if (*spte || *dpte)
1115 panic("pmap_copy_page_pae: lock botch");
1116 #endif
1117
1118 *spte = (srcpa & PG_FRAME) | PG_V | PG_RW;
1119 *dpte = (dstpa & PG_FRAME) | PG_V | PG_RW;
1120 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1121 bcopy(csrcva, cdstva, PAGE_SIZE);
1122 *spte = *dpte = 0;
1123 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1124 }
1125
1126 /*
1127 * p m a p r e m o v e f u n c t i o n s
1128 *
1129 * functions that remove mappings
1130 */
1131
1132 /*
1133 * pmap_remove_ptes: remove PTEs from a PTP
1134 *
1135 * => caller must hold pmap's lock
1136 * => PTP must be mapped into KVA
1137 * => PTP should be null if pmap == pmap_kernel()
1138 */
1139
1140 void
1141 pmap_remove_ptes_pae(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1142 vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1143 {
1144 struct pv_entry *pve;
1145 pt_entry_t *pte = (pt_entry_t *) ptpva;
1146 struct vm_page *pg;
1147 pt_entry_t opte;
1148
1149 /*
1150 * note that ptpva points to the PTE that maps startva. this may
1151 * or may not be the first PTE in the PTP.
1152 *
1153 * we loop through the PTP while there are still PTEs to look at
1154 * and the wire_count is greater than 1 (because we use the wire_count
1155 * to keep track of the number of real PTEs in the PTP).
1156 */
1157
1158 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1159 ; pte++, startva += NBPG) {
1160 if (!pmap_valid_entry(*pte))
1161 continue; /* VA not mapped */
1162
1163 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W))
1164 continue;
1165
1166 /* atomically save the old PTE and zero it */
1167 opte = i386_atomic_testset_uq(pte, 0);
1168
1169 if (opte & PG_W)
1170 pmap->pm_stats.wired_count--;
1171 pmap->pm_stats.resident_count--;
1172
1173 if (ptp)
1174 ptp->wire_count--; /* dropping a PTE */
1175
1176 /*
1177 * Unnecessary work if not PG_PVLIST.
1178 */
1179 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1180
1181 /*
1182 * if we are not on a pv list we are done.
1183 */
1184 if ((opte & PG_PVLIST) == 0) {
1185 #ifdef DIAGNOSTIC
1186 if (pg != NULL)
1187 panic("pmap_remove_ptes_pae: managed page "
1188 "without PG_PVLIST for 0x%lx", startva);
1189 #endif
1190 continue;
1191 }
1192
1193 #ifdef DIAGNOSTIC
1194 if (pg == NULL)
1195 panic("pmap_remove_ptes_pae: unmanaged page marked "
1196 "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
1197 startva, (u_long)(opte & PG_FRAME));
1198 #endif
1199
1200 /* sync R/M bits */
1201 pmap_sync_flags_pte_pae(pg, opte);
1202 pve = pmap_remove_pv(pg, pmap, startva);
1203 if (pve) {
1204 pve->pv_next = *free_pvs;
1205 *free_pvs = pve;
1206 }
1207
1208 /* end of "for" loop: time for next pte */
1209 }
1210 }
1211
1212 /*
1213 * pmap_remove: top level mapping removal function
1214 *
1215 * => caller should not be holding any pmap locks
1216 */
1217
1218 void
1219 pmap_do_remove_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1220 {
1221 pt_entry_t *ptes;
1222 paddr_t ptppa;
1223 vaddr_t blkendva;
1224 struct vm_page *ptp;
1225 struct pv_entry *pve;
1226 struct pv_entry *free_pvs = NULL;
1227 TAILQ_HEAD(, vm_page) empty_ptps;
1228 int shootall;
1229 vaddr_t va;
1230
1231 TAILQ_INIT(&empty_ptps);
1232
1233 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
1234
1235 /*
1236 * Decide if we want to shoot the whole tlb or just the range.
1237 * Right now, we simply shoot everything when we remove more
1238 * than 32 pages, but never in the kernel pmap. XXX - tune.
1239 */
1240 if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1241 shootall = 1;
1242 else
1243 shootall = 0;
1244
1245 for (va = sva ; va < eva ; va = blkendva) {
1246 /* determine range of block */
1247 blkendva = i386_round_pdr(va + 1);
1248 if (blkendva > eva)
1249 blkendva = eva;
1250
1251 /*
1252 * XXXCDC: our PTE mappings should never be removed
1253 * with pmap_remove! if we allow this (and why would
1254 * we?) then we end up freeing the pmap's page
1255 * directory page (PDP) before we are finished using
1256 * it when we hit it in the recursive mapping. this
1257 * is BAD.
1258 *
1259 * long term solution is to move the PTEs out of user
1260 * address space. and into kernel address space (up
1261 * with APTE). then we can set VM_MAXUSER_ADDRESS to
1262 * be VM_MAX_ADDRESS.
1263 */
1264
1265 if (pdei(va) >= PDSLOT_PTE && pdei(va) <= (PDSLOT_PTE + 3))
1266 /* XXXCDC: ugly hack to avoid freeing PDP here */
1267 continue;
1268
1269 if (!pmap_valid_entry(PDE(pmap, pdei(va))))
1270 /* valid block? */
1271 continue;
1272
1273 /* PA of the PTP */
1274 ptppa = PDE(pmap, pdei(va)) & PG_FRAME;
1275
1276 /* get PTP if non-kernel mapping */
1277 if (pmap == pmap_kernel()) {
1278 /* we never free kernel PTPs */
1279 ptp = NULL;
1280 } else {
1281 if (pmap->pm_ptphint &&
1282 VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
1283 ptp = pmap->pm_ptphint;
1284 } else {
1285 ptp = PHYS_TO_VM_PAGE(ptppa);
1286 #ifdef DIAGNOSTIC
1287 if (ptp == NULL)
1288 panic("pmap_do_remove_pae: unmanaged "
1289 "PTP detected");
1290 #endif
1291 }
1292 }
1293
1294 pmap_remove_ptes_pae(pmap, ptp, (vaddr_t)&ptes[atop(va)],
1295 va, blkendva, flags, &free_pvs);
1296
1297 /* If PTP is no longer being used, free it. */
1298 if (ptp && ptp->wire_count <= 1) {
1299 pmap_drop_ptp_pae(pmap, va, ptp, ptes);
1300 TAILQ_INSERT_TAIL(&empty_ptps, ptp, pageq);
1301 }
1302
1303 if (!shootall)
1304 pmap_tlb_shootrange(pmap, va, blkendva);
1305 }
1306
1307 if (shootall)
1308 pmap_tlb_shoottlb();
1309
1310 pmap_unmap_ptes_pae(pmap);
1311 pmap_tlb_shootwait();
1312
1313 while ((pve = free_pvs) != NULL) {
1314 free_pvs = pve->pv_next;
1315 pool_put(&pmap_pv_pool, pve);
1316 }
1317
1318 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1319 TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1320 uvm_pagefree(ptp);
1321 }
1322 }
1323
1324 /*
1325 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1326 *
1327 * => R/M bits are sync'd back to attrs
1328 */
1329
1330 void
1331 pmap_page_remove_pae(struct vm_page *pg)
1332 {
1333 struct pv_entry *pve;
1334 struct pmap *pm;
1335 pt_entry_t *ptes, opte;
1336 TAILQ_HEAD(, vm_page) empty_ptps;
1337 struct vm_page *ptp;
1338
1339 if (pg->mdpage.pv_list == NULL)
1340 return;
1341
1342 TAILQ_INIT(&empty_ptps);
1343
1344 mtx_enter(&pg->mdpage.pv_mtx);
1345 while ((pve = pg->mdpage.pv_list) != NULL) {
1346 pmap_reference(pve->pv_pmap);
1347 pm = pve->pv_pmap;
1348 mtx_leave(&pg->mdpage.pv_mtx);
1349
1350 ptes = pmap_map_ptes_pae(pm); /* locks pmap */
1351
1352 /*
1353 * We dropped the pvlist lock before grabbing the pmap
1354 * lock to avoid lock ordering problems. This means
1355 * we have to check the pvlist again since somebody
1356 * else might have modified it. All we care about is
1357 * that the pvlist entry matches the pmap we just
1358 * locked. If it doesn't, unlock the pmap and try
1359 * again.
1360 */
1361 mtx_enter(&pg->mdpage.pv_mtx);
1362 if ((pve = pg->mdpage.pv_list) == NULL ||
1363 pve->pv_pmap != pm) {
1364 mtx_leave(&pg->mdpage.pv_mtx);
1365 pmap_unmap_ptes_pae(pm); /* unlocks pmap */
1366 pmap_destroy(pm);
1367 mtx_enter(&pg->mdpage.pv_mtx);
1368 continue;
1369 }
1370
1371 pg->mdpage.pv_list = pve->pv_next;
1372 mtx_leave(&pg->mdpage.pv_mtx);
1373
1374 #ifdef DIAGNOSTIC
1375 if (pve->pv_ptp && (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1376 PG_FRAME)
1377 != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1378 printf("pmap_page_remove_pae: pg=%p: va=%lx, "
1379 "pv_ptp=%p\n",
1380 pg, pve->pv_va, pve->pv_ptp);
1381 printf("pmap_page_remove_pae: PTP's phys addr: "
1382 "actual=%llx, recorded=%lx\n",
1383 (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1384 PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
1385 panic("pmap_page_remove_pae: mapped managed page has "
1386 "invalid pv_ptp field");
1387 }
1388 #endif
1389 opte = i386_atomic_testset_uq(&ptes[atop(pve->pv_va)], 0);
1390
1391 if (opte & PG_W)
1392 pve->pv_pmap->pm_stats.wired_count--;
1393 pve->pv_pmap->pm_stats.resident_count--;
1394
1395 /* sync R/M bits */
1396 pmap_sync_flags_pte_pae(pg, opte);
1397
1398 /* update the PTP reference count. free if last reference. */
1399 if (pve->pv_ptp && --pve->pv_ptp->wire_count <= 1) {
1400 pmap_drop_ptp_pae(pve->pv_pmap, pve->pv_va,
1401 pve->pv_ptp, ptes);
1402 TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, pageq);
1403 }
1404
1405 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1406
1407 pmap_unmap_ptes_pae(pve->pv_pmap); /* unlocks pmap */
1408 pmap_destroy(pve->pv_pmap);
1409 pool_put(&pmap_pv_pool, pve);
1410 mtx_enter(&pg->mdpage.pv_mtx);
1411 }
1412 mtx_leave(&pg->mdpage.pv_mtx);
1413
1414 pmap_tlb_shootwait();
1415
1416 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1417 TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1418 uvm_pagefree(ptp);
1419 }
1420 }
1421
1422 /*
1423 * p m a p a t t r i b u t e f u n c t i o n s
1424 * functions that test/change managed page's attributes
1425 * since a page can be mapped multiple times we must check each PTE that
1426 * maps it by going down the pv lists.
1427 */
1428
1429 /*
1430 * pmap_test_attrs: test a page's attributes
1431 *
1432 * => we set pv_head => pmap locking
1433 */
1434
1435 int
1436 pmap_test_attrs_pae(struct vm_page *pg, int testbits)
1437 {
1438 struct pv_entry *pve;
1439 pt_entry_t *ptes, pte;
1440 u_long mybits, testflags;
1441 paddr_t ptppa;
1442
1443 testflags = pmap_pte2flags(testbits);
1444
1445 if (pg->pg_flags & testflags)
1446 return 1;
1447
1448 mybits = 0;
1449 mtx_enter(&pg->mdpage.pv_mtx);
1450 for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
1451 pve = pve->pv_next) {
1452 ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
1453 ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
1454 pte = ptes[ptei(pve->pv_va)];
1455 pmap_tmpunmap_pa();
1456 mybits |= (pte & testbits);
1457 }
1458 mtx_leave(&pg->mdpage.pv_mtx);
1459
1460 if (mybits == 0)
1461 return 0;
1462
1463 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
1464
1465 return 1;
1466 }
1467
1468 /*
1469 * pmap_clear_attrs: change a page's attributes
1470 *
1471 * => we return 1 if we cleared one of the bits we were asked to
1472 */
1473 int
1474 pmap_clear_attrs_pae(struct vm_page *pg, int clearbits)
1475 {
1476 struct pv_entry *pve;
1477 pt_entry_t *ptes, npte, opte;
1478 u_long clearflags;
1479 paddr_t ptppa;
1480 int result;
1481
1482 clearflags = pmap_pte2flags(clearbits);
1483
1484 result = pg->pg_flags & clearflags;
1485 if (result)
1486 atomic_clearbits_int(&pg->pg_flags, clearflags);
1487
1488 mtx_enter(&pg->mdpage.pv_mtx);
1489 for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
1490 ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
1491 ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
1492 #ifdef DIAGNOSTIC
1493 if (!pmap_valid_entry(PDE(pve->pv_pmap, pdei(pve->pv_va))))
1494 panic("pmap_clear_attrs_pae: mapping without PTP "
1495 "detected");
1496 #endif
1497
1498 opte = ptes[ptei(pve->pv_va)];
1499 if (opte & clearbits) {
1500 result = 1;
1501 npte = opte & ~clearbits;
1502 opte = i386_atomic_testset_uq(
1503 &ptes[ptei(pve->pv_va)], npte);
1504 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1505 }
1506 pmap_tmpunmap_pa();
1507 }
1508 mtx_leave(&pg->mdpage.pv_mtx);
1509
1510 pmap_tlb_shootwait();
1511
1512 return (result != 0);
1513 }
1514
1515
1516 /*
1517 * p m a p p r o t e c t i o n f u n c t i o n s
1518 */
1519
1520 /*
1521 * pmap_page_protect: change the protection of all recorded mappings
1522 * of a managed page
1523 *
1524 * => NOTE: this is an inline function in pmap.h
1525 */
1526
1527 /* see pmap.h */
1528
1529 /*
1530 * pmap_protect: set the protection of the pages in a pmap
1531 *
1532 * => NOTE: this is an inline function in pmap.h
1533 */
1534
1535 /* see pmap.h */
1536
1537 /*
1538 * pmap_write_protect: write-protect pages in a pmap
1539 */
1540
1541 void
1542 pmap_write_protect_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva,
1543 vm_prot_t prot)
1544 {
1545 pt_entry_t *ptes, *spte, *epte, npte, opte;
1546 vaddr_t blockend;
1547 u_int64_t md_prot;
1548 vaddr_t va;
1549 int shootall = 0;
1550
1551 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
1552
1553 /* should be ok, but just in case ... */
1554 sva &= PG_FRAME;
1555 eva &= PG_FRAME;
1556
1557 if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1558 shootall = 1;
1559
1560 for (va = sva; va < eva; va = blockend) {
1561 blockend = (va & PD_MASK) + NBPD;
1562 if (blockend > eva)
1563 blockend = eva;
1564
1565 /*
1566 * XXXCDC: our PTE mappings should never be write-protected!
1567 *
1568 * long term solution is to move the PTEs out of user
1569 * address space. and into kernel address space (up
1570 * with APTE). then we can set VM_MAXUSER_ADDRESS to
1571 * be VM_MAX_ADDRESS.
1572 */
1573
1574 /* XXXCDC: ugly hack to avoid freeing PDP here */
1575 if (pdei(va) >= PDSLOT_PTE && pdei(va) <= (PDSLOT_PTE + 3))
1576 continue;
1577
1578 /* empty block? */
1579 if (!pmap_valid_entry(PDE(pmap, pdei(va))))
1580 continue;
1581
1582 md_prot = protection_codes[prot];
1583 if (!(prot & PROT_EXEC))
1584 md_prot |= PG_NX;
1585 if (va < VM_MAXUSER_ADDRESS)
1586 md_prot |= PG_u;
1587 else if (va < VM_MAX_ADDRESS)
1588 /* XXX: write-prot our PTES? never! */
1589 md_prot |= PG_RW;
1590
1591 spte = &ptes[atop(va)];
1592 epte = &ptes[atop(blockend)];
1593
1594 for (/*null */; spte < epte ; spte++, va += PAGE_SIZE) {
1595
1596 if (!pmap_valid_entry(*spte)) /* no mapping? */
1597 continue;
1598
1599 opte = *spte;
1600 npte = (opte & ~(pt_entry_t)PG_PROT) | md_prot;
1601
1602 if (npte != opte) {
1603 pmap_exec_account(pmap, va, *spte, npte);
1604 i386_atomic_testset_uq(spte, npte);
1605 }
1606 }
1607 }
1608 if (shootall)
1609 pmap_tlb_shoottlb();
1610 else
1611 pmap_tlb_shootrange(pmap, sva, eva);
1612
1613 pmap_unmap_ptes_pae(pmap); /* unlocks pmap */
1614 pmap_tlb_shootwait();
1615 }
1616
1617 /*
1618 * end of protection functions
1619 */
1620
1621 /*
1622 * pmap_unwire: clear the wired bit in the PTE
1623 *
1624 * => mapping should already be in map
1625 */
1626
1627 void
1628 pmap_unwire_pae(struct pmap *pmap, vaddr_t va)
1629 {
1630 pt_entry_t *ptes;
1631
1632 if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1633 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
1634
1635 #ifdef DIAGNOSTIC
1636 if (!pmap_valid_entry(ptes[atop(va)]))
1637 panic("pmap_unwire_pae: invalid (unmapped) va "
1638 "0x%lx", va);
1639 #endif
1640 if ((ptes[atop(va)] & PG_W) != 0) {
1641 i386_atomic_testset_uq(&ptes[atop(va)],
1642 ptes[atop(va)] & ~PG_W);
1643 pmap->pm_stats.wired_count--;
1644 }
1645 #ifdef DIAGNOSTIC
1646 else {
1647 printf("pmap_unwire_pae: wiring for pmap %p va 0x%lx "
1648 "didn't change!\n", pmap, va);
1649 }
1650 #endif
1651 pmap_unmap_ptes_pae(pmap); /* unlocks map */
1652 }
1653 #ifdef DIAGNOSTIC
1654 else {
1655 panic("pmap_unwire_pae: invalid PDE");
1656 }
1657 #endif
1658 }
1659
1660 /*
1661 * pmap_enter: enter a mapping into a pmap
1662 *
1663 * => must be done "now" ... no lazy-evaluation
1664 */
1665
1666 int
1667 pmap_enter_pae(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
1668 int flags)
1669 {
1670 pt_entry_t *ptes, opte, npte;
1671 struct vm_page *ptp;
1672 struct pv_entry *pve, *opve = NULL;
1673 int wired = (flags & PMAP_WIRED) != 0;
1674 int nocache = (pa & PMAP_NOCACHE) != 0;
1675 int wc = (pa & PMAP_WC) != 0;
1676 struct vm_page *pg = NULL;
1677 int error, wired_count, resident_count, ptp_count;
1678
1679 KASSERT(!(wc && nocache));
1680 pa &= PMAP_PA_MASK; /* nuke flags from pa */
1681
1682 #ifdef DIAGNOSTIC
1683 /* sanity check: totally out of range? */
1684 if (va >= VM_MAX_KERNEL_ADDRESS)
1685 panic("pmap_enter_pae: too big");
1686
1687 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
1688 panic("pmap_enter_pae: trying to map over PDP/APDP!");
1689
1690 /* sanity check: kernel PTPs should already have been pre-allocated */
1691 if (va >= VM_MIN_KERNEL_ADDRESS &&
1692 !pmap_valid_entry(PDE(pmap, pdei(va))))
1693 panic("pmap_enter_pae: missing kernel PTP!");
1694 #endif
1695
1696 if (pmap_initialized)
1697 pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
1698 else
1699 pve = NULL;
1700 wired_count = resident_count = ptp_count = 0;
1701
1702 /*
1703 * map in ptes and get a pointer to our PTP (unless we are the kernel)
1704 */
1705
1706 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
1707 if (pmap == pmap_kernel()) {
1708 ptp = NULL;
1709 } else {
1710 ptp = pmap_get_ptp_pae(pmap, pdei(va));
1711 if (ptp == NULL) {
1712 if (flags & PMAP_CANFAIL) {
1713 error = ENOMEM;
1714 pmap_unmap_ptes_pae(pmap);
1715 goto out;
1716 }
1717 panic("pmap_enter_pae: get ptp failed");
1718 }
1719 }
1720 /*
1721 * not allowed to sleep after here!
1722 */
	opte = ptes[atop(va)];			/* old PTE */

	/*
	 * is there currently a valid mapping at our VA?
	 */

	if (pmap_valid_entry(opte)) {

		/*
		 * first, calculate pm_stats updates.  resident count will not
		 * change since we are replacing/changing a valid
		 * mapping.  wired count might change...
		 */

		if (wired && (opte & PG_W) == 0)
			wired_count++;
		else if (!wired && (opte & PG_W) != 0)
			wired_count--;

		/*
		 * is the currently mapped PA the same as the one we
		 * want to map?
		 */

		if ((opte & PG_FRAME) == pa) {

			/* if this is on the PVLIST, sync R/M bit */
			if (opte & PG_PVLIST) {
				pg = PHYS_TO_VM_PAGE(pa);
#ifdef DIAGNOSTIC
				if (pg == NULL)
					panic("pmap_enter_pae: same pa "
					    "PG_PVLIST mapping with "
					    "unmanaged page "
					    "pa = 0x%lx (0x%lx)", pa,
					    atop(pa));
#endif
				pmap_sync_flags_pte_pae(pg, opte);
			}
			goto enter_now;
		}

		/*
		 * changing PAs: we must remove the old one first
		 */

		/*
		 * if current mapping is on a pvlist,
		 * remove it (sync R/M bits)
		 */

		if (opte & PG_PVLIST) {
			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
#ifdef DIAGNOSTIC
			if (pg == NULL)
				panic("pmap_enter_pae: PG_PVLIST mapping with "
				    "unmanaged page "
				    "pa = 0x%lx (0x%lx)", pa, atop(pa));
#endif
			pmap_sync_flags_pte_pae(pg, opte);
			opve = pmap_remove_pv(pg, pmap, va);
			pg = NULL; /* This is not the page we are looking for */
		}
	} else {	/* opte not valid */
		resident_count++;
		if (wired)
			wired_count++;
		if (ptp)
			ptp_count++;	/* count # of valid entries */
	}

	/*
	 * pve is either NULL or points to a now-free pv_entry structure
	 * (the latter case is if we called pmap_remove_pv above).
	 *
	 * if this entry is to be on a pvlist, enter it now.
	 */

	if (pmap_initialized && pg == NULL)
		pg = PHYS_TO_VM_PAGE(pa);

	if (pg != NULL) {
		if (pve == NULL) {
			pve = opve;
			opve = NULL;
		}
		if (pve == NULL) {
			if (flags & PMAP_CANFAIL) {
				pmap_unmap_ptes_pae(pmap);
				error = ENOMEM;
				goto out;
			}
			panic("pmap_enter_pae: no pv entries available");
		}
		/* lock pg when adding */
		pmap_enter_pv(pg, pve, pmap, va, ptp);
		pve = NULL;
	}

enter_now:
	/*
	 * at this point pg is !NULL if we want the PG_PVLIST bit set
	 */

	npte = pa | protection_codes[prot] | PG_V;
	if (!(prot & PROT_EXEC))
		npte |= PG_NX;
	pmap_exec_account(pmap, va, opte, npte);
	if (wired)
		npte |= PG_W;
	if (nocache)
		npte |= PG_N;
	if (va < VM_MAXUSER_ADDRESS)
		npte |= PG_u;
	else if (va < VM_MAX_ADDRESS)
		npte |= PG_RW;	/* XXXCDC: no longer needed? */
	if (pmap == pmap_kernel())
		npte |= pmap_pg_g;
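	/*
	 * flags carries the access type that is causing this mapping
	 * to be entered, so preset the accessed (PG_U) and modified
	 * (PG_M) bits to match.
	 */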
	if (flags & PROT_READ)
		npte |= PG_U;
	if (flags & PROT_WRITE)
		npte |= PG_M;
	if (pg) {
		npte |= PG_PVLIST;
		if (pg->pg_flags & PG_PMAP_WC) {
			KASSERT(nocache == 0);
			wc = 1;
		}
		pmap_sync_flags_pte_pae(pg, npte);
	}
	if (wc)
		npte |= pmap_pg_wc;

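	/*
	 * PAE PTEs are 64 bits wide: install the new entry with a
	 * single 64-bit atomic exchange so the old PTE is fetched and
	 * the new one stored without ever exposing a half-written
	 * entry to the other CPUs.
	 */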
	opte = i386_atomic_testset_uq(&ptes[atop(va)], npte);
	if (ptp)
		ptp->wire_count += ptp_count;
	pmap->pm_stats.resident_count += resident_count;
	pmap->pm_stats.wired_count += wired_count;

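	/*
	 * If we replaced a valid mapping, other CPUs may still hold
	 * the old translation in their TLBs, so shoot the page down.
	 * When an existing cacheable mapping is switched to uncached,
	 * flush the caches on all CPUs first.
	 */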
	if (pmap_valid_entry(opte)) {
		if (nocache && (opte & PG_N) == 0)
			wbinvd_on_all_cpus(); /* XXX clflush before we enter? */
		pmap_tlb_shootpage(pmap, va);
	}

	pmap_unmap_ptes_pae(pmap);
	pmap_tlb_shootwait();

	error = 0;

out:
	if (pve)
		pool_put(&pmap_pv_pool, pve);
	if (opve)
		pool_put(&pmap_pv_pool, opve);

	return error;
}

/*
 * Allocate an extra PDPT and extra PT pages as needed to map kernel
 * pages used for the U-K mappings.  These special mappings are set up
 * during bootstrap, are never removed, and are part of pmap_kernel.
 *
 * New pmaps inherit the kernel portion of pmap_kernel, including
 * the special mappings (see pmap_pinit_pd_pae()).
 */
void
pmap_enter_special_pae(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int32_t flags)
{
	struct pmap *pmap = pmap_kernel();
	struct vm_page *ptppg = NULL;
	pd_entry_t *pd, *ptp;
	pt_entry_t *ptes;
	uint32_t l2idx, l1idx;
	paddr_t npa;

	/* If CPU is secure, no need to do anything */
	if (!cpu_meltdown)
		return;

	/* Must be kernel VA */
	if (va < VM_MIN_KERNEL_ADDRESS)
		panic("invalid special mapping va 0x%lx requested", va);

	KASSERT(pmap->pm_pdir_intel != 0);

	DPRINTF("%s: pm_pdir_intel 0x%x pm_pdirpa_intel 0x%x\n", __func__,
	    (uint32_t)pmap->pm_pdir_intel, (uint32_t)pmap->pm_pdirpa_intel);

	/* These are the PAE versions of pdei() and ptei() */
	l2idx = pdei(va);
	l1idx = ptei(va);

	DPRINTF("%s: va 0x%08lx pa 0x%08lx prot 0x%08lx flags 0x%08x "
	    "l2idx %u l1idx %u\n", __func__, va, pa, (unsigned long)prot,
	    flags, l2idx, l1idx);

	if ((pd = (pd_entry_t *)pmap->pm_pdir_intel) == 0)
		panic("%s: PD not initialized for pmap @ %p", __func__, pmap);

	/* npa = physaddr of PT page */
	npa = pd[l2idx] & PMAP_PA_MASK;

	/* Valid PDE for the 2MB region containing va? */
	if (!npa) {
		/*
		 * No valid PDE - allocate a PT page and set the PDE.
		 * We get the page from pm_obj, which is used for PT
		 * pages.  The offset is computed from l2idx+2048, which
		 * puts it beyond the regular PT pages; those use offsets
		 * with 0 <= l2idx < 2048.
		 */
		ptppg = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(l2idx + 2048),
		    NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO);
		if (ptppg == NULL)
			panic("%s: failed to allocate PT page", __func__);

		atomic_clearbits_int(&ptppg->pg_flags, PG_BUSY);
		ptppg->wire_count = 1;	/* no mappings yet */

		npa = VM_PAGE_TO_PHYS(ptppg);
		pd[l2idx] = (npa | PG_RW | PG_V | PG_M | PG_U);

		DPRINTF("%s: allocated new PT page at phys 0x%x, "
		    "setting PDE[%d] = 0x%llx\n", __func__, (uint32_t)npa,
		    l2idx, pd[l2idx]);
	}

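	/*
	 * The PT pages of the special (U-K) directory hang off
	 * pm_pdir_intel and are not reachable through the normal
	 * recursive PTE mapping, which is why the page has to be
	 * mapped temporarily in order to write its entry.
	 */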
	/* temporarily map PT page and set PTE for U-K mapping */
	if (ptppg == NULL && (ptppg = PHYS_TO_VM_PAGE(npa)) == NULL)
		panic("%s: no vm_page for PT page", __func__);
	mtx_enter(&ptppg->mdpage.pv_mtx);
	ptp = (pd_entry_t *)pmap_tmpmap_pa(npa);
	ptp[l1idx] = (pa | protection_codes[prot] | PG_V | PG_M | PG_U | flags);
	DPRINTF("%s: setting PTE[%d] = 0x%llx\n", __func__, l1idx, ptp[l1idx]);
	pmap_tmpunmap_pa();
	mtx_leave(&ptppg->mdpage.pv_mtx);

	/* if supported, set the PG_G flag on the corresponding U+K entry */
	if (!(cpu_feature & CPUID_PGE))
		return;
	ptes = pmap_map_ptes_pae(pmap);	/* pmap_kernel -> PTE_BASE */
	if (pmap_valid_entry(ptes[atop(va)]))
		ptes[atop(va)] |= PG_G;
	else
		DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
	pmap_unmap_ptes_pae(pmap);	/* pmap_kernel -> nothing */
}

/*
 * pmap_growkernel: increase usage of KVM space
 *
 * => we allocate new PTPs for the kernel and install them in all
 *	the pmaps on the system.
 */

vaddr_t
pmap_growkernel_pae(vaddr_t maxkvaddr)
{
	extern int nkpde;
	struct pmap *kpm = pmap_kernel(), *pm;
	int needed_kpde;	/* needed number of kernel PTPs */
	int s;
	paddr_t ptaddr;

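	/*
	 * Each kernel PDE maps NBPD bytes of KVA (2MB in PAE mode),
	 * so round the requested range up to whole PDEs.
	 */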
	needed_kpde = (int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
	    / NBPD;
	if (needed_kpde <= nkpde)
		goto out;		/* we are OK */

	/*
	 * whoops! we need to add kernel PTPs
	 */

	s = splhigh();	/* to be safe */

	for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {

		if (uvm.page_init_done == 0) {

			/*
			 * we're growing the kernel pmap early (from
			 * uvm_pageboot_alloc()).  this case must be
			 * handled a little differently.
			 */

			if (uvm_page_physget(&ptaddr) == 0)
				panic("pmap_growkernel: out of memory");
			pmap_zero_phys_pae(ptaddr);

			PDE(kpm, PDSLOT_KERN + nkpde) =
			    ptaddr | PG_RW | PG_V | PG_U | PG_M;

			/* count PTP as resident */
			kpm->pm_stats.resident_count++;
			continue;
		}

		/*
		 * THIS *MUST* BE CODED SO AS TO WORK IN THE
		 * pmap_initialized == 0 CASE!  WE MAY BE
		 * INVOKED WHILE pmap_init() IS RUNNING!
		 */

		while (!pmap_alloc_ptp_pae(kpm, PDSLOT_KERN + nkpde, 0))
			uvm_wait("pmap_growkernel");

		/* distribute new kernel PTP to all active pmaps */
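		/*
		 * Every pmap carries its own copy of the kernel PDEs;
		 * pmaps created later inherit them from pmap_kernel()
		 * in pmap_pinit_pd_pae(), so only the pmaps that
		 * already exist need the new entry copied in here.
		 */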
		mtx_enter(&pmaps_lock);
		LIST_FOREACH(pm, &pmaps, pm_list) {
			PDE(pm, PDSLOT_KERN + nkpde) =
			    PDE(kpm, PDSLOT_KERN + nkpde);
		}
		mtx_leave(&pmaps_lock);
	}

	splx(s);

out:
	return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
}

/*
 * Pre-allocate PTP 0 for low memory, so that 1:1 mappings for various
 * trampoline code can be entered.
 */
void
pmap_prealloc_lowmem_ptp_pae(void)
{
	pt_entry_t *pte, npte;
	vaddr_t ptpva = (vaddr_t)vtopte(0);

	/* enter pa for pte 0 into recursive map */
	pte = vtopte(ptpva);
	npte = PTP0_PA | PG_RW | PG_V | PG_U | PG_M;

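	/*
	 * ptpva (vtopte(0)) is where the PT page covering VA 0..NBPD
	 * appears in the recursive mapping, and vtopte() of that
	 * address is in turn the PDE slot for that range.  Installing
	 * PTP0_PA there wires up the low-memory PTP and makes it
	 * addressable at ptpva so it can be zeroed below.
	 */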
	i386_atomic_testset_uq(pte, npte);

	/* make sure it is clean before using */
	memset((void *)ptpva, 0, NBPG);
}

/*
 * pmap_tmpmap_pa_pae: map a page in for tmp usage
 */

vaddr_t
pmap_tmpmap_pa_pae(paddr_t pa)
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
	caddr_t ptpva = VASLEW(pmap_ptpp, id);
#if defined(DIAGNOSTIC)
	if (*ptpte)
		panic("pmap_tmpmap_pa_pae: ptp_pte in use?");
#endif
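	/*
	 * ptp_pte/pmap_ptpp are per-CPU scratch slots (see PTESLEW and
	 * VASLEW), so no lock is taken here and no cross-CPU shootdown
	 * is needed when the mapping is torn down again.
	 */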
	*ptpte = PG_V | PG_RW | pa;	/* always a new mapping */
	return((vaddr_t)ptpva);
}

/*
 * pmap_tmpunmap_pa_pae: unmap a tmp use page (undoes pmap_tmpmap_pa_pae)
 */

void
pmap_tmpunmap_pa_pae(void)
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
	caddr_t ptpva = VASLEW(pmap_ptpp, id);
#if defined(DIAGNOSTIC)
	if (!pmap_valid_entry(*ptpte))
		panic("pmap_tmpunmap_pa_pae: our pte invalid?");
#endif
	*ptpte = 0;
	pmap_update_pg((vaddr_t)ptpva);
#ifdef MULTIPROCESSOR
	/*
	 * No need for tlb shootdown here, since ptp_pte is per-CPU.
	 */
#endif
}

paddr_t
vtophys_pae(vaddr_t va)
{
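	/* walk the recursive PTE window: frame from the PTE plus page offset */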
	return ((*vtopte(va) & PG_FRAME) | (va & ~PG_FRAME));
}

void
pmap_flush_page_pae(paddr_t pa)
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *pte = PTESLEW(flsh_pte, id);
	caddr_t va = VASLEW(pmap_flshp, id);

	KDASSERT(PHYS_TO_VM_PAGE(pa) != NULL);
#ifdef DIAGNOSTIC
	if (*pte)
		panic("pmap_flush_page_pae: lock botch");
#endif

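	/*
	 * Map the page at this CPU's scratch VA, flush the cache
	 * lines covering it, then drop the mapping and its TLB entry.
	 */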
	*pte = (pa & PG_FRAME) | PG_V | PG_RW;
	pmap_update_pg(va);
	pmap_flush_cache((vaddr_t)va, PAGE_SIZE);
	*pte = 0;
	pmap_update_pg(va);
}