1 /* $OpenBSD: pmap.c,v 1.226 2024/11/08 13:18:29 jsg Exp $ */
2 /* $NetBSD: pmap.c,v 1.91 2000/06/02 17:46:37 thorpej Exp $ */
3
4 /*
5 * Copyright (c) 1997 Charles D. Cranor and Washington University.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * pmap.c: i386 pmap module rewrite
31 * Chuck Cranor <chuck@ccrc.wustl.edu>
32 * 11-Aug-97
33 *
34 * history of this pmap module: in addition to my own input, i used
35 * the following references for this rewrite of the i386 pmap:
36 *
37 * [1] the NetBSD i386 pmap. this pmap appears to be based on the
38 * BSD hp300 pmap done by Mike Hibler at University of Utah.
39 * it was then ported to the i386 by William Jolitz of UUNET
40 * Technologies, Inc. Then Charles M. Hannum of the NetBSD
41 * project fixed some bugs and provided some speed ups.
42 *
43 * [2] the FreeBSD i386 pmap. this pmap seems to be the
44 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
45 * and David Greenman.
46 *
47 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
48 * between several processors. the VAX version was done by
49 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
50 * version was done by Lance Berc, Mike Kupfer, Bob Baron,
51 * David Golub, and Richard Draves. the alpha version was
52 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
53 * (NetBSD/alpha).
54 */
55
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/atomic.h>
59 #include <sys/proc.h>
60 #include <sys/pool.h>
61 #include <sys/user.h>
62 #include <sys/mutex.h>
63
64 #include <uvm/uvm.h>
65
66 #include <machine/specialreg.h>
67
68 #include <sys/msgbuf.h>
69 #include <stand/boot/bootarg.h>
70
71 /* #define PMAP_DEBUG */
72
73 #ifdef PMAP_DEBUG
74 #define DPRINTF(x...) do { printf(x); } while(0)
75 #else
76 #define DPRINTF(x...)
77 #endif /* PMAP_DEBUG */
78
79 /*
80 * this file contains the code for the "pmap module." the module's
81 * job is to manage the hardware's virtual to physical address mappings.
82 * note that there are two levels of mapping in the VM system:
83 *
84 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
85 * to map ranges of virtual address space to objects/files. for
86 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
87 * to the file /bin/ls starting at offset zero." note that
88 * the upper layer mapping is not concerned with how individual
89 * vm_pages are mapped.
90 *
91 * [2] the lower layer of the VM system (the pmap) maintains the mappings
92  *  from virtual addresses to physical pages. it is concerned with which vm_page is
93 * mapped where. for example, when you run /bin/ls and start
94 * at page 0x1000 the fault routine may lookup the correct page
95 * of the /bin/ls file and then ask the pmap layer to establish
96 * a mapping for it.
97 *
98 * note that information in the lower layer of the VM system can be
99 * thrown away since it can easily be reconstructed from the info
100 * in the upper layer.
101 *
102 * data structures we use include:
103 *
104 * - struct pmap: describes the address space of one thread
105 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
106 * - struct pv_head: there is one pv_head per managed page of
107 * physical memory. the pv_head points to a list of pv_entry
108 * structures which describe all the <PMAP,VA> pairs that this
109 * page is mapped in. this is critical for page based operations
110 * such as pmap_page_protect() [change protection on _all_ mappings
111 * of a page]
112 */
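/*
 * Illustrative sketch (not compiled): a simplified picture of the pv
 * bookkeeping described above.  The authoritative definition of struct
 * pv_entry lives in the machine-dependent pmap header; the fields below
 * just mirror how the structure is used later in this file.
 */
#if 0
struct pv_entry {			/* one <PMAP,VA> mapping of a page */
	struct pv_entry	*pv_next;	/* next mapping of the same page */
	struct pmap	*pv_pmap;	/* pmap the mapping belongs to */
	vaddr_t		 pv_va;		/* VA the page is mapped at */
	struct vm_page	*pv_ptp;	/* PTP holding the PTE (NULL for kernel pmap) */
};

/*
 * a page-based operation such as pmap_page_protect() walks the per-page
 * list rooted at pg->mdpage.pv_list via pv_next to visit every mapping.
 */
#endif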
113 /*
114 * i386 MMU hardware structure:
115 *
116 * the i386 MMU is a two-level MMU which maps 4GB of virtual memory.
117 * the pagesize is 4K (4096 [0x1000] bytes), although newer pentium
118 * processors can support a 4MB pagesize as well.
119 *
120 * the first level table (segment table?) is called a "page directory"
121 * and it contains 1024 page directory entries (PDEs). each PDE is
122 * 4 bytes (an int), so a PD fits in a single 4K page. this page is
123 * the page directory page (PDP). each PDE in a PDP maps 4MB of space
124 * (1024 * 4MB = 4GB). a PDE contains the physical address of the
125 * second level table: the page table. or, if 4MB pages are being used,
126 * then the PDE contains the PA of the 4MB page being mapped.
127 *
128 * a page table consists of 1024 page table entries (PTEs). each PTE is
129 * 4 bytes (an int), so a page table also fits in a single 4K page. a
130 * 4K page being used as a page table is called a page table page (PTP).
131 * each PTE in a PTP maps one 4K page (1024 * 4K = 4MB). a PTE contains
132 * the physical address of the page it maps and some flag bits (described
133 * below).
134 *
135  * the processor has a special register, "cr3", which points to
136 * the PDP which is currently controlling the mappings of the virtual
137 * address space.
138 *
139 * the following picture shows the translation process for a 4K page:
140 *
141 * %cr3 register [PA of PDP]
142 * |
143 * |
144 * | bits <31-22> of VA bits <21-12> of VA bits <11-0>
145 * | index the PDP (0 - 1023) index the PTP are the page offset
146 * | | | |
147 * | v | |
148 * +--->+----------+ | |
149 * | PD Page | PA of v |
150 * | |---PTP-------->+------------+ |
151 * | 1024 PDE | | page table |--PTE--+ |
152 * | entries | | (aka PTP) | | |
153 * +----------+ | 1024 PTE | | |
154 * | entries | | |
155 * +------------+ | |
156 * | |
157 * bits <31-12> bits <11-0>
158 * p h y s i c a l a d d r
159 *
160 * the i386 caches PTEs in a TLB. it is important to flush out old
161 * TLB mappings when making a change to a mapping. writing to the
162 * %cr3 will flush the entire TLB. newer processors also have an
163 * instruction that will invalidate the mapping of a single page (which
164 * is useful if you are changing a single mapping because it preserves
165 * all the cached TLB entries).
166 *
167  * as shown above, bits 31-12 of the PTE contain the PA of the page being mapped.
168 * the rest of the PTE is defined as follows:
169 * bit# name use
170 * 11 n/a available for OS use, hardware ignores it
171 * 10 n/a available for OS use, hardware ignores it
172 * 9 n/a available for OS use, hardware ignores it
173 * 8 G global bit (see discussion below)
174 * 7 PS page size [for PDEs] (0=4k, 1=4M <if supported>)
175 * 6 D dirty (modified) page
176 * 5 A accessed (referenced) page
177 * 4 PCD cache disable
178  *	3	PWT	page write-through (cache)
179 * 2 U/S user/supervisor bit (0=supervisor only, 1=both u&s)
180 * 1 R/W read/write bit (0=read only, 1=read-write)
181 * 0 P present (valid)
182 *
183 * notes:
184 * - on the i386 the R/W bit is ignored if processor is in supervisor
185 * state (bug!)
186 * - PS is only supported on newer processors
187 * - PTEs with the G bit are global in the sense that they are not
188 * flushed from the TLB when %cr3 is written (to flush, use the
189 * "flush single page" instruction). this is only supported on
190 * newer processors. this bit can be used to keep the kernel's
191 * TLB entries around while context switching. since the kernel
192 * is mapped into all processes at the same place it does not make
193 * sense to flush these entries when switching from one process'
194 * pmap to another.
195 */
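/*
 * Illustrative sketch (not compiled): the 4K-page translation pictured
 * above written as a software page-table walk.  The shifts and masks come
 * straight from the diagram (bits <31-22>, <21-12>, <11-0>); the function
 * name is made up, the PD/PT are assumed to be directly addressable (which
 * real code must arrange via mappings), and 0 stands in for "not mapped".
 */
#if 0
static u_int32_t
va_to_pa_sketch(u_int32_t *pd /* alias of the PDP named by %cr3 */, u_int32_t va)
{
	u_int32_t pde, *pt, pte;

	pde = pd[(va >> 22) & 0x3ff];		/* bits <31-22> index the PDP */
	if ((pde & 0x1) == 0)			/* P (present) bit clear */
		return (0);
	pt = (u_int32_t *)(pde & 0xfffff000);	/* PA of the PTP */
	pte = pt[(va >> 12) & 0x3ff];		/* bits <21-12> index the PTP */
	if ((pte & 0x1) == 0)
		return (0);
	return ((pte & 0xfffff000) | (va & 0xfff));	/* frame | page offset */
}
#endif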
196 /*
197 * A pmap describes a process' 4GB virtual address space. This
198 * virtual address space can be broken up into 1024 4MB regions which
199 * are described by PDEs in the PDP. The PDEs are defined as follows:
200 *
201 * Ranges are inclusive -> exclusive, just like vm_map_entry start/end.
202 * The following assumes that KERNBASE is 0xd0000000.
203 *
204 * PDE#s VA range Usage
205 * 0->831 0x0 -> 0xcfc00000 user address space, note that the
206 * max user address is 0xcfbfe000
207 * the final two pages in the last 4MB
208 * used to be reserved for the UAREA
209 * but now are no longer used.
210 * 831 0xcfc00000-> recursive mapping of PDP (used for
211 * 0xd0000000 linear mapping of PTPs).
212 * 832->1023 0xd0000000-> kernel address space (constant
213 * 0xffc00000 across all pmaps/processes).
214 * 1023 0xffc00000-> "alternate" recursive PDP mapping
215 * <end> (for other pmaps).
216 *
217 *
218 * Note: A recursive PDP mapping provides a way to map all the PTEs for
219 * a 4GB address space into a linear chunk of virtual memory. In other
220 * words, the PTE for page 0 is the first int mapped into the 4MB recursive
221 * area. The PTE for page 1 is the second int. The very last int in the
222  * 4MB range is the PTE that maps VA 0xfffff000 (the last page in a 4GB
223  * address space).
224 *
225 * All pmaps' PDs must have the same values in slots 832->1023 so that
226 * the kernel is always mapped in every process. These values are loaded
227 * into the PD at pmap creation time.
228 *
229 * At any one time only one pmap can be active on a processor. This is
230 * the pmap whose PDP is pointed to by processor register %cr3. This pmap
231 * will have all its PTEs mapped into memory at the recursive mapping
232  * point (slot #831 as shown above). When the pmap code wants to find the
233 * PTE for a virtual address, all it has to do is the following:
234 *
235 * Address of PTE = (831 * 4MB) + (VA / PAGE_SIZE) * sizeof(pt_entry_t)
236 * = 0xcfc00000 + (VA / 4096) * 4
237 *
238 * What happens if the pmap layer is asked to perform an operation
239 * on a pmap that is not the one which is currently active? In that
240 * case we take the PA of the PDP of the non-active pmap and put it in
241 * slot 1023 of the active pmap. This causes the non-active pmap's
242 * PTEs to get mapped in the final 4MB of the 4GB address space
243 * (e.g. starting at 0xffc00000).
244 *
245 * The following figure shows the effects of the recursive PDP mapping:
246 *
247 * PDP (%cr3)
248 * +----+
249 * | 0| -> PTP#0 that maps VA 0x0 -> 0x400000
250 * | |
251 * | |
252 * | 831| -> points back to PDP (%cr3) mapping VA 0xcfc00000 -> 0xd0000000
253  * | 832| -> first kernel PTP (maps 0xd0000000 -> 0xd0400000)
254 * | |
255 * |1023| -> points to alternate pmap's PDP (maps 0xffc00000 -> end)
256 * +----+
257 *
258 * Note that the PDE#831 VA (0xcfc00000) is defined as "PTE_BASE".
259 * Note that the PDE#1023 VA (0xffc00000) is defined as "APTE_BASE".
260 *
261 * Starting at VA 0xcfc00000 the current active PDP (%cr3) acts as a
262 * PTP:
263 *
264 * PTP#831 == PDP(%cr3) => maps VA 0xcfc00000 -> 0xd0000000
265 * +----+
266 * | 0| -> maps the contents of PTP#0 at VA 0xcfc00000->0xcfc01000
267 * | |
268 * | |
269 * | 831| -> maps the contents of PTP#831 (the PDP) at VA 0xcff3f000
270 * | 832| -> maps the contents of first kernel PTP
271 * | |
272 * |1023|
273 * +----+
274 *
275 * Note that mapping of the PDP at PTP#831's VA (0xcff3f000) is
276 * defined as "PDP_BASE".... within that mapping there are two
277 * defines:
278 * "PDP_PDE" (0xcff3fcfc) is the VA of the PDE in the PDP
279 * which points back to itself.
280 * "APDP_PDE" (0xcff3fffc) is the VA of the PDE in the PDP which
281 * establishes the recursive mapping of the alternate pmap.
282 * To set the alternate PDP, one just has to put the correct
283 * PA info in *APDP_PDE.
284 *
285 * Note that in the APTE_BASE space, the APDP appears at VA
286 * "APDP_BASE" (0xfffff000).
287 */
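/*
 * Illustrative sketch (not compiled): the recursive-mapping arithmetic
 * described above, spelled out with the concrete numbers from the text
 * (PDSLOT_PTE == 831, KERNBASE == 0xd0000000).  The real macros follow
 * immediately below.
 */
#if 0
	/* VA of the PTE that maps "va" in the currently active pmap */
	pte_va = 0xcfc00000 + (va / 4096) * 4;

	/* VA at which the PDP itself appears inside that linear PTE area */
	pdp_va = 0xcfc00000 + 831 * 4096;	/* == 0xcff3f000 (PDP_BASE) */

	/* VAs of the PDEs pointing at the PDP and at the alternate PDP */
	pdp_pde = pdp_va + 831 * 4;		/* == 0xcff3fcfc (PDP_PDE) */
	apdp_pde = pdp_va + 1023 * 4;		/* == 0xcff3fffc (APDP_PDE) */
#endif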
288 #define PG_FRAME 0xfffff000 /* page frame mask */
289 #define PG_LGFRAME 0xffc00000 /* large (4M) page frame mask */
290
291 /*
292 * The following defines give the virtual addresses of various MMU
293 * data structures:
294 * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
295 * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
296 */
297 #define PTE_BASE ((pt_entry_t *) (PDSLOT_PTE * NBPD))
298 #define APTE_BASE ((pt_entry_t *) (PDSLOT_APTE * NBPD))
299 #define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * NBPG)))
300 #define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * NBPG)))
301 #define PDP_PDE (PDP_BASE + PDSLOT_PTE)
302 #define APDP_PDE (PDP_BASE + PDSLOT_APTE)
303
304 /*
305 * pdei/ptei: generate index into PDP/PTP from a VA
306 */
307 #define PD_MASK 0xffc00000 /* page directory address bits */
308 #define PT_MASK 0x003ff000 /* page table address bits */
309 #define pdei(VA) (((VA) & PD_MASK) >> PDSHIFT)
310 #define ptei(VA) (((VA) & PT_MASK) >> PGSHIFT)
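/*
 * Example (assuming KERNBASE == 0xd0000000 as in the PDE table above):
 *	pdei(0xd0000000) == 0xd0000000 >> 22 == 832	(first kernel PDE)
 *	ptei(0xd0001000) == 0x1000 >> 12 == 1		(second PTE in that PTP)
 */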
311
312 /*
313 * Mach derived conversion macros
314 */
315 #define i386_round_pdr(x) ((((unsigned)(x)) + ~PD_MASK) & PD_MASK)
316
317 /*
318 * various address macros
319 *
320 * vtopte: return a pointer to the PTE mapping a VA
321 */
322 #define vtopte(VA) (PTE_BASE + atop((vaddr_t)VA))
323
324 /*
325 * PTP macros:
326 * A PTP's index is the PD index of the PDE that points to it.
327 * A PTP's offset is the byte-offset in the PTE space that this PTP is at.
328 * A PTP's VA is the first VA mapped by that PTP.
329 *
330 * Note that NBPG == number of bytes in a PTP (4096 bytes == 1024 entries)
331 * NBPD == number of bytes a PTP can map (4MB)
332 */
333
334 #define ptp_i2o(I) ((I) * NBPG) /* index => offset */
335 #define ptp_o2i(O) ((O) / NBPG) /* offset => index */
336 #define ptp_i2v(I) ((I) * NBPD) /* index => VA */
337 #define ptp_v2i(V) ((V) / NBPD) /* VA => index (same as pdei) */
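/*
 * Example: the PTP behind kernel PDE index 832 (see above) lives at
 * offset ptp_i2o(832) == 0x340000 in the pmap's pm_obj and covers the
 * 4MB of VA starting at ptp_i2v(832) == 0xd0000000.
 */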
338
339 /*
340 * Access PD and PT
341 */
342 #define PDE(pm,i) (((pd_entry_t *)(pm)->pm_pdir)[(i)])
343
344 /*
345 * here we define the data types for PDEs and PTEs
346 */
347 typedef u_int32_t pd_entry_t; /* PDE */
348 typedef u_int32_t pt_entry_t; /* PTE */
349
350 /*
351 * Number of PTEs per cache line. 4 byte pte, 64-byte cache line
352 * Used to avoid false sharing of cache lines.
353 */
354 #define NPTECL 16
355
356 /*
357 * global data structures
358 */
359
360 /* The kernel's pmap (proc0), 32 byte aligned in case we are using PAE */
361 struct pmap __attribute__ ((aligned (32))) kernel_pmap_store;
362
363 /*
364 * nkpde is the number of kernel PTPs allocated for the kernel at
365 * boot time (NKPTP is a compile time override). this number can
366 * grow dynamically as needed (but once allocated, we never free
367 * kernel PTPs).
368 */
369
370 int nkpde = NKPTP;
371 int nkptp_max = 1024 - (KERNBASE / NBPD) - 1;
372
373 /*
374 * pg_g_kern: if CPU is affected by Meltdown pg_g_kern is 0,
375 * otherwise it is set to PG_G. pmap_pg_g will be derived
376 * from pg_g_kern, see pmap_bootstrap().
377 */
378 extern int pg_g_kern;
379
380 /*
381 * pmap_pg_g: if our processor supports PG_G in the PTE then we
382 * set pmap_pg_g to PG_G (otherwise it is zero).
383 */
384
385 int pmap_pg_g = 0;
386
387 /*
388 * pmap_pg_wc: if our processor supports PAT then we set this
389 * to be the pte bits for Write Combining. Else we fall back to
390 * UC- so mtrrs can override the cacheability
391 */
392 int pmap_pg_wc = PG_UCMINUS;
393
394 /*
395 * other data structures
396 */
397
398 uint32_t protection_codes[8]; /* maps MI prot to i386 prot code */
399 int pmap_initialized = 0; /* pmap_init done yet? */
400
401 /*
402 * MULTIPROCESSOR: special VAs/ PTEs are actually allocated inside a
403 * MAXCPUS*NPTECL array of PTEs, to avoid cache line thrashing
404 * due to false sharing.
405 */
406
407 #ifdef MULTIPROCESSOR
408 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
409 #define VASLEW(va,id) ((va)+(id)*NPTECL*NBPG)
410 #else
411 #define PTESLEW(pte, id) (pte)
412 #define VASLEW(va,id) (va)
413 #endif
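/*
 * Example: with NPTECL == 16, cpu2's temporary PTEs start 32 entries
 * (2 * 16) past cpu0's and its scratch VAs sit 128KB (2 * 16 * NBPG)
 * higher, keeping each CPU's PTEs in its own cache line.
 */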
414
415 /*
416 * pv management structures.
417 */
418 struct pool pmap_pv_pool;
419
420 #define PVE_LOWAT (PVE_PER_PVPAGE / 2) /* free pv_entry low water mark */
421 #define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
422 /* high water mark */
423
424 /*
425 * the following two vaddr_t's are used during system startup
426 * to keep track of how much of the kernel's VM space we have used.
427 * once the system is started, the management of the remaining kernel
428 * VM space is turned over to the kernel_map vm_map.
429 */
430
431 static vaddr_t virtual_avail; /* VA of first free KVA */
432 static vaddr_t virtual_end; /* VA of last free KVA */
433
434 /*
435 * linked list of all non-kernel pmaps
436 */
437
438 struct pmap_head pmaps;
439 struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM);
440
441 /*
442 * pool that pmap structures are allocated from
443 */
444
445 struct pool pmap_pmap_pool;
446
447 /*
448 * special VAs and the PTEs that map them
449 */
450
451 pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *flsh_pte;
452 caddr_t pmap_csrcp, pmap_cdstp, pmap_zerop, pmap_ptpp, pmap_flshp;
453 caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
454
455 extern uint32_t cpu_meltdown;
456
457 /*
458 * local prototypes
459 */
460 struct vm_page *pmap_alloc_ptp_86(struct pmap *, int, pt_entry_t);
461 struct vm_page *pmap_get_ptp_86(struct pmap *, int);
462 pt_entry_t *pmap_map_ptes_86(struct pmap *);
463 void pmap_unmap_ptes_86(struct pmap *);
464 void pmap_do_remove_86(struct pmap *, vaddr_t, vaddr_t, int);
465 void pmap_remove_ptes_86(struct pmap *, struct vm_page *, vaddr_t,
466 vaddr_t, vaddr_t, int, struct pv_entry **);
467 void *pmap_pv_page_alloc(struct pool *, int, int *);
468 void pmap_pv_page_free(struct pool *, void *);
469
470 struct pool_allocator pmap_pv_page_allocator = {
471 pmap_pv_page_alloc, pmap_pv_page_free,
472 };
473
474 void pmap_sync_flags_pte_86(struct vm_page *, pt_entry_t);
475
476 void pmap_drop_ptp_86(struct pmap *, vaddr_t, struct vm_page *,
477 pt_entry_t *);
478
479 void setcslimit(struct pmap *, struct trapframe *, struct pcb *,
480 vaddr_t);
481 void pmap_pinit_pd_86(struct pmap *);
482
483 static __inline u_int
484 pmap_pte2flags(pt_entry_t pte)
485 {
486 return (((pte & PG_U) ? PG_PMAP_REF : 0) |
487 ((pte & PG_M) ? PG_PMAP_MOD : 0));
488 }
489
490 void
491 pmap_sync_flags_pte_86(struct vm_page *pg, pt_entry_t pte)
492 {
493 if (pte & (PG_U|PG_M)) {
494 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
495 }
496 }
497
498 void
499 pmap_apte_flush(void)
500 {
501 pmap_tlb_shoottlb();
502 pmap_tlb_shootwait();
503 }
504
505 /*
506 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
507 *
508 * => we lock enough pmaps to keep things locked in
509 * => must be undone with pmap_unmap_ptes before returning
510 */
511
512 pt_entry_t *
513 pmap_map_ptes_86(struct pmap *pmap)
514 {
515 pd_entry_t opde;
516
517 /* the kernel's pmap is always accessible */
518 if (pmap == pmap_kernel()) {
519 return(PTE_BASE);
520 }
521
522 mtx_enter(&pmap->pm_mtx);
523
524 /* if curpmap then we are always mapped */
525 if (pmap_is_curpmap(pmap)) {
526 return(PTE_BASE);
527 }
528
529 mtx_enter(&curcpu()->ci_curpmap->pm_apte_mtx);
530
531 /* need to load a new alternate pt space into curpmap? */
532 opde = *APDP_PDE;
533 #if defined(MULTIPROCESSOR) && defined(DIAGNOSTIC)
534 if (pmap_valid_entry(opde))
535 panic("pmap_map_ptes_86: APTE valid");
536 #endif
537 if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
538 *APDP_PDE = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V |
539 PG_U | PG_M);
540 if (pmap_valid_entry(opde))
541 pmap_apte_flush();
542 }
543 return(APTE_BASE);
544 }
545
546 /*
547 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
548 */
549
550 void
551 pmap_unmap_ptes_86(struct pmap *pmap)
552 {
553 if (pmap == pmap_kernel())
554 return;
555
556 if (!pmap_is_curpmap(pmap)) {
557 #if defined(MULTIPROCESSOR)
558 *APDP_PDE = 0;
559 pmap_apte_flush();
560 #endif
561 mtx_leave(&curcpu()->ci_curpmap->pm_apte_mtx);
562 }
563
564 mtx_leave(&pmap->pm_mtx);
565 }
566
567 void
568 pmap_exec_account(struct pmap *pm, vaddr_t va,
569 uint32_t opte, uint32_t npte)
570 {
571 if (pm == pmap_kernel())
572 return;
573
574 if (curproc->p_vmspace == NULL ||
575 pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
576 return;
577
578 if ((opte ^ npte) & PG_X)
579 pmap_tlb_shootpage(pm, va);
580
581 if (cpu_pae)
582 return;
583
584 /*
585 * Executability was removed on the last executable change.
586 * Reset the code segment to something conservative and
587 * let the trap handler deal with setting the right limit.
588 * We can't do that because of locking constraints on the vm map.
589 *
590 * XXX - floating cs - set this _really_ low.
591 */
592 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
593 struct trapframe *tf = curproc->p_md.md_regs;
594 struct pcb *pcb = &curproc->p_addr->u_pcb;
595
596 KERNEL_LOCK();
597 pm->pm_hiexec = I386_MAX_EXE_ADDR;
598 setcslimit(pm, tf, pcb, I386_MAX_EXE_ADDR);
599 KERNEL_UNLOCK();
600 }
601 }
602
603 /*
604 * Fixup the code segment to cover all potential executable mappings.
605 * Called by kernel SEGV trap handler.
606 * returns 0 if no changes to the code segment were made.
607 */
608 int
609 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, vaddr_t gdt_cs,
610 struct pcb *pcb)
611 {
612 struct vm_map_entry *ent;
613 struct pmap *pm = vm_map_pmap(map);
614 vaddr_t va = 0;
615 vaddr_t pm_cs;
616
617 KERNEL_LOCK();
618
619 vm_map_lock(map);
620 RBT_FOREACH_REVERSE(ent, uvm_map_addr, &map->addr) {
621 if (ent->protection & PROT_EXEC)
622 break;
623 }
624 /*
625 * This entry has greater va than the entries before.
626 * We need to make it point to the last page, not past it.
627 */
628 if (ent)
629 va = trunc_page(ent->end - 1);
630 vm_map_unlock(map);
631
632 KERNEL_ASSERT_LOCKED();
633
634 pm_cs = SEGDESC_LIMIT(pm->pm_codeseg);
635
636 /*
637 * Another thread running on another cpu can change
638 * pm_hiexec and pm_codeseg. If this has happened
639 * during our timeslice, our gdt code segment will
640 * be stale. So only allow the fault through if the
641  * faulting address is less than pm_hiexec and our
642 * gdt code segment is not stale.
643 */
644 if (va <= pm->pm_hiexec && pm_cs == pm->pm_hiexec &&
645 gdt_cs == pm->pm_hiexec) {
646 KERNEL_UNLOCK();
647 return (0);
648 }
649
650 pm->pm_hiexec = va;
651
652 /*
653 * We have a new 'highest executable' va, so we need to update
654 * the value for the code segment limit, which is stored in the
655 * PCB.
656 */
657 setcslimit(pm, tf, pcb, va);
658
659 KERNEL_UNLOCK();
660 return (1);
661 }
662
663 u_int32_t
664 pmap_pte_set_86(vaddr_t va, paddr_t pa, u_int32_t bits)
665 {
666 pt_entry_t pte, *ptep = vtopte(va);
667
668 pa &= PMAP_PA_MASK;
669
670 pte = i386_atomic_testset_ul(ptep, pa | bits); /* zap! */
671 return (pte & ~PG_FRAME);
672 }
673
674 u_int32_t
675 pmap_pte_setbits_86(vaddr_t va, u_int32_t set, u_int32_t clr)
676 {
677 pt_entry_t *ptep = vtopte(va);
678 pt_entry_t pte = *ptep;
679
680 *ptep = (pte | set) & ~clr;
681 return (pte & ~PG_FRAME);
682 }
683
684 u_int32_t
685 pmap_pte_bits_86(vaddr_t va)
686 {
687 pt_entry_t *ptep = vtopte(va);
688
689 return (*ptep & ~PG_FRAME);
690 }
691
692 paddr_t
693 pmap_pte_paddr_86(vaddr_t va)
694 {
695 pt_entry_t *ptep = vtopte(va);
696
697 return (*ptep & PG_FRAME);
698 }
699
700 /*
701 * pmap_tmpmap_pa: map a page in for tmp usage
702 */
703
704 vaddr_t
705 pmap_tmpmap_pa_86(paddr_t pa)
706 {
707 #ifdef MULTIPROCESSOR
708 int id = cpu_number();
709 #endif
710 pt_entry_t *ptpte;
711 caddr_t ptpva;
712
713 ptpte = PTESLEW(ptp_pte, id);
714 ptpva = VASLEW(pmap_ptpp, id);
715
716 #if defined(DIAGNOSTIC)
717 if (*ptpte)
718 panic("pmap_tmpmap_pa: ptp_pte in use?");
719 #endif
720 *ptpte = PG_V | PG_RW | pa; /* always a new mapping */
721 return((vaddr_t)ptpva);
722 }
723
724
725 vaddr_t
726 pmap_tmpmap_pa(paddr_t pa)
727 {
728 if (cpu_pae)
729 return pmap_tmpmap_pa_pae(pa);
730
731 return pmap_tmpmap_pa_86(pa);
732 }
733
734 /*
735 * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa)
736 */
737
738 void
739 pmap_tmpunmap_pa_86(void)
740 {
741 #ifdef MULTIPROCESSOR
742 int id = cpu_number();
743 #endif
744 pt_entry_t *ptpte;
745 caddr_t ptpva;
746
747 ptpte = PTESLEW(ptp_pte, id);
748 ptpva = VASLEW(pmap_ptpp, id);
749
750 #if defined(DIAGNOSTIC)
751 if (!pmap_valid_entry(*ptpte))
752 panic("pmap_tmpunmap_pa: our pte invalid?");
753 #endif
754
755 *ptpte = 0;
756 pmap_update_pg((vaddr_t)ptpva);
757 #ifdef MULTIPROCESSOR
758 /*
759 * No need for tlb shootdown here, since ptp_pte is per-CPU.
760 */
761 #endif
762 }
763
764 void
765 pmap_tmpunmap_pa(void)
766 {
767 if (cpu_pae) {
768 pmap_tmpunmap_pa_pae();
769 return;
770 }
771
772 pmap_tmpunmap_pa_86();
773 }
774
775 paddr_t
776 vtophys(vaddr_t va)
777 {
778 if (cpu_pae)
779 return vtophys_pae(va);
780 else
781 return ((*vtopte(va) & PG_FRAME) | (va & ~PG_FRAME));
782 }
783
784 void
785 setcslimit(struct pmap *pm, struct trapframe *tf, struct pcb *pcb,
786 vaddr_t limit)
787 {
788 /*
789 * Called when we have a new 'highest executable' va, so we need
790 * to update the value for the code segment limit, which is stored
791 * in the PCB.
792 *
793 * There are no caching issues to be concerned with: the
794 * processor reads the whole descriptor from the GDT when the
795 * appropriate selector is loaded into a segment register, and
796 * this only happens on the return to userland.
797 *
798 * This also works in the MP case, since whichever CPU gets to
799 * run the process will pick up the right descriptor value from
800 * the PCB.
801 */
802 limit = min(limit, VM_MAXUSER_ADDRESS - 1);
803
804 setsegment(&pm->pm_codeseg, 0, atop(limit),
805 SDT_MEMERA, SEL_UPL, 1, 1);
806
807 /* And update the GDT since we may be called by the
808 * trap handler (cpu_switch won't get a chance).
809 */
810 curcpu()->ci_gdt[GUCODE_SEL].sd = pm->pm_codeseg;
811
812 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
813 }
814
815 /*
816 * p m a p k e n t e r f u n c t i o n s
817 *
818 * functions to quickly enter/remove pages from the kernel address
819 * space. pmap_kremove is exported to MI kernel. we make use of
820 * the recursive PTE mappings.
821 */
822
823 /*
824 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
825 *
826 * => no need to lock anything, assume va is already allocated
827 * => should be faster than normal pmap enter function
828 */
829
830 void
831 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
832 {
833 uint32_t bits;
834 uint32_t global = 0;
835
836 /* special 1:1 mappings in the first large page must not be global */
837 if (!cpu_pae) {
838 if (va >= (vaddr_t)NBPD) /* 4MB pages on non-PAE */
839 global = pmap_pg_g;
840 } else {
841 if (va >= (vaddr_t)NBPD / 2) /* 2MB pages on PAE */
842 global = pmap_pg_g;
843 }
844
845 bits = pmap_pte_set(va, pa, ((prot & PROT_WRITE) ? PG_RW : PG_RO) |
846 PG_V | global | PG_U | PG_M |
847 ((prot & PROT_EXEC) ? PG_X : 0) |
848 ((pa & PMAP_NOCACHE) ? PG_N : 0) |
849 ((pa & PMAP_WC) ? pmap_pg_wc : 0));
850 if (pmap_valid_entry(bits)) {
851 if (pa & PMAP_NOCACHE && (bits & PG_N) == 0)
852 wbinvd_on_all_cpus();
853 /* NB. - this should not happen. */
854 pmap_tlb_shootpage(pmap_kernel(), va);
855 pmap_tlb_shootwait();
856 }
857 }
858
859 /*
860 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
861 *
862 * => no need to lock anything
863 * => caller must dispose of any vm_page mapped in the va range
864 * => note: not an inline function
865 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
866 */
867
868 void
869 pmap_kremove(vaddr_t sva, vsize_t len)
870 {
871 uint32_t bits;
872 vaddr_t va, eva;
873
874 eva = sva + len;
875
876 for (va = sva; va != eva; va += PAGE_SIZE) {
877 bits = pmap_pte_set(va, 0, 0);
878 #ifdef DIAGNOSTIC
879 if (bits & PG_PVLIST)
880 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", va);
881 #endif
882 }
883 pmap_tlb_shootrange(pmap_kernel(), sva, eva);
884 pmap_tlb_shootwait();
885 }
886
887 /*
888 * Allocate a new PD for Intel's U-K.
889 */
890 void
891 pmap_alloc_pdir_intel_x86(struct pmap *pmap)
892 {
893 vaddr_t va;
894
895 KASSERT(pmap->pm_pdir_intel == 0);
896
897 va = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_zero, &kd_waitok);
898 if (va == 0)
899 panic("kernel_map out of virtual space");
900 pmap->pm_pdir_intel = va;
901 if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
902 &pmap->pm_pdirpa_intel))
903 panic("can't locate PD page");
904 }
905
906 /*
907 * p m a p i n i t f u n c t i o n s
908 *
909 * pmap_bootstrap and pmap_init are called during system startup
910 * to init the pmap module. pmap_bootstrap() does a low level
911 * init just to get things rolling. pmap_init() finishes the job.
912 */
913
914 /*
915 * pmap_bootstrap: get the system in a state where it can run with VM
916 * properly enabled (called before main()). the VM system is
917 * fully init'd later...
918 *
919 * => on i386, locore.s has already enabled the MMU by allocating
920 * a PDP for the kernel, and nkpde PTPs for the kernel.
921 * => kva_start is the first free virtual address in kernel space
922 */
923
924 void
925 pmap_bootstrap(vaddr_t kva_start)
926 {
927 struct pmap *kpm;
928 vaddr_t kva;
929 pt_entry_t *pte;
930
931 /*
932 * set the page size (default value is 4K which is ok)
933 */
934
935 uvm_setpagesize();
936
937 /*
938 * a quick sanity check
939 */
940
941 if (PAGE_SIZE != NBPG)
942 panic("pmap_bootstrap: PAGE_SIZE != NBPG");
943
944 /*
945 * set up our local static global vars that keep track of the
946 * usage of KVM before kernel_map is set up
947 */
948
949 virtual_avail = kva_start; /* first free KVA */
950 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
951
952 /*
953 * set up protection_codes: we need to be able to convert from
954 * a MI protection code (some combo of VM_PROT...) to something
955 * we can jam into a i386 PTE.
956 */
957
958 protection_codes[PROT_NONE] = 0; /* --- */
959 protection_codes[PROT_EXEC] = PG_X; /* --x */
960 protection_codes[PROT_READ] = PG_RO; /* -r- */
961 protection_codes[PROT_READ | PROT_EXEC] = PG_X; /* -rx */
962 protection_codes[PROT_WRITE] = PG_RW; /* w-- */
963 protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW|PG_X; /* w-x */
964 protection_codes[PROT_READ | PROT_WRITE] = PG_RW; /* wr- */
965 protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW|PG_X; /* wrx */
966
967 /*
968 * now we init the kernel's pmap
969 *
970 * the kernel pmap's pm_obj is not used for much. however, in
971 * user pmaps the pm_obj contains the list of active PTPs.
972 * the pm_obj currently does not have a pager. it might be possible
973 * to add a pager that would allow a process to read-only mmap its
974 * own page tables (fast user level vtophys?). this may or may not
975 * be useful.
976 */
977
978 kpm = pmap_kernel();
979 mtx_init(&kpm->pm_mtx, -1); /* must not be used */
980 mtx_init(&kpm->pm_apte_mtx, IPL_VM);
981 uvm_obj_init(&kpm->pm_obj, &pmap_pager, 1);
982 bzero(&kpm->pm_list, sizeof(kpm->pm_list)); /* pm_list not used */
983 kpm->pm_pdir = (vaddr_t)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
984 kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
985 kpm->pm_pdir_intel = 0;
986 kpm->pm_pdirpa_intel = 0;
987 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
988 atop(kva_start - VM_MIN_KERNEL_ADDRESS);
989
990 /*
991 * the above is just a rough estimate and not critical to the proper
992 * operation of the system.
993 */
994
995 /*
996 * enable global TLB entries if they are supported and the
997 * CPU is not affected by Meltdown.
998 */
999
1000 if (cpu_feature & CPUID_PGE) {
1001 lcr4(rcr4() | CR4_PGE); /* enable hardware (via %cr4) */
1002 pmap_pg_g = pg_g_kern; /* if safe to use, enable software */
1003
1004 /* add PG_G attribute to already mapped kernel pages */
1005 for (kva = VM_MIN_KERNEL_ADDRESS; kva < virtual_avail;
1006 kva += PAGE_SIZE)
1007 if (pmap_valid_entry(PTE_BASE[atop(kva)]))
1008 PTE_BASE[atop(kva)] |= pmap_pg_g;
1009 }
1010
1011 /*
1012 * now we allocate the "special" VAs which are used for tmp mappings
1013 * by the pmap (and other modules). we allocate the VAs by advancing
1014 * virtual_avail (note that there are no pages mapped at these VAs).
1015 * we find the PTE that maps the allocated VA via the linear PTE
1016 * mapping.
1017 */
1018
1019 pte = PTE_BASE + atop(virtual_avail);
1020
1021 #ifdef MULTIPROCESSOR
1022 /*
1023 * Waste some VA space to avoid false sharing of cache lines
1024 * for page table pages: Give each possible CPU a cache line
1025  * of PTEs (16) to play with, though we only need 5. We could
1026 * recycle some of this waste by putting the idle stacks here
1027 * as well; we could waste less space if we knew the largest
1028 * CPU ID beforehand.
1029 */
1030 pmap_csrcp = (caddr_t) virtual_avail; csrc_pte = pte;
1031
1032 pmap_cdstp = (caddr_t) virtual_avail+PAGE_SIZE; cdst_pte = pte+1;
1033
1034 pmap_zerop = (caddr_t) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2;
1035
1036 pmap_ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3;
1037
1038 pmap_flshp = (caddr_t) virtual_avail+PAGE_SIZE*4; flsh_pte = pte+4;
1039
1040 virtual_avail += PAGE_SIZE * MAXCPUS * NPTECL;
1041 pte += MAXCPUS * NPTECL;
1042 #else
1043 pmap_csrcp = (caddr_t) virtual_avail; csrc_pte = pte; /* allocate */
1044 virtual_avail += PAGE_SIZE; pte++; /* advance */
1045
1046 pmap_cdstp = (caddr_t) virtual_avail; cdst_pte = pte;
1047 virtual_avail += PAGE_SIZE; pte++;
1048
1049 pmap_zerop = (caddr_t) virtual_avail; zero_pte = pte;
1050 virtual_avail += PAGE_SIZE; pte++;
1051
1052 pmap_ptpp = (caddr_t) virtual_avail; ptp_pte = pte;
1053 virtual_avail += PAGE_SIZE; pte++;
1054
1055 pmap_flshp = (caddr_t) virtual_avail; flsh_pte = pte;
1056 virtual_avail += PAGE_SIZE; pte++;
1057 #endif
1058
1059 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1060 vmmap = (char *)virtual_avail; /* don't need pte */
1061 virtual_avail += PAGE_SIZE;
1062
1063 msgbufp = (struct msgbuf *)virtual_avail; /* don't need pte */
1064 virtual_avail += round_page(MSGBUFSIZE); pte++;
1065
1066 bootargp = (bootarg_t *)virtual_avail;
1067 virtual_avail += round_page(bootargc); pte++;
1068
1069 /*
1070 * now we reserve some VM for mapping pages when doing a crash dump
1071 */
1072
1073 virtual_avail = reserve_dumppages(virtual_avail);
1074
1075 /*
1076 * init the static-global locks and global lists.
1077 */
1078
1079 LIST_INIT(&pmaps);
1080
1081 /*
1082 * initialize the pmap pool.
1083 */
1084
1085 pool_init(&pmap_pmap_pool, sizeof(struct pmap), 32, IPL_NONE, 0,
1086 "pmappl", NULL);
1087 pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0,
1088 "pvpl", &pmap_pv_page_allocator);
1089
1090 /*
1091 * ensure the TLB is sync'd with reality by flushing it...
1092 */
1093
1094 tlbflush();
1095 }
1096
1097 /*
1098 * Pre-allocate PTP 0 for low memory, so that 1:1 mappings for various
1099 * trampoline code can be entered.
1100 */
1101 void
1102 pmap_prealloc_lowmem_ptp(void)
1103 {
1104 pt_entry_t *pte, npte;
1105 vaddr_t ptpva = (vaddr_t)vtopte(0);
1106
1107 /* If PAE, use the PAE-specific preallocator */
1108 if (cpu_pae) {
1109 pmap_prealloc_lowmem_ptp_pae();
1110 return;
1111 }
1112
1113 /* enter pa for pte 0 into recursive map */
1114 pte = vtopte(ptpva);
1115 npte = PTP0_PA | PG_RW | PG_V | PG_U | PG_M;
1116
1117 i386_atomic_testset_ul(pte, npte);
1118
1119 /* make sure it is clean before using */
1120 memset((void *)ptpva, 0, NBPG);
1121 }
1122
1123 /*
1124 * pmap_init: called from uvm_init, our job is to get the pmap
1125 * system ready to manage mappings... this mainly means initing
1126 * the pv_entry stuff.
1127 */
1128
1129 void
1130 pmap_init(void)
1131 {
1132 /*
1133 * prime the pool with pv_entry structures to allow us to get
1134 * the kmem_map allocated and inited (done after this function
1135 * is finished). we do this by setting a low water mark such
1136 * that we are more likely to have these around in extreme
1137 * memory starvation.
1138 */
1139
1140 pool_setlowat(&pmap_pv_pool, PVE_LOWAT);
1141 pool_sethiwat(&pmap_pv_pool, PVE_HIWAT);
1142
1143 /*
1144 * done: pmap module is up (and ready for business)
1145 */
1146
1147 pmap_initialized = 1;
1148 }
1149
1150 /*
1151 * p v _ e n t r y f u n c t i o n s
1152 */
1153
1154 void *
1155 pmap_pv_page_alloc(struct pool *pp, int flags, int *slowdown)
1156 {
1157 struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1158
1159 kd.kd_waitok = ISSET(flags, PR_WAITOK);
1160 kd.kd_slowdown = slowdown;
1161
1162 return (km_alloc(pp->pr_pgsize,
1163 pmap_initialized ? &kv_page : &kv_any, pp->pr_crange, &kd));
1164 }
1165
1166 void
1167 pmap_pv_page_free(struct pool *pp, void *v)
1168 {
1169 km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange);
1170 }
1171
1172 /*
1173 * main pv_entry manipulation functions:
1174 * pmap_enter_pv: enter a mapping onto a pv list
1175 * pmap_remove_pv: remove a mapping from a pv list
1176 */
1177
1178 /*
1179 * pmap_enter_pv: enter a mapping onto a pv list
1180 *
1181 * => caller should have pmap locked
1182 * => we will gain the lock on the pv and allocate the new pv_entry
1183 * => caller should adjust ptp's wire_count before calling
1184 *
1185 * pve: preallocated pve for us to use
1186 * ptp: PTP in pmap that maps this VA
1187 */
1188
1189 void
1190 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
1191 vaddr_t va, struct vm_page *ptp)
1192 {
1193 pve->pv_pmap = pmap;
1194 pve->pv_va = va;
1195 pve->pv_ptp = ptp; /* NULL for kernel pmap */
1196 mtx_enter(&pg->mdpage.pv_mtx);
1197 pve->pv_next = pg->mdpage.pv_list; /* add to ... */
1198 pg->mdpage.pv_list = pve; /* ... locked list */
1199 mtx_leave(&pg->mdpage.pv_mtx);
1200 }
1201
1202 /*
1203 * pmap_remove_pv: try to remove a mapping from a pv_list
1204 *
1205 * => pmap should be locked
1206 * => caller should hold lock on pv [so that attrs can be adjusted]
1207 * => caller should adjust ptp's wire_count and free PTP if needed
1208 * => we return the removed pve
1209 */
1210
1211 struct pv_entry *
1212 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
1213 {
1214 struct pv_entry *pve, **prevptr;
1215
1216 mtx_enter(&pg->mdpage.pv_mtx);
1217 prevptr = &pg->mdpage.pv_list; /* previous pv_entry pointer */
1218 while ((pve = *prevptr) != NULL) {
1219 if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */
1220 *prevptr = pve->pv_next; /* remove it! */
1221 break;
1222 }
1223 prevptr = &pve->pv_next; /* previous pointer */
1224 }
1225 mtx_leave(&pg->mdpage.pv_mtx);
1226 return(pve); /* return removed pve */
1227 }
1228
1229 /*
1230 * p t p f u n c t i o n s
1231 */
1232
1233 /*
1234 * pmap_alloc_ptp: allocate a PTP for a PMAP
1235 *
1236 * => pmap should already be locked by caller
1237 * => we use the ptp's wire_count to count the number of active mappings
1238 * in the PTP (we start it at one to prevent any chance this PTP
1239 * will ever leak onto the active/inactive queues)
1240 */
1241
1242 struct vm_page *
1243 pmap_alloc_ptp_86(struct pmap *pmap, int pde_index, pt_entry_t pde_flags)
1244 {
1245 struct vm_page *ptp;
1246 pd_entry_t *pva_intel;
1247
1248 ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
1249 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1250 if (ptp == NULL)
1251 return (NULL);
1252
1253 /* got one! */
1254 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
1255 ptp->wire_count = 1; /* no mappings yet */
1256 PDE(pmap, pde_index) = (pd_entry_t)(VM_PAGE_TO_PHYS(ptp) |
1257 PG_RW | PG_V | PG_M | PG_U | pde_flags);
1258
1259 /*
1260 * Meltdown special case - if we are adding a new PDE for
1261 * usermode addresses, just copy the PDE to the U-K page
1262 * table.
1263 */
1264 if (pmap->pm_pdir_intel && ptp_i2v(pde_index) < VM_MAXUSER_ADDRESS) {
1265 pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
1266 pva_intel[pde_index] = PDE(pmap, pde_index);
1267 DPRINTF("%s: copying usermode PDE (content=0x%x) pde_index %d "
1268 "from 0x%x -> 0x%x\n", __func__, PDE(pmap, pde_index),
1269 pde_index, (uint32_t)&PDE(pmap, pde_index),
1270 (uint32_t)&(pva_intel[pde_index]));
1271 }
1272
1273 pmap->pm_stats.resident_count++; /* count PTP as resident */
1274 pmap->pm_ptphint = ptp;
1275 return(ptp);
1276 }
1277
1278 /*
1279 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1280 *
1281 * => pmap should NOT be pmap_kernel()
1282 * => pmap should be locked
1283 */
1284
1285 struct vm_page *
1286 pmap_get_ptp_86(struct pmap *pmap, int pde_index)
1287 {
1288 struct vm_page *ptp;
1289
1290 if (pmap_valid_entry(PDE(pmap, pde_index))) {
1291 /* valid... check hint (saves us a PA->PG lookup) */
1292 if (pmap->pm_ptphint &&
1293 (PDE(pmap, pde_index) & PG_FRAME) ==
1294 VM_PAGE_TO_PHYS(pmap->pm_ptphint))
1295 return(pmap->pm_ptphint);
1296
1297 ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
1298 #ifdef DIAGNOSTIC
1299 if (ptp == NULL)
1300 panic("pmap_get_ptp_86: unmanaged user PTP");
1301 #endif
1302 pmap->pm_ptphint = ptp;
1303 return(ptp);
1304 }
1305
1306 /* allocate a new PTP (updates ptphint) */
1307 return (pmap_alloc_ptp_86(pmap, pde_index, PG_u));
1308 }
1309
1310 void
1311 pmap_drop_ptp_86(struct pmap *pm, vaddr_t va, struct vm_page *ptp,
1312 pt_entry_t *ptes)
1313 {
1314 pd_entry_t *pva_intel;
1315
1316 i386_atomic_testset_ul(&PDE(pm, pdei(va)), 0);
1317 pmap_tlb_shootpage(curcpu()->ci_curpmap, ((vaddr_t)ptes) + ptp->offset);
1318 #ifdef MULTIPROCESSOR
1319 /*
1320 * Always shoot down the other pmap's
1321 * self-mapping of the PTP.
1322 */
1323 pmap_tlb_shootpage(pm, ((vaddr_t)PTE_BASE) + ptp->offset);
1324 #endif
1325 pm->pm_stats.resident_count--;
1326 /* update hint */
1327 if (pm->pm_ptphint == ptp)
1328 pm->pm_ptphint = RBT_ROOT(uvm_objtree, &pm->pm_obj.memt);
1329 ptp->wire_count = 0;
1330 /* Postpone free to after shootdown. */
1331 uvm_pagerealloc(ptp, NULL, 0);
1332
1333 if (pm->pm_pdir_intel) {
1334 KASSERT(va < VM_MAXUSER_ADDRESS);
1335 /* Zap special meltdown PDE */
1336 pva_intel = (pd_entry_t *)pm->pm_pdir_intel;
1337 i386_atomic_testset_ul(&pva_intel[pdei(va)], 0);
1338 DPRINTF("%s: cleared meltdown PDE @ index %lu "
1339 "(va range start 0x%x)\n", __func__, pdei(va),
1340 (uint32_t)va);
1341 }
1342 }
1343
1344 /*
1345 * p m a p l i f e c y c l e f u n c t i o n s
1346 */
1347
1348 /*
1349 * pmap_create: create a pmap
1350 *
1351  * => note: old pmap interface took a "size" arg which allowed for
1352 * the creation of "software only" pmaps (not in bsd).
1353 */
1354
1355 struct pmap *
1356 pmap_create(void)
1357 {
1358 struct pmap *pmap;
1359
1360 pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
1361
1362 mtx_init(&pmap->pm_mtx, IPL_VM);
1363 mtx_init(&pmap->pm_apte_mtx, IPL_VM);
1364
1365 /* init uvm_object */
1366 uvm_obj_init(&pmap->pm_obj, &pmap_pager, 1);
1367 pmap->pm_stats.wired_count = 0;
1368 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */
1369 pmap->pm_ptphint = NULL;
1370 pmap->pm_hiexec = 0;
1371 pmap->pm_flags = 0;
1372 pmap->pm_pdir_intel = 0;
1373 pmap->pm_pdirpa_intel = 0;
1374
1375 initcodesegment(&pmap->pm_codeseg);
1376
1377 pmap_pinit_pd(pmap);
1378 return (pmap);
1379 }
1380
1381 void
1382 pmap_pinit_pd_86(struct pmap *pmap)
1383 {
1384 /* allocate PDP */
1385 pmap->pm_pdir = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_dirty, &kd_waitok);
1386 if (pmap->pm_pdir == 0)
1387 panic("kernel_map out of virtual space");
1388 pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir,
1389 &pmap->pm_pdirpa);
1390 pmap->pm_pdirsize = NBPG;
1391
1392 /* init PDP */
1393 /* zero init area */
1394 bzero((void *)pmap->pm_pdir, PDSLOT_PTE * sizeof(pd_entry_t));
1395 /* put in recursive PDE to map the PTEs */
1396 PDE(pmap, PDSLOT_PTE) = pmap->pm_pdirpa | PG_V | PG_KW | PG_U | PG_M;
1397 PDE(pmap, PDSLOT_PTE + 1) = 0;
1398
1399 /*
1400 * we need to lock pmaps_lock to prevent nkpde from changing on
1401 * us. note that there is no need to splvm to protect us from
1402 * malloc since malloc allocates out of a submap and we should have
1403 * already allocated kernel PTPs to cover the range...
1404 */
1405 /* put in kernel VM PDEs */
1406 bcopy(&PDP_BASE[PDSLOT_KERN], &PDE(pmap, PDSLOT_KERN),
1407 nkpde * sizeof(pd_entry_t));
1408 /* zero the rest */
1409 bzero(&PDE(pmap, PDSLOT_KERN + nkpde),
1410 NBPG - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
1411
1412 /*
1413 * Intel CPUs need a special page table to be used during usermode
1414 * execution, one that lacks all kernel mappings.
1415 */
1416 if (cpu_meltdown) {
1417 pmap_alloc_pdir_intel_x86(pmap);
1418
1419 /* Copy PDEs from pmap_kernel's U-K view */
1420 bcopy((void *)pmap_kernel()->pm_pdir_intel,
1421 (void *)pmap->pm_pdir_intel, NBPG);
1422
1423 DPRINTF("%s: pmap %p pm_pdir 0x%lx pm_pdirpa 0x%lx "
1424 "pdir_intel 0x%lx pdirpa_intel 0x%lx\n",
1425 __func__, pmap, pmap->pm_pdir, pmap->pm_pdirpa,
1426 pmap->pm_pdir_intel, pmap->pm_pdirpa_intel);
1427 }
1428
1429 mtx_enter(&pmaps_lock);
1430 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1431 mtx_leave(&pmaps_lock);
1432 }
1433
1434 /*
1435 * pmap_destroy: drop reference count on pmap. free pmap if
1436 * reference count goes to zero.
1437 */
1438
1439 void
1440 pmap_destroy(struct pmap *pmap)
1441 {
1442 struct vm_page *pg;
1443 int refs;
1444
1445 refs = atomic_dec_int_nv(&pmap->pm_obj.uo_refs);
1446 if (refs > 0)
1447 return;
1448
1449 #ifdef MULTIPROCESSOR
1450 pmap_tlb_droppmap(pmap);
1451 #endif
1452
1453 mtx_enter(&pmaps_lock);
1454 LIST_REMOVE(pmap, pm_list);
1455 mtx_leave(&pmaps_lock);
1456
1457 /* Free any remaining PTPs. */
1458 while ((pg = RBT_ROOT(uvm_objtree, &pmap->pm_obj.memt)) != NULL) {
1459 pg->wire_count = 0;
1460 uvm_pagefree(pg);
1461 }
1462
1463 km_free((void *)pmap->pm_pdir, pmap->pm_pdirsize, &kv_any, &kp_dirty);
1464 pmap->pm_pdir = 0;
1465
1466 if (pmap->pm_pdir_intel) {
1467 km_free((void *)pmap->pm_pdir_intel, pmap->pm_pdirsize,
1468 &kv_any, &kp_dirty);
1469 pmap->pm_pdir_intel = 0;
1470 }
1471
1472 pool_put(&pmap_pmap_pool, pmap);
1473 }
1474
1475
1476 /*
1477 * Add a reference to the specified pmap.
1478 */
1479
1480 void
1481 pmap_reference(struct pmap *pmap)
1482 {
1483 atomic_inc_int(&pmap->pm_obj.uo_refs);
1484 }
1485
1486 void
1487 pmap_activate(struct proc *p)
1488 {
1489 KASSERT(curproc == p);
1490 KASSERT(&p->p_addr->u_pcb == curpcb);
1491 pmap_switch(NULL, p);
1492 }
1493
1494 int nlazy_cr3_hit;
1495 int nlazy_cr3;
1496
1497 void
1498 pmap_switch(struct proc *o, struct proc *p)
1499 {
1500 struct pcb *pcb = &p->p_addr->u_pcb;
1501 struct pmap *pmap, *opmap;
1502 struct cpu_info *self = curcpu();
1503
1504 pmap = p->p_vmspace->vm_map.pmap;
1505 opmap = self->ci_curpmap;
1506
1507 pcb->pcb_pmap = pmap;
1508 pcb->pcb_cr3 = pmap->pm_pdirpa;
1509
1510 if (opmap == pmap) {
1511 if (pmap != pmap_kernel())
1512 nlazy_cr3_hit++;
1513 } else if (o != NULL && pmap == pmap_kernel()) {
1514 nlazy_cr3++;
1515 } else {
1516 self->ci_curpmap = pmap;
1517 lcr3(pmap->pm_pdirpa);
1518 }
1519
1520 /*
1521 * Meltdown: iff we're doing separate U+K and U-K page tables,
1522 * then record them in cpu_info for easy access in syscall and
1523 * interrupt trampolines.
1524 */
1525 if (pmap->pm_pdirpa_intel) {
1526 self->ci_kern_cr3 = pmap->pm_pdirpa;
1527 self->ci_user_cr3 = pmap->pm_pdirpa_intel;
1528 }
1529
1530 /*
1531 * Set the correct descriptor value (i.e. with the
1532 * correct code segment X limit) in the GDT.
1533 */
1534 self->ci_gdt[GUCODE_SEL].sd = pmap->pm_codeseg;
1535 self->ci_gdt[GUFS_SEL].sd = pcb->pcb_threadsegs[TSEG_FS];
1536 self->ci_gdt[GUGS_SEL].sd = pcb->pcb_threadsegs[TSEG_GS];
1537 }
1538
1539 void
1540 pmap_deactivate(struct proc *p)
1541 {
1542 }
1543
1544 /*
1545 * pmap_extract: extract a PA for the given VA
1546 */
1547
1548 int
1549 pmap_extract_86(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1550 {
1551 pt_entry_t *ptes, pte;
1552
1553 ptes = pmap_map_ptes_86(pmap);
1554 if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1555 pte = ptes[atop(va)];
1556 pmap_unmap_ptes_86(pmap);
1557 if (!pmap_valid_entry(pte))
1558 return 0;
1559 if (pap != NULL)
1560 *pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
1561 return 1;
1562 }
1563 pmap_unmap_ptes_86(pmap);
1564 return 0;
1565 }
1566
1567 /*
1568 * pmap_virtual_space: used during bootup [uvm_pageboot_alloc] to
1569 * determine the bounds of the kernel virtual address space.
1570 */
1571
1572 void
1573 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
1574 {
1575 *startp = virtual_avail;
1576 *endp = virtual_end;
1577 }
1578
1579 /*
1580 * pmap_zero_page: zero a page
1581 */
1582 void (*pagezero)(void *, size_t) = bzero;
1583
1584 void
1585 pmap_zero_page(struct vm_page *pg)
1586 {
1587 pmap_zero_phys(VM_PAGE_TO_PHYS(pg));
1588 }
1589
1590 /*
1591 * pmap_zero_phys: same as pmap_zero_page, but for use before vm_pages are
1592 * initialized.
1593 */
1594 void
1595 pmap_zero_phys_86(paddr_t pa)
1596 {
1597 #ifdef MULTIPROCESSOR
1598 int id = cpu_number();
1599 #endif
1600 pt_entry_t *zpte = PTESLEW(zero_pte, id);
1601 caddr_t zerova = VASLEW(pmap_zerop, id);
1602
1603 #ifdef DIAGNOSTIC
1604 if (*zpte)
1605 panic("pmap_zero_phys_86: lock botch");
1606 #endif
1607
1608 *zpte = (pa & PG_FRAME) | PG_V | PG_RW; /* map in */
1609 pmap_update_pg((vaddr_t)zerova); /* flush TLB */
1610 pagezero(zerova, PAGE_SIZE); /* zero */
1611 *zpte = 0;
1612 }
1613
1614 /*
1615 * pmap_flush_cache: flush the cache for a virtual address.
1616 */
1617 void
1618 pmap_flush_cache(vaddr_t addr, vsize_t len)
1619 {
1620 vaddr_t i;
1621
1622 if (curcpu()->ci_cflushsz == 0) {
1623 wbinvd_on_all_cpus();
1624 return;
1625 }
1626
1627 mfence();
1628 for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
1629 clflush(i);
1630 mfence();
1631 }
1632
1633 void
1634 pmap_flush_page(paddr_t pa)
1635 {
1636 #ifdef MULTIPROCESSOR
1637 int id = cpu_number();
1638 #endif
1639 pt_entry_t *pte;
1640 caddr_t va;
1641
1642 KDASSERT(PHYS_TO_VM_PAGE(pa) != NULL);
1643
1644 if (cpu_pae) {
1645 pmap_flush_page_pae(pa);
1646 return;
1647 }
1648
1649 pte = PTESLEW(flsh_pte, id);
1650 va = VASLEW(pmap_flshp, id);
1651
1652 #ifdef DIAGNOSTIC
1653 if (*pte)
1654 panic("pmap_flush_page: lock botch");
1655 #endif
1656
1657 *pte = (pa & PG_FRAME) | PG_V | PG_RW;
1658 pmap_update_pg(va);
1659 pmap_flush_cache((vaddr_t)va, PAGE_SIZE);
1660 *pte = 0;
1661 pmap_update_pg(va);
1662 }
1663
1664 /*
1665 * pmap_copy_page: copy a page
1666 */
1667
1668 void
1669 pmap_copy_page_86(struct vm_page *srcpg, struct vm_page *dstpg)
1670 {
1671 paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg);
1672 paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg);
1673 #ifdef MULTIPROCESSOR
1674 int id = cpu_number();
1675 #endif
1676 pt_entry_t *spte = PTESLEW(csrc_pte, id);
1677 pt_entry_t *dpte = PTESLEW(cdst_pte, id);
1678 caddr_t csrcva = VASLEW(pmap_csrcp, id);
1679 caddr_t cdstva = VASLEW(pmap_cdstp, id);
1680
1681 #ifdef DIAGNOSTIC
1682 if (*spte || *dpte)
1683 panic("pmap_copy_page_86: lock botch");
1684 #endif
1685
1686 *spte = (srcpa & PG_FRAME) | PG_V | PG_RW;
1687 *dpte = (dstpa & PG_FRAME) | PG_V | PG_RW;
1688 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1689 bcopy(csrcva, cdstva, PAGE_SIZE);
1690 *spte = *dpte = 0;
1691 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1692 }
1693
1694 /*
1695 * p m a p r e m o v e f u n c t i o n s
1696 *
1697 * functions that remove mappings
1698 */
1699
1700 /*
1701 * pmap_remove_ptes: remove PTEs from a PTP
1702 *
1703 * => caller must hold pmap's lock
1704 * => PTP must be mapped into KVA
1705 * => PTP should be null if pmap == pmap_kernel()
1706 */
1707
1708 void
1709 pmap_remove_ptes_86(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1710 vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1711 {
1712 struct pv_entry *pve;
1713 pt_entry_t *pte = (pt_entry_t *) ptpva;
1714 struct vm_page *pg;
1715 pt_entry_t opte;
1716
1717 /*
1718 * note that ptpva points to the PTE that maps startva. this may
1719 * or may not be the first PTE in the PTP.
1720 *
1721 * we loop through the PTP while there are still PTEs to look at
1722 * and the wire_count is greater than 1 (because we use the wire_count
1723 * to keep track of the number of real PTEs in the PTP).
1724 */
1725
1726 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1727 ; pte++, startva += NBPG) {
1728 if (!pmap_valid_entry(*pte))
1729 continue; /* VA not mapped */
1730
1731 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W))
1732 continue;
1733
1734 /* atomically save the old PTE and zero it */
1735 opte = i386_atomic_testset_ul(pte, 0);
1736
1737 if (opte & PG_W)
1738 pmap->pm_stats.wired_count--;
1739 pmap->pm_stats.resident_count--;
1740
1741 if (ptp)
1742 ptp->wire_count--; /* dropping a PTE */
1743
1744 /*
1745 * Unnecessary work if not PG_PVLIST.
1746 */
1747 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1748
1749 /*
1750 * if we are not on a pv list we are done.
1751 */
1752 if ((opte & PG_PVLIST) == 0) {
1753 #ifdef DIAGNOSTIC
1754 if (pg != NULL)
1755 panic("pmap_remove_ptes_86: managed page "
1756 "without PG_PVLIST for 0x%lx", startva);
1757 #endif
1758 continue;
1759 }
1760
1761 #ifdef DIAGNOSTIC
1762 if (pg == NULL)
1763 panic("pmap_remove_ptes_86: unmanaged page marked "
1764 "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
1765 startva, (u_long)(opte & PG_FRAME));
1766 #endif
1767
1768 /* sync R/M bits */
1769 pmap_sync_flags_pte_86(pg, opte);
1770 pve = pmap_remove_pv(pg, pmap, startva);
1771 if (pve) {
1772 pve->pv_next = *free_pvs;
1773 *free_pvs = pve;
1774 }
1775
1776 /* end of "for" loop: time for next pte */
1777 }
1778 }
1779
1780 /*
1781 * pmap_remove: top level mapping removal function
1782 *
1783 * => caller should not be holding any pmap locks
1784 */
1785
1786 void
1787 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1788 {
1789 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
1790 }
1791
1792 void
1793 pmap_do_remove_86(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1794 {
1795 pt_entry_t *ptes;
1796 paddr_t ptppa;
1797 vaddr_t blkendva;
1798 struct vm_page *ptp;
1799 struct pv_entry *pve;
1800 struct pv_entry *free_pvs = NULL;
1801 TAILQ_HEAD(, vm_page) empty_ptps;
1802 int shootall;
1803 vaddr_t va;
1804
1805 TAILQ_INIT(&empty_ptps);
1806
1807 ptes = pmap_map_ptes_86(pmap); /* locks pmap */
1808
1809 /*
1810 * Decide if we want to shoot the whole tlb or just the range.
1811 * Right now, we simply shoot everything when we remove more
1812 * than 32 pages, but never in the kernel pmap. XXX - tune.
1813 */
1814 if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1815 shootall = 1;
1816 else
1817 shootall = 0;
1818
1819 for (va = sva ; va < eva ; va = blkendva) {
1820 /* determine range of block */
1821 blkendva = i386_round_pdr(va + 1);
1822 if (blkendva > eva)
1823 blkendva = eva;
1824
1825 /*
1826 * XXXCDC: our PTE mappings should never be removed
1827 * with pmap_remove! if we allow this (and why would
1828 * we?) then we end up freeing the pmap's page
1829 * directory page (PDP) before we are finished using
1830 * it when we hit it in the recursive mapping. this
1831 * is BAD.
1832 *
1833 * long term solution is to move the PTEs out of user
1834 * address space and into kernel address space (up
1835 * with APTE). then we can set VM_MAXUSER_ADDRESS to
1836 * be VM_MAX_ADDRESS.
1837 */
1838
1839 if (pdei(va) == PDSLOT_PTE)
1840 /* XXXCDC: ugly hack to avoid freeing PDP here */
1841 continue;
1842
1843 if (!pmap_valid_entry(PDE(pmap, pdei(va))))
1844 /* valid block? */
1845 continue;
1846
1847 /* PA of the PTP */
1848 ptppa = PDE(pmap, pdei(va)) & PG_FRAME;
1849
1850 /* get PTP if non-kernel mapping */
1851 if (pmap == pmap_kernel()) {
1852 /* we never free kernel PTPs */
1853 ptp = NULL;
1854 } else {
1855 if (pmap->pm_ptphint &&
1856 VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
1857 ptp = pmap->pm_ptphint;
1858 } else {
1859 ptp = PHYS_TO_VM_PAGE(ptppa);
1860 #ifdef DIAGNOSTIC
1861 if (ptp == NULL)
1862 panic("pmap_do_remove_86: unmanaged "
1863 "PTP detected");
1864 #endif
1865 }
1866 }
1867 pmap_remove_ptes_86(pmap, ptp, (vaddr_t)&ptes[atop(va)],
1868 va, blkendva, flags, &free_pvs);
1869
1870 /* If PTP is no longer being used, free it. */
1871 if (ptp && ptp->wire_count <= 1) {
1872 pmap_drop_ptp_86(pmap, va, ptp, ptes);
1873 TAILQ_INSERT_TAIL(&empty_ptps, ptp, pageq);
1874 }
1875
1876 if (!shootall)
1877 pmap_tlb_shootrange(pmap, va, blkendva);
1878 }
1879
1880 if (shootall)
1881 pmap_tlb_shoottlb();
1882
1883 pmap_unmap_ptes_86(pmap);
1884 pmap_tlb_shootwait();
1885
1886 while ((pve = free_pvs) != NULL) {
1887 free_pvs = pve->pv_next;
1888 pool_put(&pmap_pv_pool, pve);
1889 }
1890
1891 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1892 TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1893 uvm_pagefree(ptp);
1894 }
1895 }
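
/*
 * Worked example (annotation) of the per-PDE block iteration above:
 * with 4KB pages and 4MB PDEs, removing sva=0x003ff000..eva=0x00402000
 * is split at the 4MB boundary. i386_round_pdr(0x003ff000 + 1) yields
 * 0x00400000, so the first pass covers one page in the first PTP and
 * the second pass covers the remaining two pages in the next PTP.
 */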
1896
1897 /*
1898 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1899 *
1900 * => R/M bits are sync'd back to attrs
1901 */
1902
1903 void
1904 pmap_page_remove_86(struct vm_page *pg)
1905 {
1906 struct pv_entry *pve;
1907 struct pmap *pm;
1908 pt_entry_t *ptes, opte;
1909 TAILQ_HEAD(, vm_page) empty_ptps;
1910 struct vm_page *ptp;
1911
1912 if (pg->mdpage.pv_list == NULL)
1913 return;
1914
1915 TAILQ_INIT(&empty_ptps);
1916
1917 mtx_enter(&pg->mdpage.pv_mtx);
1918 while ((pve = pg->mdpage.pv_list) != NULL) {
1919 pmap_reference(pve->pv_pmap);
1920 pm = pve->pv_pmap;
1921 mtx_leave(&pg->mdpage.pv_mtx);
1922
1923 ptes = pmap_map_ptes_86(pm); /* locks pmap */
1924
1925 /*
1926 * We dropped the pvlist lock before grabbing the pmap
1927 * lock to avoid lock ordering problems. This means
1928 * we have to check the pvlist again since somebody
1929 * else might have modified it. All we care about is
1930 * that the pvlist entry matches the pmap we just
1931 * locked. If it doesn't, unlock the pmap and try
1932 * again.
1933 */
1934 mtx_enter(&pg->mdpage.pv_mtx);
1935 if ((pve = pg->mdpage.pv_list) == NULL ||
1936 pve->pv_pmap != pm) {
1937 mtx_leave(&pg->mdpage.pv_mtx);
1938 pmap_unmap_ptes_86(pm); /* unlocks pmap */
1939 pmap_destroy(pm);
1940 mtx_enter(&pg->mdpage.pv_mtx);
1941 continue;
1942 }
1943
1944 pg->mdpage.pv_list = pve->pv_next;
1945 mtx_leave(&pg->mdpage.pv_mtx);
1946
1947 #ifdef DIAGNOSTIC
1948 if (pve->pv_ptp && (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1949 PG_FRAME)
1950 != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1951 printf("pmap_page_remove_86: pg=%p: va=%lx, "
1952 "pv_ptp=%p\n",
1953 pg, pve->pv_va, pve->pv_ptp);
1954 printf("pmap_page_remove_86: PTP's phys addr: "
1955 "actual=%x, recorded=%lx\n",
1956 (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1957 PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
1958 panic("pmap_page_remove_86: mapped managed page has "
1959 "invalid pv_ptp field");
1960 }
1961 #endif
1962 opte = i386_atomic_testset_ul(&ptes[atop(pve->pv_va)], 0);
1963
1964 if (opte & PG_W)
1965 pve->pv_pmap->pm_stats.wired_count--;
1966 pve->pv_pmap->pm_stats.resident_count--;
1967
1968 /* sync R/M bits */
1969 pmap_sync_flags_pte_86(pg, opte);
1970
1971 /* update the PTP reference count. free if last reference. */
1972 if (pve->pv_ptp && --pve->pv_ptp->wire_count <= 1) {
1973 pmap_drop_ptp_86(pve->pv_pmap, pve->pv_va,
1974 pve->pv_ptp, ptes);
1975 TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, pageq);
1976 }
1977
1978 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1979
1980 pmap_unmap_ptes_86(pve->pv_pmap); /* unlocks pmap */
1981 pmap_destroy(pve->pv_pmap);
1982 pool_put(&pmap_pv_pool, pve);
1983 mtx_enter(&pg->mdpage.pv_mtx);
1984 }
1985 mtx_leave(&pg->mdpage.pv_mtx);
1986
1987 pmap_tlb_shootwait();
1988
1989 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1990 TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1991 uvm_pagefree(ptp);
1992 }
1993 }
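
/*
 * Usage sketch (an assumption about the usual caller, not from this
 * file): this is the "remove everything" case reached when all access
 * to a managed page is revoked, e.g. when the page is recycled:
 *
 *	pmap_page_protect(pg, PROT_NONE);	// ends up here and drops
 *						// every mapping of pg
 *
 * The exact dispatch lives in pmap.h (see the protection functions
 * below).
 */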
1994
1995 /*
1996 * p m a p a t t r i b u t e f u n c t i o n s
1997 * functions that test/change a managed page's attributes
1998 * since a page can be mapped multiple times we must check each PTE that
1999 * maps it by going down the pv lists.
2000 */
2001
2002 /*
2003 * pmap_test_attrs: test a page's attributes
2004 */
2005
2006 int
2007 pmap_test_attrs_86(struct vm_page *pg, int testbits)
2008 {
2009 struct pv_entry *pve;
2010 pt_entry_t *ptes, pte;
2011 u_long mybits, testflags;
2012 paddr_t ptppa;
2013
2014 testflags = pmap_pte2flags(testbits);
2015
2016 if (pg->pg_flags & testflags)
2017 return 1;
2018
2019 mybits = 0;
2020 mtx_enter(&pg->mdpage.pv_mtx);
2021 for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
2022 pve = pve->pv_next) {
2023 ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
2024 ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
2025 pte = ptes[ptei(pve->pv_va)];
2026 pmap_tmpunmap_pa();
2027 mybits |= (pte & testbits);
2028 }
2029 mtx_leave(&pg->mdpage.pv_mtx);
2030
2031 if (mybits == 0)
2032 return 0;
2033
2034 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
2035
2036 return 1;
2037 }
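
/*
 * Usage sketch (assumed wrappers, not defined in this file): the
 * reference/modify queries are thin wrappers over pmap_test_attrs(),
 * roughly:
 *
 *	modified   = pmap_test_attrs(pg, PG_M);	// has the page been written?
 *	referenced = pmap_test_attrs(pg, PG_U);	// has the page been accessed?
 *
 * Note that a set bit is sticky: once it is cached in pg->pg_flags the
 * function returns 1 without walking the pv list again.
 */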
2038
2039 /*
2040 * pmap_clear_attrs: change a page's attributes
2041 *
2042 * => we return 1 if we cleared one of the bits we were asked to
2043 */
2044
2045 int
2046 pmap_clear_attrs_86(struct vm_page *pg, int clearbits)
2047 {
2048 struct pv_entry *pve;
2049 pt_entry_t *ptes, opte;
2050 u_long clearflags;
2051 paddr_t ptppa;
2052 int result;
2053
2054 clearflags = pmap_pte2flags(clearbits);
2055
2056 result = pg->pg_flags & clearflags;
2057 if (result)
2058 atomic_clearbits_int(&pg->pg_flags, clearflags);
2059
2060 mtx_enter(&pg->mdpage.pv_mtx);
2061 for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
2062 ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
2063 ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
2064 #ifdef DIAGNOSTIC
2065 if (!pmap_valid_entry(PDE(pve->pv_pmap, pdei(pve->pv_va))))
2066 panic("pmap_clear_attrs_86: mapping without PTP "
2067 "detected");
2068 #endif
2069
2070 opte = ptes[ptei(pve->pv_va)];
2071 if (opte & clearbits) {
2072 result = 1;
2073 i386_atomic_clearbits_l(&ptes[ptei(pve->pv_va)],
2074 (opte & clearbits));
2075 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
2076 }
2077 pmap_tmpunmap_pa();
2078 }
2079 mtx_leave(&pg->mdpage.pv_mtx);
2080
2081 pmap_tlb_shootwait();
2082
2083 return (result != 0);
2084 }
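
/*
 * Usage sketch (assumed wrappers): clearing the R/M attributes, e.g.
 * on behalf of the page daemon, would look like:
 *
 *	(void)pmap_clear_attrs(pg, PG_M);	// "clear modify"
 *	(void)pmap_clear_attrs(pg, PG_U);	// "clear reference"
 *
 * The return value reports whether any of the requested bits was
 * actually set, either in pg->pg_flags or in one of the PTEs.
 */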
2085
2086 /*
2087 * p m a p p r o t e c t i o n f u n c t i o n s
2088 */
2089
2090 /*
2091 * pmap_page_protect: change the protection of all recorded mappings
2092 * of a managed page
2093 *
2094 * => NOTE: this is an inline function in pmap.h
2095 */
2096
2097 /* see pmap.h */
2098
2099 /*
2100 * pmap_protect: set the protection of the pages in a pmap
2101 *
2102 * => NOTE: this is an inline function in pmap.h
2103 */
2104
2105 /* see pmap.h */
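
/*
 * Sketch of the usual pmap.h dispatch for the two inlines above
 * (an approximation, not copied from pmap.h):
 *
 *	pmap_page_protect(pg, prot):
 *		if ((prot & PROT_WRITE) == 0)
 *			(prot & PROT_READ) ? ...write-protect pg... :
 *			    pmap_page_remove(pg);
 *
 *	pmap_protect(pmap, sva, eva, prot):
 *		if ((prot & PROT_WRITE) == 0)
 *			pmap_write_protect(pmap, sva, eva, prot);
 *		// granting more access is left to the fault handler
 */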
2106
2107 /*
2108 * pmap_write_protect: write-protect pages in a pmap
2109 */
2110
2111 void
2112 pmap_write_protect_86(struct pmap *pmap, vaddr_t sva, vaddr_t eva,
2113 vm_prot_t prot)
2114 {
2115 pt_entry_t *ptes, *spte, *epte, npte, opte;
2116 vaddr_t blockend;
2117 u_int32_t md_prot;
2118 vaddr_t va;
2119 int shootall = 0;
2120
2121 ptes = pmap_map_ptes_86(pmap); /* locks pmap */
2122
2123 /* should be ok, but just in case ... */
2124 sva &= PG_FRAME;
2125 eva &= PG_FRAME;
2126
2127 if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
2128 shootall = 1;
2129
2130 for (va = sva; va < eva; va = blockend) {
2131 blockend = (va & PD_MASK) + NBPD;
2132 if (blockend > eva)
2133 blockend = eva;
2134
2135 /*
2136 * XXXCDC: our PTE mappings should never be write-protected!
2137 *
2138 * long term solution is to move the PTEs out of user
2139 * address space and into kernel address space (up
2140 * with APTE). then we can set VM_MAXUSER_ADDRESS to
2141 * be VM_MAX_ADDRESS.
2142 */
2143
2144 /* XXXCDC: ugly hack to avoid freeing PDP here */
2145 if (pdei(va) == PDSLOT_PTE)
2146 continue;
2147
2148 /* empty block? */
2149 if (!pmap_valid_entry(PDE(pmap, pdei(va))))
2150 continue;
2151
2152 md_prot = protection_codes[prot];
2153 if (va < VM_MAXUSER_ADDRESS)
2154 md_prot |= PG_u;
2155 else if (va < VM_MAX_ADDRESS)
2156 /* XXX: write-prot our PTES? never! */
2157 md_prot |= PG_RW;
2158
2159 spte = &ptes[atop(va)];
2160 epte = &ptes[atop(blockend)];
2161
2162 for (/*null */; spte < epte ; spte++, va += PAGE_SIZE) {
2163
2164 if (!pmap_valid_entry(*spte)) /* no mapping? */
2165 continue;
2166
2167 opte = *spte;
2168 npte = (opte & ~PG_PROT) | md_prot;
2169
2170 if (npte != opte) {
2171 pmap_exec_account(pmap, va, *spte, npte);
2172 i386_atomic_clearbits_l(spte,
2173 (~md_prot & opte) & PG_PROT);
2174 i386_atomic_setbits_l(spte, md_prot);
2175 }
2176 }
2177 }
2178 if (shootall)
2179 pmap_tlb_shoottlb();
2180 else
2181 pmap_tlb_shootrange(pmap, sva, eva);
2182
2183 pmap_unmap_ptes_86(pmap); /* unlocks pmap */
2184 pmap_tlb_shootwait();
2185 }
2186
2187 /*
2188 * end of protection functions
2189 */
2190
2191 /*
2192 * pmap_unwire: clear the wired bit in the PTE
2193 *
2194 * => mapping should already be in map
2195 */
2196
2197 void
2198 pmap_unwire_86(struct pmap *pmap, vaddr_t va)
2199 {
2200 pt_entry_t *ptes;
2201
2202 if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
2203 ptes = pmap_map_ptes_86(pmap); /* locks pmap */
2204
2205 #ifdef DIAGNOSTIC
2206 if (!pmap_valid_entry(ptes[atop(va)]))
2207 panic("pmap_unwire_86: invalid (unmapped) va "
2208 "0x%lx", va);
2209 #endif
2210
2211 if ((ptes[atop(va)] & PG_W) != 0) {
2212 i386_atomic_clearbits_l(&ptes[atop(va)], PG_W);
2213 pmap->pm_stats.wired_count--;
2214 }
2215 #ifdef DIAGNOSTIC
2216 else {
2217 printf("pmap_unwire_86: wiring for pmap %p va 0x%lx "
2218 "didn't change!\n", pmap, va);
2219 }
2220 #endif
2221 pmap_unmap_ptes_86(pmap); /* unlocks map */
2222 }
2223 #ifdef DIAGNOSTIC
2224 else {
2225 panic("pmap_unwire_86: invalid PDE");
2226 }
2227 #endif
2228 }
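
/*
 * Usage sketch (illustrative only): wiring is established by passing
 * PMAP_WIRED to pmap_enter() and torn down one page at a time, e.g.
 * from a fault-unwire style loop where "map" is the caller's vm_map:
 *
 *	for (va = start; va < end; va += PAGE_SIZE)
 *		pmap_unwire(map->pmap, va);
 *
 * Only pm_stats.wired_count and the PG_W bit change; the translation
 * itself stays valid, which is why no TLB shootdown is needed here.
 */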
2229
2230 /*
2231 * pmap_enter: enter a mapping into a pmap
2232 *
2233 * => must be done "now" ... no lazy-evaluation
2234 */
2235
2236 int
2237 pmap_enter_86(struct pmap *pmap, vaddr_t va, paddr_t pa,
2238 vm_prot_t prot, int flags)
2239 {
2240 pt_entry_t *ptes, opte, npte;
2241 struct vm_page *ptp;
2242 struct pv_entry *pve, *opve = NULL;
2243 int wired = (flags & PMAP_WIRED) != 0;
2244 int nocache = (pa & PMAP_NOCACHE) != 0;
2245 int wc = (pa & PMAP_WC) != 0;
2246 struct vm_page *pg = NULL;
2247 int error, wired_count, resident_count, ptp_count;
2248
2249 KASSERT(!(wc && nocache));
2250 pa &= PMAP_PA_MASK; /* nuke flags from pa */
2251
2252 #ifdef DIAGNOSTIC
2253 /* sanity check: totally out of range? */
2254 if (va >= VM_MAX_KERNEL_ADDRESS)
2255 panic("pmap_enter_86: too big");
2256
2257 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
2258 panic("pmap_enter_86: trying to map over PDP/APDP!");
2259
2260 /* sanity check: kernel PTPs should already have been pre-allocated */
2261 if (va >= VM_MIN_KERNEL_ADDRESS &&
2262 !pmap_valid_entry(PDE(pmap, pdei(va))))
2263 panic("pmap_enter: missing kernel PTP!");
2264 #endif
2265 if (pmap_initialized)
2266 pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
2267 else
2268 pve = NULL;
2269 wired_count = resident_count = ptp_count = 0;
2270
2271 /*
2272 * map in ptes and get a pointer to our PTP (unless we are the kernel)
2273 */
2274
2275 ptes = pmap_map_ptes_86(pmap); /* locks pmap */
2276 if (pmap == pmap_kernel()) {
2277 ptp = NULL;
2278 } else {
2279 ptp = pmap_get_ptp_86(pmap, pdei(va));
2280 if (ptp == NULL) {
2281 if (flags & PMAP_CANFAIL) {
2282 pmap_unmap_ptes_86(pmap);
2283 error = ENOMEM;
2284 goto out;
2285 }
2286 panic("pmap_enter_86: get ptp failed");
2287 }
2288 }
2289 /*
2290 * not allowed to sleep after here!
2291 */
2292 opte = ptes[atop(va)]; /* old PTE */
2293
2294 /*
2295 * is there currently a valid mapping at our VA?
2296 */
2297
2298 if (pmap_valid_entry(opte)) {
2299
2300 /*
2301 * first, calculate pm_stats updates. resident count will not
2302 * change since we are replacing/changing a valid
2303 * mapping. wired count might change...
2304 */
2305
2306 if (wired && (opte & PG_W) == 0)
2307 wired_count++;
2308 else if (!wired && (opte & PG_W) != 0)
2309 wired_count--;
2310
2311 /*
2312 * is the currently mapped PA the same as the one we
2313 * want to map?
2314 */
2315
2316 if ((opte & PG_FRAME) == pa) {
2317
2318 /* if this is on the PVLIST, sync R/M bit */
2319 if (opte & PG_PVLIST) {
2320 pg = PHYS_TO_VM_PAGE(pa);
2321 #ifdef DIAGNOSTIC
2322 if (pg == NULL)
2323 panic("pmap_enter_86: same pa "
2324 "PG_PVLIST mapping with "
2325 "unmanaged page "
2326 "pa = 0x%lx (0x%lx)", pa,
2327 atop(pa));
2328 #endif
2329 pmap_sync_flags_pte_86(pg, opte);
2330 }
2331 goto enter_now;
2332 }
2333
2334 /*
2335 * changing PAs: we must remove the old one first
2336 */
2337
2338 /*
2339 * if current mapping is on a pvlist,
2340 * remove it (sync R/M bits)
2341 */
2342
2343 if (opte & PG_PVLIST) {
2344 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2345 #ifdef DIAGNOSTIC
2346 if (pg == NULL)
2347 panic("pmap_enter_86: PG_PVLIST mapping with "
2348 "unmanaged page "
2349 "pa = 0x%lx (0x%lx)", pa, atop(pa));
2350 #endif
2351 pmap_sync_flags_pte_86(pg, opte);
2352 opve = pmap_remove_pv(pg, pmap, va);
2353 pg = NULL; /* This is not the page we are looking for */
2354 }
2355 } else { /* opte not valid */
2356 resident_count++;
2357 if (wired)
2358 wired_count++;
2359 if (ptp)
2360 ptp_count++; /* count # of valid entries */
2361 }
2362
2363 /*
2364 * pve is either NULL or points to a now-free pv_entry structure
2365 * (the latter case is if we called pmap_remove_pv above).
2366 *
2367 * if this entry is to be on a pvlist, enter it now.
2368 */
2369
2370 if (pmap_initialized && pg == NULL)
2371 pg = PHYS_TO_VM_PAGE(pa);
2372
2373 if (pg != NULL) {
2374 if (pve == NULL) {
2375 pve = opve;
2376 opve = NULL;
2377 }
2378 if (pve == NULL) {
2379 if (flags & PMAP_CANFAIL) {
2380 pmap_unmap_ptes_86(pmap);
2381 error = ENOMEM;
2382 goto out;
2383 }
2384 panic("pmap_enter_86: no pv entries available");
2385 }
2386 /* lock pg when adding */
2387 pmap_enter_pv(pg, pve, pmap, va, ptp);
2388 pve = NULL;
2389 }
2390
2391 enter_now:
2392 /*
2393 * at this point pg is !NULL if we want the PG_PVLIST bit set
2394 */
2395
2396 npte = pa | protection_codes[prot] | PG_V;
2397 pmap_exec_account(pmap, va, opte, npte);
2398 if (wired)
2399 npte |= PG_W;
2400 if (nocache)
2401 npte |= PG_N;
2402 if (va < VM_MAXUSER_ADDRESS)
2403 npte |= PG_u;
2404 else if (va < VM_MAX_ADDRESS)
2405 npte |= PG_RW; /* XXXCDC: no longer needed? */
2406 if (pmap == pmap_kernel())
2407 npte |= pmap_pg_g;
2408 if (flags & PROT_READ)
2409 npte |= PG_U;
2410 if (flags & PROT_WRITE)
2411 npte |= PG_M;
2412 if (pg) {
2413 npte |= PG_PVLIST;
2414 if (pg->pg_flags & PG_PMAP_WC) {
2415 KASSERT(nocache == 0);
2416 wc = 1;
2417 }
2418 pmap_sync_flags_pte_86(pg, npte);
2419 }
2420 if (wc)
2421 npte |= pmap_pg_wc;
2422
2423 opte = i386_atomic_testset_ul(&ptes[atop(va)], npte);
2424 if (ptp)
2425 ptp->wire_count += ptp_count;
2426 pmap->pm_stats.resident_count += resident_count;
2427 pmap->pm_stats.wired_count += wired_count;
2428
2429 if (pmap_valid_entry(opte)) {
2430 if (nocache && (opte & PG_N) == 0)
2431 wbinvd_on_all_cpus(); /* XXX clflush before we enter? */
2432 pmap_tlb_shootpage(pmap, va);
2433 }
2434
2435 pmap_unmap_ptes_86(pmap);
2436 pmap_tlb_shootwait();
2437
2438 error = 0;
2439
2440 out:
2441 if (pve)
2442 pool_put(&pmap_pv_pool, pve);
2443 if (opve)
2444 pool_put(&pmap_pv_pool, opve);
2445
2446 return error;
2447 }
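
/*
 * Usage sketch (illustrative only): a typical fault-handler style
 * call, tolerating resource shortage, would look like:
 *
 *	if (pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
 *	    PROT_READ | PROT_WRITE, PMAP_CANFAIL | PROT_WRITE) == ENOMEM) {
 *		// back off, wait for memory, retry the fault
 *	}
 *
 * The access-type bits in "flags" (PROT_READ/PROT_WRITE) pre-set
 * PG_U/PG_M above, so the first access does not fault again just to
 * record the reference/modify information.
 */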
2448
2449 /*
2450 * Allocate an extra PD page and PT pages as needed to map kernel
2451 * pages used for the U-K mappings. These special mappings are set
2452 * up during bootstrap, are never removed, and are part of
2453 * pmap_kernel.
2454 *
2455 * New pmaps inherit the kernel portion of pmap_kernel including
2456 * the special mappings (see pmap_pinit_pd_86()).
2457 *
2458 * To be able to release PT pages when migrating to PAE paging, use
2459 * wire_count for number of PTEs in the PT page.
2460 */
2461 void
2462 pmap_enter_special_86(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int32_t flags)
2463 {
2464 struct pmap *pmap = pmap_kernel();
2465 struct vm_page *ptppg = NULL;
2466 pd_entry_t *pd, *ptp;
2467 pt_entry_t *ptes;
2468 uint32_t l2idx, l1idx;
2469 paddr_t npa;
2470
2471 /* If CPU is secure, no need to do anything */
2472 if (!cpu_meltdown)
2473 return;
2474
2475 /* Must be kernel VA */
2476 if (va < VM_MIN_KERNEL_ADDRESS)
2477 panic("invalid special mapping va 0x%lx requested", va);
2478
2479 if (!pmap->pm_pdir_intel)
2480 pmap_alloc_pdir_intel_x86(pmap);
2481
2482 DPRINTF("%s: pm_pdir_intel 0x%x pm_pdirpa_intel 0x%x\n", __func__,
2483 (uint32_t)pmap->pm_pdir_intel, (uint32_t)pmap->pm_pdirpa_intel);
2484
2485 l2idx = pdei(va);
2486 l1idx = ptei(va);
2487
2488 DPRINTF("%s: va 0x%08lx pa 0x%08lx prot 0x%08lx flags 0x%08x "
2489 "l2idx %u l1idx %u\n", __func__, va, pa, (unsigned long)prot,
2490 flags, l2idx, l1idx);
2491
2492 if ((pd = (pd_entry_t *)pmap->pm_pdir_intel) == NULL)
2493 panic("%s: PD not initialized for pmap @ %p", __func__, pmap);
2494
2495 /* npa = physaddr of PT page */
2496 npa = pd[l2idx] & PMAP_PA_MASK;
2497
2498 /* Valid PDE for the 4MB region containing va? */
2499 if (!npa) {
2500 /*
2501 * No valid PDE - allocate PT page and set PDE. We
2502 * get it from pm_obj, which is used for PT pages.
2503 * We calculate the offset from l2idx+1024, so we are
2504 * beyond the regular PT pages, whose l2idx
2505 * satisfies 0 <= l2idx < 1024.
2506 */
2507 ptppg = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(l2idx + 1024),
2508 NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2509 if (ptppg == NULL)
2510 panic("%s: failed to allocate PT page", __func__);
2511
2512 atomic_clearbits_int(&ptppg->pg_flags, PG_BUSY);
2513 ptppg->wire_count = 1; /* no mappings yet */
2514
2515 npa = VM_PAGE_TO_PHYS(ptppg);
2516 pd[l2idx] = (npa | PG_RW | PG_V | PG_M | PG_U);
2517
2518 DPRINTF("%s: allocated new PT page at phys 0x%x, "
2519 "setting PDE[%d] = 0x%x\n", __func__, (uint32_t)npa,
2520 l2idx, pd[l2idx]);
2521 }
2522
2523 /* temporarily map PT page and set PTE for U-K mapping */
2524 if (ptppg == NULL && (ptppg = PHYS_TO_VM_PAGE(npa)) == NULL)
2525 panic("%s: no vm_page for PT page", __func__);
2526 mtx_enter(&ptppg->mdpage.pv_mtx);
2527 ptp = (pd_entry_t *)pmap_tmpmap_pa(npa);
2528 ptp[l1idx] = (pa | protection_codes[prot] | PG_V | PG_M | PG_U | flags);
2529 ptppg->wire_count++;
2530 DPRINTF("%s: setting PTE[%d] = 0x%x (wire_count %d)\n", __func__,
2531 l1idx, ptp[l1idx], ptppg->wire_count);
2532 pmap_tmpunmap_pa();
2533 mtx_leave(&ptppg->mdpage.pv_mtx);
2534
2535 /*
2536 * if supported, set the PG_G flag on the corresponding U+K
2537 * entry. U+K mappings can use PG_G, as they are mapped
2538 * along with user land anyway.
2539 */
2540 if (!(cpu_feature & CPUID_PGE))
2541 return;
2542 ptes = pmap_map_ptes_86(pmap); /* pmap_kernel -> PTE_BASE */
2543 if (pmap_valid_entry(ptes[atop(va)]))
2544 ptes[atop(va)] |= PG_G;
2545 else
2546 DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
2547 pmap_unmap_ptes_86(pmap); /* pmap_kernel -> nothing */
2548 }
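
/*
 * Usage sketch (hypothetical values): the bootstrap code enters the
 * handful of kernel pages that must stay mapped while user code runs
 * (e.g. trampoline text and per-CPU entry data) with calls of the
 * form:
 *
 *	pmap_enter_special(va, pa, PROT_READ | PROT_EXEC, 0);
 *
 * These land in pm_pdir_intel, which (as the cpu_meltdown check above
 * suggests) is the reduced U-K page directory used on Meltdown-affected
 * CPUs instead of the full kernel directory.
 */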
2549
2550 /*
2551 * pmap_growkernel: increase usage of KVM space
2552 *
2553 * => we allocate new PTPs for the kernel and install them in all
2554 * the pmaps on the system.
2555 */
2556
2557 vaddr_t
2558 pmap_growkernel_86(vaddr_t maxkvaddr)
2559 {
2560 struct pmap *kpm = pmap_kernel(), *pm;
2561 int needed_kpde; /* needed number of kernel PTPs */
2562 int s;
2563 paddr_t ptaddr;
2564
2565 needed_kpde = (int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
2566 / NBPD;
2567 if (needed_kpde <= nkpde)
2568 goto out; /* we are OK */
2569
2570 /*
2571 * whoops! we need to add kernel PTPs
2572 */
2573
2574 s = splhigh(); /* to be safe */
2575
2576 for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {
2577
2578 if (uvm.page_init_done == 0) {
2579
2580 /*
2581 * we're growing the kernel pmap early (from
2582 * uvm_pageboot_alloc()). this case must be
2583 * handled a little differently.
2584 */
2585
2586 if (uvm_page_physget(&ptaddr) == 0)
2587 panic("pmap_growkernel: out of memory");
2588 pmap_zero_phys_86(ptaddr);
2589
2590 PDE(kpm, PDSLOT_KERN + nkpde) =
2591 ptaddr | PG_RW | PG_V | PG_U | PG_M;
2592
2593 /* count PTP as resident */
2594 kpm->pm_stats.resident_count++;
2595 continue;
2596 }
2597
2598 /*
2599 * THIS *MUST* BE CODED SO AS TO WORK IN THE
2600 * pmap_initialized == 0 CASE! WE MAY BE
2601 * INVOKED WHILE pmap_init() IS RUNNING!
2602 */
2603
2604 while (!pmap_alloc_ptp_86(kpm, PDSLOT_KERN + nkpde, 0))
2605 uvm_wait("pmap_growkernel");
2606
2607 /* distribute new kernel PTP to all active pmaps */
2608 mtx_enter(&pmaps_lock);
2609 LIST_FOREACH(pm, &pmaps, pm_list) {
2610 PDE(pm, PDSLOT_KERN + nkpde) =
2611 PDE(kpm, PDSLOT_KERN + nkpde);
2612 }
2613 mtx_leave(&pmaps_lock);
2614 }
2615
2616 splx(s);
2617
2618 out:
2619 return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
2620 }
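
/*
 * Usage sketch (conceptual; "kva_needed" and "kva_limit" are
 * illustrative names): the kernel VA allocator grows the kernel page
 * tables before handing out addresses past the current limit:
 *
 *	if (kva_needed > kva_limit)
 *		kva_limit = pmap_growkernel(kva_needed);
 *
 * Each iteration of the loop above adds one 4MB PTP (NBPD) and copies
 * the new PDE into every pmap on the pmaps list, so all address spaces
 * agree on the kernel portion of the page directory.
 */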
2621
2622 #ifdef MULTIPROCESSOR
2623 /*
2624 * Locking for tlb shootdown.
2625 *
2626 * We lock by setting tlb_shoot_wait to the number of cpus that will
2627 * receive our tlb shootdown. After sending the IPIs, we don't need to
2628 * worry about locking order or interrupts spinning for the lock because
2629 * the call that grabs the "lock" isn't the one that releases it. And
2630 * there is nothing that can block the IPI that releases the lock.
2631 *
2632 * The functions are organized so that we first count the number of
2633 * cpus we need to send the IPI to, then we grab the counter, then
2634 * we send the IPIs, then we finally do our own shootdown.
2635 *
2636 * Our shootdown is last to make it parallel with the other cpus
2637 * to shorten the spin time.
2638 *
2639 * Notice that we depend on failures to send IPIs only being able to
2640 * happen during boot. If they happen later, the above assumption
2641 * doesn't hold since we can end up in situations where no one will
2642 * release the lock if we get an interrupt in a bad moment.
2643 */
2644
2645 volatile int tlb_shoot_wait __attribute__((section(".kudata")));
2646
2647 volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
2648 volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
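
/*
 * Protocol sketch (the receiving side lives in the IPI handlers, not
 * in this file, so the decrement in step 4 is an assumption about
 * them):
 *
 *	1. sender: atomic_cas_uint(&tlb_shoot_wait, 0, wait) acts as
 *	   the "lock" acquisition
 *	2. sender: publish tlb_shoot_addr1/tlb_shoot_addr2, send IPIs
 *	3. sender: flush its own TLB entries in parallel with the rest
 *	4. each receiver: invalidate as requested, then atomically
 *	   decrement tlb_shoot_wait
 *	5. anyone needing the "lock": pmap_tlb_shootwait() spins until
 *	   tlb_shoot_wait reaches 0
 */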
2649
2650 void
2651 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
2652 {
2653 struct cpu_info *ci, *self = curcpu();
2654 CPU_INFO_ITERATOR cii;
2655 int wait = 0;
2656 u_int64_t mask = 0;
2657
2658 CPU_INFO_FOREACH(cii, ci) {
2659 if (ci == self || !pmap_is_active(pm, ci) ||
2660 !(ci->ci_flags & CPUF_RUNNING))
2661 continue;
2662 mask |= (1ULL << ci->ci_cpuid);
2663 wait++;
2664 }
2665
2666 if (wait > 0) {
2667 int s = splvm();
2668
2669 while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) {
2670 while (tlb_shoot_wait != 0)
2671 CPU_BUSY_CYCLE();
2672 }
2673 tlb_shoot_addr1 = va;
2674 CPU_INFO_FOREACH(cii, ci) {
2675 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2676 continue;
2677 if (i386_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
2678 panic("pmap_tlb_shootpage: ipi failed");
2679 }
2680 splx(s);
2681 }
2682
2683 if (pmap_is_curpmap(pm))
2684 pmap_update_pg(va);
2685 }
2686
2687 void
2688 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
2689 {
2690 struct cpu_info *ci, *self = curcpu();
2691 CPU_INFO_ITERATOR cii;
2692 int wait = 0;
2693 u_int64_t mask = 0;
2694 vaddr_t va;
2695
2696 CPU_INFO_FOREACH(cii, ci) {
2697 if (ci == self || !pmap_is_active(pm, ci) ||
2698 !(ci->ci_flags & CPUF_RUNNING))
2699 continue;
2700 mask |= (1ULL << ci->ci_cpuid);
2701 wait++;
2702 }
2703
2704 if (wait > 0) {
2705 int s = splvm();
2706
2707 while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) {
2708 while (tlb_shoot_wait != 0)
2709 CPU_BUSY_CYCLE();
2710 }
2711 tlb_shoot_addr1 = sva;
2712 tlb_shoot_addr2 = eva;
2713 CPU_INFO_FOREACH(cii, ci) {
2714 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2715 continue;
2716 if (i386_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
2717 panic("pmap_tlb_shootrange: ipi failed");
2718 }
2719 splx(s);
2720 }
2721
2722 if (pmap_is_curpmap(pm))
2723 for (va = sva; va < eva; va += PAGE_SIZE)
2724 pmap_update_pg(va);
2725 }
2726
2727 void
2728 pmap_tlb_shoottlb(void)
2729 {
2730 struct cpu_info *ci, *self = curcpu();
2731 CPU_INFO_ITERATOR cii;
2732 int wait = 0;
2733 u_int64_t mask = 0;
2734
2735 CPU_INFO_FOREACH(cii, ci) {
2736 if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
2737 continue;
2738 mask |= (1ULL << ci->ci_cpuid);
2739 wait++;
2740 }
2741
2742 if (wait) {
2743 int s = splvm();
2744
2745 while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) {
2746 while (tlb_shoot_wait != 0)
2747 CPU_BUSY_CYCLE();
2748 }
2749
2750 CPU_INFO_FOREACH(cii, ci) {
2751 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2752 continue;
2753 if (i386_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
2754 panic("pmap_tlb_shoottlb: ipi failed");
2755 }
2756 splx(s);
2757 }
2758
2759 tlbflush();
2760 }
2761
2762 void
2763 pmap_tlb_droppmap(struct pmap *pm)
2764 {
2765 struct cpu_info *ci, *self = curcpu();
2766 CPU_INFO_ITERATOR cii;
2767 int wait = 0;
2768 u_int64_t mask = 0;
2769
2770 CPU_INFO_FOREACH(cii, ci) {
2771 if (ci == self || !(ci->ci_flags & CPUF_RUNNING) ||
2772 ci->ci_curpmap != pm)
2773 continue;
2774 mask |= (1ULL << ci->ci_cpuid);
2775 wait++;
2776 }
2777
2778 if (wait) {
2779 int s = splvm();
2780
2781 while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) {
2782 while (tlb_shoot_wait != 0)
2783 CPU_BUSY_CYCLE();
2784 }
2785
2786 CPU_INFO_FOREACH(cii, ci) {
2787 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2788 continue;
2789 if (i386_fast_ipi(ci, LAPIC_IPI_RELOADCR3) != 0)
2790 panic("pmap_tlb_droppmap: ipi failed");
2791 }
2792 splx(s);
2793 }
2794
2795 if (self->ci_curpmap == pm)
2796 pmap_activate(curproc);
2797
2798 pmap_tlb_shootwait();
2799 }
2800
2801 void
2802 pmap_tlb_shootwait(void)
2803 {
2804 while (tlb_shoot_wait != 0)
2805 CPU_BUSY_CYCLE();
2806 }
2807
2808 #else
2809
2810 void
2811 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
2812 {
2813 if (pmap_is_curpmap(pm))
2814 pmap_update_pg(va);
2815
2816 }
2817
2818 void
2819 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
2820 {
2821 vaddr_t va;
2822
2823 for (va = sva; va < eva; va += PAGE_SIZE)
2824 pmap_update_pg(va);
2825 }
2826
2827 void
2828 pmap_tlb_shoottlb(void)
2829 {
2830 tlbflush();
2831 }
2832 #endif /* MULTIPROCESSOR */
2833
2834 u_int32_t (*pmap_pte_set_p)(vaddr_t, paddr_t, u_int32_t) =
2835 pmap_pte_set_86;
2836 u_int32_t (*pmap_pte_setbits_p)(vaddr_t, u_int32_t, u_int32_t) =
2837 pmap_pte_setbits_86;
2838 u_int32_t (*pmap_pte_bits_p)(vaddr_t) = pmap_pte_bits_86;
2839 paddr_t (*pmap_pte_paddr_p)(vaddr_t) = pmap_pte_paddr_86;
2840 int (*pmap_clear_attrs_p)(struct vm_page *, int) =
2841 pmap_clear_attrs_86;
2842 int (*pmap_enter_p)(pmap_t, vaddr_t, paddr_t, vm_prot_t, int) =
2843 pmap_enter_86;
2844 void (*pmap_enter_special_p)(vaddr_t, paddr_t, vm_prot_t,
2845 u_int32_t) = pmap_enter_special_86;
2846 int (*pmap_extract_p)(pmap_t, vaddr_t, paddr_t *) =
2847 pmap_extract_86;
2848 vaddr_t (*pmap_growkernel_p)(vaddr_t) = pmap_growkernel_86;
2849 void (*pmap_page_remove_p)(struct vm_page *) = pmap_page_remove_86;
2850 void (*pmap_do_remove_p)(struct pmap *, vaddr_t, vaddr_t, int) =
2851 pmap_do_remove_86;
2852 int (*pmap_test_attrs_p)(struct vm_page *, int) =
2853 pmap_test_attrs_86;
2854 void (*pmap_unwire_p)(struct pmap *, vaddr_t) = pmap_unwire_86;
2855 void (*pmap_write_protect_p)(struct pmap *, vaddr_t, vaddr_t,
2856 vm_prot_t) = pmap_write_protect_86;
2857 void (*pmap_pinit_pd_p)(pmap_t) = pmap_pinit_pd_86;
2858 void (*pmap_zero_phys_p)(paddr_t) = pmap_zero_phys_86;
2859 void (*pmap_copy_page_p)(struct vm_page *, struct vm_page *) =
2860 pmap_copy_page_86;
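
/*
 * Note (sketch of the intended use, inferred from the _86 suffixes and
 * the PAE remark above pmap_enter_special_86(), not from code in this
 * file): these pointers default to the 32-bit 4KB page table code. A
 * bootstrap path switching to PAE paging would repoint them at PAE
 * variants, hypothetically:
 *
 *	pmap_enter_p = pmap_enter_pae;
 *	pmap_do_remove_p = pmap_do_remove_pae;
 *	...
 *
 * so the rest of the kernel keeps calling the generic pmap_*()
 * wrappers unchanged.
 */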
2861