1 /* $OpenBSD: pmap.h,v 1.88 2023/12/29 13:23:28 jca Exp $ */
2 /* $NetBSD: pmap.h,v 1.1 2003/04/26 18:39:46 fvdl Exp $ */
3
4 /*
5 * Copyright (c) 1997 Charles D. Cranor and Washington University.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * Copyright (c) 2001 Wasabi Systems, Inc.
31 * All rights reserved.
32 *
33 * Written by Frank van der Linden for Wasabi Systems, Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgement:
45 * This product includes software developed for the NetBSD Project by
46 * Wasabi Systems, Inc.
47 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
48 * or promote products derived from this software without specific prior
49 * written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
53 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
54 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
55 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
56 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
57 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
58 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
59 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
60 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
61 * POSSIBILITY OF SUCH DAMAGE.
62 */
63
64 /*
65 * pmap.h: see pmap.c for the history of this pmap module.
66 */
67
68 #ifndef _MACHINE_PMAP_H_
69 #define _MACHINE_PMAP_H_
70
71 #ifndef _LOCORE
72 #ifdef _KERNEL
73 #include <lib/libkern/libkern.h> /* for KASSERT() */
74 #include <machine/cpufunc.h>
75 #endif /* _KERNEL */
76 #include <sys/mutex.h>
77 #include <uvm/uvm_object.h>
78 #include <machine/pte.h>
79 #endif
80
81 /*
82 * The x86_64 pmap module closely resembles the i386 one. It uses
83 * the same recursive entry scheme. See the i386 pmap.h for a
84 * description. The alternate area trick for accessing non-current
85 * pmaps has been removed, though, because it performs badly on SMP
86 * systems.
87 * The most obvious difference to i386 is that 2 extra levels of page
88 * table need to be dealt with. The level 1 page table pages are at:
89 *
90 * l1: 0x00007f8000000000 - 0x00007fffffffffff (39 bits, needs PML4 entry)
91 *
92 * The other levels are kept as physical pages in 3 UVM objects and are
93 * temporarily mapped for virtual access when needed.
94 *
95 * The other obvious difference from i386 is that it has a direct map of all
96 * physical memory in the VA range:
97 *
98 * 0xfffffd8000000000 - 0xffffff7fffffffff
99 *
100 * The direct map is used in some cases to access PTEs of non-current pmaps.
101 *
102 * Note that address space is signed, so the layout for 48 bits is:
103 *
104 * +---------------------------------+ 0xffffffffffffffff
105 * | Kernel Image |
106 * +---------------------------------+ 0xffffff8000000000
107 * | Direct Map |
108 * +---------------------------------+ 0xfffffd8000000000
109 * ~ ~
110 * | |
111 * | Kernel Space |
112 * | |
113 * | |
114 * +---------------------------------+ 0xffff800000000000 = 0x0000800000000000
115 * | L1 table (PTE pages) |
116 * +---------------------------------+ 0x00007f8000000000
117 * ~ ~
118 * | |
119 * | User Space |
120 * | |
121 * | |
122 * +---------------------------------+ 0x0000000000000000
123 *
124 * In other words, there is a 'VA hole' at 0x0000800000000000 -
125 * 0xffff800000000000 which will trap, just as on, for example,
126 * sparcv9.
127 *
128 * The unused space can be used if needed, but it adds a little more
129 * complexity to the calculations.
130 */
131
132 /*
133 * Mask to get rid of the sign-extended part of addresses.
134 */
135 #define VA_SIGN_MASK 0xffff000000000000
136 #define VA_SIGN_NEG(va) ((va) | VA_SIGN_MASK)
137 /*
138 * XXXfvdl this one's not right.
139 */
140 #define VA_SIGN_POS(va) ((va) & ~VA_SIGN_MASK)
141
142 #define L4_SLOT_PTE 255
143 #define L4_SLOT_KERN 256
144 #define L4_SLOT_KERNBASE 511
145 #define NUM_L4_SLOT_DIRECT 4
146 #define L4_SLOT_DIRECT (L4_SLOT_KERNBASE - NUM_L4_SLOT_DIRECT)
147 #define L4_SLOT_EARLY (L4_SLOT_DIRECT - 1)
148
149 #define PDIR_SLOT_KERN L4_SLOT_KERN
150 #define PDIR_SLOT_PTE L4_SLOT_PTE
151 #define PDIR_SLOT_DIRECT L4_SLOT_DIRECT
152 #define PDIR_SLOT_EARLY L4_SLOT_EARLY
153
154 /*
155 * the following defines give the virtual addresses of various MMU
156 * data structures:
157 * PTE_BASE: the base VA of the linear PTE mappings
158 * PDP_PDE: the VA of the PDE that points back to the PDP
159 *
160 */
161
162 #define PTE_BASE ((pt_entry_t *) (L4_SLOT_PTE * NBPD_L4))
163 #define PMAP_DIRECT_BASE (VA_SIGN_NEG((L4_SLOT_DIRECT * NBPD_L4)))
164 #define PMAP_DIRECT_END (VA_SIGN_NEG(((L4_SLOT_DIRECT + \
165 NUM_L4_SLOT_DIRECT) * NBPD_L4)))
166
167 #define L1_BASE PTE_BASE
168
169 #define L2_BASE ((pd_entry_t *)((char *)L1_BASE + L4_SLOT_PTE * NBPD_L3))
170 #define L3_BASE ((pd_entry_t *)((char *)L2_BASE + L4_SLOT_PTE * NBPD_L2))
171 #define L4_BASE ((pd_entry_t *)((char *)L3_BASE + L4_SLOT_PTE * NBPD_L1))
172
173 #define PDP_PDE (L4_BASE + PDIR_SLOT_PTE)
174
175 #define PDP_BASE L4_BASE
176
177 #define NKL4_MAX_ENTRIES (unsigned long)1
178 #define NKL3_MAX_ENTRIES (unsigned long)(NKL4_MAX_ENTRIES * 512)
179 #define NKL2_MAX_ENTRIES (unsigned long)(NKL3_MAX_ENTRIES * 512)
180 #define NKL1_MAX_ENTRIES (unsigned long)(NKL2_MAX_ENTRIES * 512)
181
182 #define NKL4_KIMG_ENTRIES 1
183 #define NKL3_KIMG_ENTRIES 1
184 #define NKL2_KIMG_ENTRIES 64
185
186 /* number of pages of direct map entries set up by locore0.S */
187 #define NDML4_ENTRIES 1
188 #define NDML3_ENTRIES 1
189 #define NDML2_ENTRIES 4 /* 4GB */
190
191 /*
192 * Since kva space is below the kernel in its entirety, we start off
193 * with zero entries on each level.
194 */
195 #define NKL4_START_ENTRIES 0
196 #define NKL3_START_ENTRIES 0
197 #define NKL2_START_ENTRIES 0
198 #define NKL1_START_ENTRIES 0 /* XXX */
199
200 #define NTOPLEVEL_PDES (PAGE_SIZE / (sizeof (pd_entry_t)))
201
202 #define NPDPG (PAGE_SIZE / sizeof (pd_entry_t))
203
204 /*
205 * pl*_pi: index in the ptp page for a pde mapping a VA.
206 * (pl*_i below is the index in the virtual array of all pdes per level)
207 */
208 #define pl1_pi(VA) (((VA_SIGN_POS(VA)) & L1_MASK) >> L1_SHIFT)
209 #define pl2_pi(VA) (((VA_SIGN_POS(VA)) & L2_MASK) >> L2_SHIFT)
210 #define pl3_pi(VA) (((VA_SIGN_POS(VA)) & L3_MASK) >> L3_SHIFT)
211 #define pl4_pi(VA) (((VA_SIGN_POS(VA)) & L4_MASK) >> L4_SHIFT)
212
213 /*
214 * pl*_i: generate index into pde/pte arrays in virtual space
215 */
216 #define pl1_i(VA) (((VA_SIGN_POS(VA)) & L1_FRAME) >> L1_SHIFT)
217 #define pl2_i(VA) (((VA_SIGN_POS(VA)) & L2_FRAME) >> L2_SHIFT)
218 #define pl3_i(VA) (((VA_SIGN_POS(VA)) & L3_FRAME) >> L3_SHIFT)
219 #define pl4_i(VA) (((VA_SIGN_POS(VA)) & L4_FRAME) >> L4_SHIFT)
220 #define pl_i(va, lvl) \
221 (((VA_SIGN_POS(va)) & ptp_masks[(lvl)-1]) >> ptp_shifts[(lvl)-1])
222
223 #define PTP_MASK_INITIALIZER { L1_FRAME, L2_FRAME, L3_FRAME, L4_FRAME }
224 #define PTP_SHIFT_INITIALIZER { L1_SHIFT, L2_SHIFT, L3_SHIFT, L4_SHIFT }
225 #define NKPTP_INITIALIZER { NKL1_START_ENTRIES, NKL2_START_ENTRIES, \
226 NKL3_START_ENTRIES, NKL4_START_ENTRIES }
227 #define NKPTPMAX_INITIALIZER { NKL1_MAX_ENTRIES, NKL2_MAX_ENTRIES, \
228 NKL3_MAX_ENTRIES, NKL4_MAX_ENTRIES }
229 #define NBPD_INITIALIZER { NBPD_L1, NBPD_L2, NBPD_L3, NBPD_L4 }
230 #define PDES_INITIALIZER { L2_BASE, L3_BASE, L4_BASE }
231
232 /*
233 * PTP macros:
234 * a PTP's index is the PD index of the PDE that points to it
235 * a PTP's offset is the byte-offset in the PTE space that this PTP is at
236 * a PTP's VA is the first VA mapped by that PTP
237 */
238
239 #define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE)
240
241 #define PTP_LEVELS 4
242
243 /*
244 * PG_AVAIL usage: we make use of the ignored bits of the PTE
245 */
246
247 #define PG_W PG_AVAIL1 /* "wired" mapping */
248 #define PG_PVLIST PG_AVAIL2 /* mapping has entry on pvlist */
249 /* PG_AVAIL3 not used */
250
/*
 * PCID assignments.
 * The shootdown code assumes KERN, PROC, and PROC_INTEL are
 * consecutive and in that order.
 */
256 #define PCID_KERN 0 /* for pmap_kernel() */
257 #define PCID_PROC 1 /* non-pmap_kernel(), U+K */
258 #define PCID_PROC_INTEL 2 /* non-pmap_kernel(), U-K (meltdown) */
259 #define PCID_TEMP 3 /* temp mapping of another non-pmap_kernel() */
260 #define PCID_EFI 4 /* EFI runtime services */
261
262 extern int pmap_use_pcid; /* non-zero if PCID support is enabled */
263
264 /*
265 * Number of PTEs per cache line. 8 byte pte, 64-byte cache line
266 * Used to avoid false sharing of cache lines.
267 */
268 #define NPTECL 8
269
270
271 #if defined(_KERNEL) && !defined(_LOCORE)
272 /*
273 * pmap data structures: see pmap.c for details of locking.
274 */
275
276 struct pmap;
277 typedef struct pmap *pmap_t;
278
279 /*
280 * we maintain a list of all non-kernel pmaps
281 */
282
283 LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */
284
285 /*
286 * the pmap structure
287 *
288 * note that the pm_obj contains the reference count,
289 * page list, and number of PTPs within the pmap.
290 */
291
292 #define PMAP_TYPE_NORMAL 1
293 #define PMAP_TYPE_EPT 2
294 #define PMAP_TYPE_RVI 3
295 #define pmap_nested(pm) ((pm)->pm_type != PMAP_TYPE_NORMAL)
296
struct pmap {
	struct mutex pm_mtx;			/* protects the fields below */
	struct uvm_object pm_obj[PTP_LEVELS-1];	/* objects for lvl >= 1 */
	LIST_ENTRY(pmap) pm_list;	/* list (lck by pm_list lock) */
	/*
	 * pm_pdir         : VA of page table to be used when executing in
	 *                   privileged mode
	 * pm_pdirpa       : PA of page table to be used when executing in
	 *                   privileged mode
	 * pm_pdir_intel   : VA of special page table to be used when executing
	 *                   on an Intel CPU in usermode (no kernel mappings)
	 * pm_pdirpa_intel : PA of special page table to be used when executing
	 *                   on an Intel CPU in usermode (no kernel mappings)
	 */
	pd_entry_t *pm_pdir, *pm_pdir_intel;
	paddr_t pm_pdirpa, pm_pdirpa_intel;

	struct vm_page *pm_ptphint[PTP_LEVELS-1];
					/* pointer to a PTP in our pmap */
	struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */

	int pm_type;			/* Type of pmap this is (PMAP_TYPE_x) */
	uint64_t eptp;			/* cached EPTP (used by vmm) */
};
321
322 #define PMAP_EFI PMAP_MD0
323
324 /*
325 * MD flags that we use for pmap_enter (in the pa):
326 */
327 #define PMAP_PA_MASK ~((paddr_t)PAGE_MASK) /* to remove the flags */
328 #define PMAP_NOCACHE 0x1 /* set the non-cacheable bit. */
329 #define PMAP_WC 0x2 /* set page write combining. */
330
331 /*
332 * We keep mod/ref flags in struct vm_page->pg_flags.
333 */
334 #define PG_PMAP_MOD PG_PMAP0
335 #define PG_PMAP_REF PG_PMAP1
336 #define PG_PMAP_WC PG_PMAP2
337
338 /*
339 * for each managed physical page we maintain a list of <PMAP,VA>'s
340 * which it is mapped at.
341 */
struct pv_entry {			/* locked by its list's pvh_lock */
	struct pv_entry *pv_next;	/* next entry on the page's pv list */
	struct pmap *pv_pmap;		/* the pmap the mapping lives in */
	vaddr_t pv_va;			/* the VA the page is mapped at */
	struct vm_page *pv_ptp;		/* the vm_page of the PTP holding
					   the PTE (NULL for kernel pmap) */
};
348
349 /*
350 * global kernel variables
351 */
352
353 extern struct pmap kernel_pmap_store; /* kernel pmap */
354
355 extern long nkptp[];
356
357 extern const paddr_t ptp_masks[];
358 extern const int ptp_shifts[];
359 extern const long nbpd[], nkptpmax[];
360
361 /*
362 * macros
363 */
364
365 #define pmap_kernel() (&kernel_pmap_store)
366 #define pmap_resident_count(pmap) ((pmap)->pm_stats.resident_count)
367 #define pmap_wired_count(pmap) ((pmap)->pm_stats.wired_count)
368 #define pmap_update(pmap) /* nothing (yet) */
369
370 #define pmap_clear_modify(pg) pmap_clear_attrs(pg, PG_M)
371 #define pmap_clear_reference(pg) pmap_clear_attrs(pg, PG_U)
372 #define pmap_is_modified(pg) pmap_test_attrs(pg, PG_M)
373 #define pmap_is_referenced(pg) pmap_test_attrs(pg, PG_U)
374 #define pmap_move(DP,SP,D,L,S)
375 #define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */
376
377 #define pmap_proc_iflush(p,va,len) /* nothing */
378 #define pmap_unuse_final(p) /* nothing */
379 #define pmap_remove_holes(vm) do { /* nothing */ } while (0)
380
381
382 /*
383 * prototypes
384 */
385
386 void map_tramps(void); /* machdep.c */
387 paddr_t pmap_bootstrap(paddr_t, paddr_t);
388 void pmap_init_percpu(void);
389 void pmap_randomize(void);
390 void pmap_randomize_level(pd_entry_t *, int);
391 int pmap_clear_attrs(struct vm_page *, unsigned long);
392 static void pmap_page_protect(struct vm_page *, vm_prot_t);
393 void pmap_page_remove (struct vm_page *);
394 static void pmap_protect(struct pmap *, vaddr_t,
395 vaddr_t, vm_prot_t);
396 void pmap_remove(struct pmap *, vaddr_t, vaddr_t);
397 int pmap_test_attrs(struct vm_page *, unsigned);
398 static void pmap_update_pg(vaddr_t);
399 void pmap_write_protect(struct pmap *, vaddr_t,
400 vaddr_t, vm_prot_t);
401 void pmap_fix_ept(struct pmap *, vaddr_t);
402
403 paddr_t pmap_prealloc_lowmem_ptps(paddr_t);
404
405 void pagezero(vaddr_t);
406
407 void pmap_convert(struct pmap *, int);
408 void pmap_enter_special(vaddr_t, paddr_t, vm_prot_t);
409 vaddr_t pmap_set_pml4_early(paddr_t pa);
410 void pmap_clear_pml4_early(void);
411
412 /*
413 * functions for flushing the cache for vaddrs and pages.
414 * these functions are not part of the MI pmap interface and thus
415 * should not be used as such.
416 */
417 void pmap_flush_cache(vaddr_t, vsize_t);
418 #define pmap_flush_page(paddr) do { \
419 KDASSERT(PHYS_TO_VM_PAGE(paddr) != NULL); \
420 pmap_flush_cache(PMAP_DIRECT_MAP(paddr), PAGE_SIZE); \
421 } while (/* CONSTCOND */ 0)
422
423 #define PMAP_CHECK_COPYIN (pg_xo == 0)
424
425 #define PMAP_STEAL_MEMORY /* enable pmap_steal_memory() */
426 #define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */
427
428 /*
429 * inline functions
430 */
431
/*
 * pmap_remove_all: MI hook called when an address space is being torn
 * down in its entirety.  No bulk-teardown shortcut is implemented on
 * amd64; mappings are removed piecemeal via pmap_remove().
 */
static inline void
pmap_remove_all(struct pmap *pmap)
{
	/* Nothing. */
}
437
438 /*
439 * pmap_update_pg: flush one page from the TLB (or flush the whole thing
440 * if hardware doesn't support one-page flushing)
441 */
442
static inline void
pmap_update_pg(vaddr_t va)
{
	invlpg(va);	/* invalidate the single TLB entry covering va */
}
448
449 /*
450 * pmap_page_protect: change the protection of all recorded mappings
451 * of a managed page
452 *
453 * => this function is a frontend for pmap_page_remove/pmap_clear_attrs
454 * => we only have to worry about making the page more protected.
455 * unprotecting a page is done on-demand at fault time.
456 */
457
458 static inline void
pmap_page_protect(struct vm_page * pg,vm_prot_t prot)459 pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
460 {
461 if (prot == PROT_READ) {
462 (void) pmap_clear_attrs(pg, PG_RW);
463 } else {
464 KASSERT(prot == PROT_NONE);
465 pmap_page_remove(pg);
466 }
467 }
468
469 /*
470 * pmap_protect: change the protection of pages in a pmap
471 *
472 * => this function is a frontend for pmap_remove/pmap_write_protect
473 * => we only have to worry about making the page more protected.
474 * unprotecting a page is done on-demand at fault time.
475 */
476
477 static inline void
pmap_protect(struct pmap * pmap,vaddr_t sva,vaddr_t eva,vm_prot_t prot)478 pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
479 {
480 if (prot != PROT_NONE) {
481 pmap_write_protect(pmap, sva, eva, prot);
482 } else {
483 pmap_remove(pmap, sva, eva);
484 }
485 }
486
487 /*
488 * various address inlines
489 *
490 * vtopte: return a pointer to the PTE mapping a VA, works only for
491 * user and PT addresses
492 *
493 * kvtopte: return a pointer to the PTE mapping a kernel VA
494 */
495
static inline pt_entry_t *
vtopte(vaddr_t va)
{
	/* index the recursive PTE window by the global level 1 index */
	return (PTE_BASE + pl1_i(va));
}
501
502 static inline pt_entry_t *
kvtopte(vaddr_t va)503 kvtopte(vaddr_t va)
504 {
505 #ifdef LARGEPAGES
506 {
507 pd_entry_t *pde;
508
509 pde = L1_BASE + pl2_i(va);
510 if (*pde & PG_PS)
511 return ((pt_entry_t *)pde);
512 }
513 #endif
514
515 return (PTE_BASE + pl1_i(va));
516 }
517
518 #define PMAP_DIRECT_MAP(pa) ((vaddr_t)PMAP_DIRECT_BASE + (pa))
519 #define PMAP_DIRECT_UNMAP(va) ((paddr_t)(va) - PMAP_DIRECT_BASE)
520 #define pmap_map_direct(pg) PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(pg))
521 #define pmap_unmap_direct(va) PHYS_TO_VM_PAGE(PMAP_DIRECT_UNMAP(va))
522
523 #define __HAVE_PMAP_DIRECT
524 #define __HAVE_PMAP_MPSAFE_ENTER_COW
525
526 #endif /* _KERNEL && !_LOCORE */
527
528 #ifndef _LOCORE
529 struct pv_entry;
struct vm_page_md {
	struct mutex pv_mtx;		/* protects pv_list */
	struct pv_entry *pv_list;	/* head of this page's pv entry list */
};
534
535 #define VM_MDPAGE_INIT(pg) do { \
536 mtx_init(&(pg)->mdpage.pv_mtx, IPL_VM); \
537 (pg)->mdpage.pv_list = NULL; \
538 } while (0)
539 #endif /* !_LOCORE */
540
541 #endif /* _MACHINE_PMAP_H_ */
542