xref: /freebsd/sys/amd64/amd64/pmap.c (revision 0957b409)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  * Copyright (c) 2003 Peter Wemm
11  * All rights reserved.
12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13  * All rights reserved.
14  *
15  * This code is derived from software contributed to Berkeley by
16  * the Systems Programming Group of the University of Utah Computer
17  * Science Department and William Jolitz of UUNET Technologies Inc.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  * 3. All advertising materials mentioning features or use of this software
28  *    must display the following acknowledgement:
29  *	This product includes software developed by the University of
30  *	California, Berkeley and its contributors.
31  * 4. Neither the name of the University nor the names of its contributors
32  *    may be used to endorse or promote products derived from this software
33  *    without specific prior written permission.
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
36  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
38  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
39  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
40  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
41  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
42  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
43  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
44  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
45  * SUCH DAMAGE.
46  *
47  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
48  */
49 /*-
50  * Copyright (c) 2003 Networks Associates Technology, Inc.
51  * Copyright (c) 2014-2019 The FreeBSD Foundation
52  * All rights reserved.
53  *
54  * This software was developed for the FreeBSD Project by Jake Burkholder,
55  * Safeport Network Services, and Network Associates Laboratories, the
56  * Security Research Division of Network Associates, Inc. under
57  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
58  * CHATS research program.
59  *
60  * Portions of this software were developed by
61  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
62  * the FreeBSD Foundation.
63  *
64  * Redistribution and use in source and binary forms, with or without
65  * modification, are permitted provided that the following conditions
66  * are met:
67  * 1. Redistributions of source code must retain the above copyright
68  *    notice, this list of conditions and the following disclaimer.
69  * 2. Redistributions in binary form must reproduce the above copyright
70  *    notice, this list of conditions and the following disclaimer in the
71  *    documentation and/or other materials provided with the distribution.
72  *
73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83  * SUCH DAMAGE.
84  */
85 
86 #define	AMD64_NPT_AWARE
87 
88 #include <sys/cdefs.h>
89 __FBSDID("$FreeBSD$");
90 
91 /*
92  *	Manages physical address maps.
93  *
94  *	Since the information managed by this module is
95  *	also stored by the logical address mapping module,
96  *	this module may throw away valid virtual-to-physical
97  *	mappings at almost any time.  However, invalidations
98  *	of virtual-to-physical mappings must be done as
99  *	requested.
100  *
101  *	In order to cope with hardware architectures which
102  *	make virtual-to-physical map invalidates expensive,
103  *	this module may delay invalidate or reduced protection
104  *	operations until such time as they are actually
105  *	necessary.  This module is given full information as
106  *	to which processors are currently using which maps,
107  *	and to when physical maps must be made correct.
108  */
109 
110 #include "opt_pmap.h"
111 #include "opt_vm.h"
112 
113 #include <sys/param.h>
114 #include <sys/bitstring.h>
115 #include <sys/bus.h>
116 #include <sys/systm.h>
117 #include <sys/kernel.h>
118 #include <sys/ktr.h>
119 #include <sys/lock.h>
120 #include <sys/malloc.h>
121 #include <sys/mman.h>
122 #include <sys/mutex.h>
123 #include <sys/proc.h>
124 #include <sys/rangeset.h>
125 #include <sys/rwlock.h>
126 #include <sys/sx.h>
127 #include <sys/turnstile.h>
128 #include <sys/vmem.h>
129 #include <sys/vmmeter.h>
130 #include <sys/sched.h>
131 #include <sys/sysctl.h>
132 #include <sys/smp.h>
133 
134 #include <vm/vm.h>
135 #include <vm/vm_param.h>
136 #include <vm/vm_kern.h>
137 #include <vm/vm_page.h>
138 #include <vm/vm_map.h>
139 #include <vm/vm_object.h>
140 #include <vm/vm_extern.h>
141 #include <vm/vm_pageout.h>
142 #include <vm/vm_pager.h>
143 #include <vm/vm_phys.h>
144 #include <vm/vm_radix.h>
145 #include <vm/vm_reserv.h>
146 #include <vm/uma.h>
147 
148 #include <machine/intr_machdep.h>
149 #include <x86/apicvar.h>
150 #include <x86/ifunc.h>
151 #include <machine/cpu.h>
152 #include <machine/cputypes.h>
153 #include <machine/md_var.h>
154 #include <machine/pcb.h>
155 #include <machine/specialreg.h>
156 #ifdef SMP
157 #include <machine/smp.h>
158 #endif
159 #include <machine/sysarch.h>
160 #include <machine/tss.h>
161 
162 static __inline boolean_t
163 pmap_type_guest(pmap_t pmap)
164 {
165 
166 	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
167 }
168 
169 static __inline boolean_t
170 pmap_emulate_ad_bits(pmap_t pmap)
171 {
172 
173 	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
174 }
175 
176 static __inline pt_entry_t
177 pmap_valid_bit(pmap_t pmap)
178 {
179 	pt_entry_t mask;
180 
181 	switch (pmap->pm_type) {
182 	case PT_X86:
183 	case PT_RVI:
184 		mask = X86_PG_V;
185 		break;
186 	case PT_EPT:
187 		if (pmap_emulate_ad_bits(pmap))
188 			mask = EPT_PG_EMUL_V;
189 		else
190 			mask = EPT_PG_READ;
191 		break;
192 	default:
193 		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
194 	}
195 
196 	return (mask);
197 }
198 
199 static __inline pt_entry_t
200 pmap_rw_bit(pmap_t pmap)
201 {
202 	pt_entry_t mask;
203 
204 	switch (pmap->pm_type) {
205 	case PT_X86:
206 	case PT_RVI:
207 		mask = X86_PG_RW;
208 		break;
209 	case PT_EPT:
210 		if (pmap_emulate_ad_bits(pmap))
211 			mask = EPT_PG_EMUL_RW;
212 		else
213 			mask = EPT_PG_WRITE;
214 		break;
215 	default:
216 		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
217 	}
218 
219 	return (mask);
220 }
221 
222 static pt_entry_t pg_g;
223 
224 static __inline pt_entry_t
225 pmap_global_bit(pmap_t pmap)
226 {
227 	pt_entry_t mask;
228 
229 	switch (pmap->pm_type) {
230 	case PT_X86:
231 		mask = pg_g;
232 		break;
233 	case PT_RVI:
234 	case PT_EPT:
235 		mask = 0;
236 		break;
237 	default:
238 		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
239 	}
240 
241 	return (mask);
242 }
243 
244 static __inline pt_entry_t
245 pmap_accessed_bit(pmap_t pmap)
246 {
247 	pt_entry_t mask;
248 
249 	switch (pmap->pm_type) {
250 	case PT_X86:
251 	case PT_RVI:
252 		mask = X86_PG_A;
253 		break;
254 	case PT_EPT:
255 		if (pmap_emulate_ad_bits(pmap))
256 			mask = EPT_PG_READ;
257 		else
258 			mask = EPT_PG_A;
259 		break;
260 	default:
261 		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
262 	}
263 
264 	return (mask);
265 }
266 
267 static __inline pt_entry_t
268 pmap_modified_bit(pmap_t pmap)
269 {
270 	pt_entry_t mask;
271 
272 	switch (pmap->pm_type) {
273 	case PT_X86:
274 	case PT_RVI:
275 		mask = X86_PG_M;
276 		break;
277 	case PT_EPT:
278 		if (pmap_emulate_ad_bits(pmap))
279 			mask = EPT_PG_WRITE;
280 		else
281 			mask = EPT_PG_M;
282 		break;
283 	default:
284 		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
285 	}
286 
287 	return (mask);
288 }
289 
290 static __inline pt_entry_t
291 pmap_pku_mask_bit(pmap_t pmap)
292 {
293 
294 	return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
295 }
296 
297 #if !defined(DIAGNOSTIC)
298 #ifdef __GNUC_GNU_INLINE__
299 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
300 #else
301 #define PMAP_INLINE	extern inline
302 #endif
303 #else
304 #define PMAP_INLINE
305 #endif
306 
307 #ifdef PV_STATS
308 #define PV_STAT(x)	do { x ; } while (0)
309 #else
310 #define PV_STAT(x)	do { } while (0)
311 #endif
312 
313 #define	pa_index(pa)	((pa) >> PDRSHIFT)
314 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
315 
316 #define	NPV_LIST_LOCKS	MAXCPU
317 
318 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
319 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
320 
321 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
322 	struct rwlock **_lockp = (lockp);		\
323 	struct rwlock *_new_lock;			\
324 							\
325 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
326 	if (_new_lock != *_lockp) {			\
327 		if (*_lockp != NULL)			\
328 			rw_wunlock(*_lockp);		\
329 		*_lockp = _new_lock;			\
330 		rw_wlock(*_lockp);			\
331 	}						\
332 } while (0)
333 
334 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
335 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
336 
337 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
338 	struct rwlock **_lockp = (lockp);		\
339 							\
340 	if (*_lockp != NULL) {				\
341 		rw_wunlock(*_lockp);			\
342 		*_lockp = NULL;				\
343 	}						\
344 } while (0)
345 
346 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
347 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
348 
349 struct pmap kernel_pmap_store;
350 
351 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
352 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
353 
354 int nkpt;
355 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
356     "Number of kernel page table pages allocated on bootup");
357 
358 static int ndmpdp;
359 vm_paddr_t dmaplimit;
360 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
361 pt_entry_t pg_nx;
362 
363 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
364 
365 static int pg_ps_enabled = 1;
366 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
367     &pg_ps_enabled, 0, "Are large page mappings enabled?");
368 
369 #define	PAT_INDEX_SIZE	8
370 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
371 
372 static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
373 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
374 u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
375 u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
376 
377 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
378 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
379 static int		ndmpdpphys;	/* number of DMPDPphys pages */
380 
381 static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */
382 
383 /*
384  * pmap_mapdev support pre initialization (i.e. console)
385  */
386 #define	PMAP_PREINIT_MAPPING_COUNT	8
387 static struct pmap_preinit_mapping {
388 	vm_paddr_t	pa;
389 	vm_offset_t	va;
390 	vm_size_t	sz;
391 	int		mode;
392 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
393 static int pmap_initialized;
394 
395 /*
396  * Data for the pv entry allocation mechanism.
397  * Updates to pv_invl_gen are protected by the pv_list_locks[]
398  * elements, but reads are not.
399  */
400 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
401 static struct mtx __exclusive_cache_line pv_chunks_mutex;
402 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
403 static u_long pv_invl_gen[NPV_LIST_LOCKS];
404 static struct md_page *pv_table;
405 static struct md_page pv_dummy;
406 
407 /*
408  * All those kernel PT submaps that BSD is so fond of
409  */
410 pt_entry_t *CMAP1 = NULL;
411 caddr_t CADDR1 = 0;
412 static vm_offset_t qframe = 0;
413 static struct mtx qframe_mtx;
414 
415 static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
416 
417 static vmem_t *large_vmem;
418 static u_int lm_ents;
419 
420 int pmap_pcid_enabled = 1;
421 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
422     &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
423 int invpcid_works = 0;
424 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
425     "Is the invpcid instruction available ?");
426 
427 int __read_frequently pti = 0;
428 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
429     &pti, 0,
430     "Page Table Isolation enabled");
431 static vm_object_t pti_obj;
432 static pml4_entry_t *pti_pml4;
433 static vm_pindex_t pti_pg_idx;
434 static bool pti_finalized;
435 
436 struct pmap_pkru_range {
437 	struct rs_el	pkru_rs_el;
438 	u_int		pkru_keyidx;
439 	int		pkru_flags;
440 };
441 
442 static uma_zone_t pmap_pkru_ranges_zone;
443 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
444 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
445 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
446 static void *pkru_dup_range(void *ctx, void *data);
447 static void pkru_free_range(void *ctx, void *node);
448 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
449 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
450 static void pmap_pkru_deassign_all(pmap_t pmap);
451 
452 static int
453 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
454 {
455 	int i;
456 	uint64_t res;
457 
458 	res = 0;
459 	CPU_FOREACH(i) {
460 		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
461 	}
462 	return (sysctl_handle_64(oidp, &res, 0, req));
463 }
464 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
465     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
466     "Count of saved TLB context on switch");
467 
468 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
469     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
470 static struct mtx invl_gen_mtx;
471 static u_long pmap_invl_gen = 0;
472 /* Fake lock object to satisfy turnstiles interface. */
473 static struct lock_object invl_gen_ts = {
474 	.lo_name = "invlts",
475 };
476 
477 static bool
478 pmap_not_in_di(void)
479 {
480 
481 	return (curthread->td_md.md_invl_gen.gen == 0);
482 }
483 
484 #define	PMAP_ASSERT_NOT_IN_DI() \
485     KASSERT(pmap_not_in_di(), ("DI already started"))
486 
487 /*
488  * Start a new Delayed Invalidation (DI) block of code, executed by
489  * the current thread.  Within a DI block, the current thread may
490  * destroy both the page table and PV list entries for a mapping and
491  * then release the corresponding PV list lock before ensuring that
492  * the mapping is flushed from the TLBs of any processors with the
493  * pmap active.
494  */
495 static void
496 pmap_delayed_invl_started(void)
497 {
498 	struct pmap_invl_gen *invl_gen;
499 	u_long currgen;
500 
501 	invl_gen = &curthread->td_md.md_invl_gen;
502 	PMAP_ASSERT_NOT_IN_DI();
503 	mtx_lock(&invl_gen_mtx);
504 	if (LIST_EMPTY(&pmap_invl_gen_tracker))
505 		currgen = pmap_invl_gen;
506 	else
507 		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
508 	invl_gen->gen = currgen + 1;
509 	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
510 	mtx_unlock(&invl_gen_mtx);
511 }
512 
513 /*
514  * Finish the DI block, previously started by the current thread.  All
515  * required TLB flushes for the pages marked by
516  * pmap_delayed_invl_page() must be finished before this function is
517  * called.
518  *
519  * This function works by bumping the global DI generation number to
520  * the generation number of the current thread's DI, unless there is a
521  * pending DI that started earlier.  In the latter case, bumping the
522  * global DI generation number would incorrectly signal that the
523  * earlier DI had finished.  Instead, this function bumps the earlier
524  * DI's generation number to match the generation number of the
525  * current thread's DI.
526  */
527 static void
528 pmap_delayed_invl_finished(void)
529 {
530 	struct pmap_invl_gen *invl_gen, *next;
531 	struct turnstile *ts;
532 
533 	invl_gen = &curthread->td_md.md_invl_gen;
534 	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
535 	mtx_lock(&invl_gen_mtx);
536 	next = LIST_NEXT(invl_gen, link);
537 	if (next == NULL) {
538 		turnstile_chain_lock(&invl_gen_ts);
539 		ts = turnstile_lookup(&invl_gen_ts);
540 		pmap_invl_gen = invl_gen->gen;
541 		if (ts != NULL) {
542 			turnstile_broadcast(ts, TS_SHARED_QUEUE);
543 			turnstile_unpend(ts);
544 		}
545 		turnstile_chain_unlock(&invl_gen_ts);
546 	} else {
547 		next->gen = invl_gen->gen;
548 	}
549 	LIST_REMOVE(invl_gen, link);
550 	mtx_unlock(&invl_gen_mtx);
551 	invl_gen->gen = 0;
552 }
553 
554 #ifdef PV_STATS
555 static long invl_wait;
556 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
557     "Number of times DI invalidation blocked pmap_remove_all/write");
558 #endif
559 
560 static u_long *
561 pmap_delayed_invl_genp(vm_page_t m)
562 {
563 
564 	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
565 }
566 
567 /*
568  * Ensure that all currently executing DI blocks, that need to flush
569  * TLB for the given page m, actually flushed the TLB at the time the
570  * function returned.  If the page m has an empty PV list and we call
571  * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
572  * valid mapping for the page m in either its page table or TLB.
573  *
574  * This function works by blocking until the global DI generation
575  * number catches up with the generation number associated with the
576  * given page m and its PV list.  Since this function's callers
577  * typically own an object lock and sometimes own a page lock, it
578  * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
579  * processor.
580  */
581 static void
582 pmap_delayed_invl_wait(vm_page_t m)
583 {
584 	struct turnstile *ts;
585 	u_long *m_gen;
586 #ifdef PV_STATS
587 	bool accounted = false;
588 #endif
589 
590 	m_gen = pmap_delayed_invl_genp(m);
591 	while (*m_gen > pmap_invl_gen) {
592 #ifdef PV_STATS
593 		if (!accounted) {
594 			atomic_add_long(&invl_wait, 1);
595 			accounted = true;
596 		}
597 #endif
598 		ts = turnstile_trywait(&invl_gen_ts);
599 		if (*m_gen > pmap_invl_gen)
600 			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
601 		else
602 			turnstile_cancel(ts);
603 	}
604 }
605 
606 /*
607  * Mark the page m's PV list as participating in the current thread's
608  * DI block.  Any threads concurrently using m's PV list to remove or
609  * restrict all mappings to m will wait for the current thread's DI
610  * block to complete before proceeding.
611  *
612  * The function works by setting the DI generation number for m's PV
613  * list to at least the DI generation number of the current thread.
614  * This forces a caller of pmap_delayed_invl_wait() to block until
615  * current thread calls pmap_delayed_invl_finished().
616  */
617 static void
618 pmap_delayed_invl_page(vm_page_t m)
619 {
620 	u_long gen, *m_gen;
621 
622 	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
623 	gen = curthread->td_md.md_invl_gen.gen;
624 	if (gen == 0)
625 		return;
626 	m_gen = pmap_delayed_invl_genp(m);
627 	if (*m_gen < gen)
628 		*m_gen = gen;
629 }
630 
631 /*
632  * Crashdump maps.
633  */
634 static caddr_t crashdumpmap;
635 
636 /*
637  * Internal flags for pmap_enter()'s helper functions.
638  */
639 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
640 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
641 
642 static void	free_pv_chunk(struct pv_chunk *pc);
643 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
644 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
645 static int	popcnt_pc_map_pq(uint64_t *map);
646 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
647 static void	reserve_pv_entries(pmap_t pmap, int needed,
648 		    struct rwlock **lockp);
649 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
650 		    struct rwlock **lockp);
651 static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
652 		    u_int flags, struct rwlock **lockp);
653 #if VM_NRESERVLEVEL > 0
654 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
655 		    struct rwlock **lockp);
656 #endif
657 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
658 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
659 		    vm_offset_t va);
660 
661 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode,
662     bool noflush);
663 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
664 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
665     vm_offset_t va, struct rwlock **lockp);
666 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
667     vm_offset_t va);
668 static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
669 		    vm_prot_t prot, struct rwlock **lockp);
670 static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
671 		    u_int flags, vm_page_t m, struct rwlock **lockp);
672 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
673     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
674 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
675 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
676 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
677     vm_offset_t eva);
678 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
679     vm_offset_t eva);
680 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
681 		    pd_entry_t pde);
682 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
683 static vm_page_t pmap_large_map_getptp_unlocked(void);
684 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
685 #if VM_NRESERVLEVEL > 0
686 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
687     struct rwlock **lockp);
688 #endif
689 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
690     vm_prot_t prot);
691 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
692 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
693     bool exec);
694 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
695 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
696 static void pmap_pti_wire_pte(void *pte);
697 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
698     struct spglist *free, struct rwlock **lockp);
699 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
700     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
701 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
702 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
703     struct spglist *free);
704 static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
705 		    pd_entry_t *pde, struct spglist *free,
706 		    struct rwlock **lockp);
707 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
708     vm_page_t m, struct rwlock **lockp);
709 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
710     pd_entry_t newpde);
711 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
712 
713 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
714 		struct rwlock **lockp);
715 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
716 		struct rwlock **lockp);
717 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
718 		struct rwlock **lockp);
719 
720 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
721     struct spglist *free);
722 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
723 
724 /********************/
725 /* Inline functions */
726 /********************/
727 
728 /* Return a non-clipped PD index for a given VA */
729 static __inline vm_pindex_t
730 pmap_pde_pindex(vm_offset_t va)
731 {
732 	return (va >> PDRSHIFT);
733 }
734 
735 
736 /* Return a pointer to the PML4 slot that corresponds to a VA */
737 static __inline pml4_entry_t *
738 pmap_pml4e(pmap_t pmap, vm_offset_t va)
739 {
740 
741 	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
742 }
743 
744 /* Return a pointer to the PDP slot that corresponds to a VA */
745 static __inline pdp_entry_t *
746 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
747 {
748 	pdp_entry_t *pdpe;
749 
750 	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
751 	return (&pdpe[pmap_pdpe_index(va)]);
752 }
753 
754 /* Return a pointer to the PDP slot that corresponds to a VA */
755 static __inline pdp_entry_t *
756 pmap_pdpe(pmap_t pmap, vm_offset_t va)
757 {
758 	pml4_entry_t *pml4e;
759 	pt_entry_t PG_V;
760 
761 	PG_V = pmap_valid_bit(pmap);
762 	pml4e = pmap_pml4e(pmap, va);
763 	if ((*pml4e & PG_V) == 0)
764 		return (NULL);
765 	return (pmap_pml4e_to_pdpe(pml4e, va));
766 }
767 
768 /* Return a pointer to the PD slot that corresponds to a VA */
769 static __inline pd_entry_t *
770 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
771 {
772 	pd_entry_t *pde;
773 
774 	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
775 	return (&pde[pmap_pde_index(va)]);
776 }
777 
778 /* Return a pointer to the PD slot that corresponds to a VA */
779 static __inline pd_entry_t *
780 pmap_pde(pmap_t pmap, vm_offset_t va)
781 {
782 	pdp_entry_t *pdpe;
783 	pt_entry_t PG_V;
784 
785 	PG_V = pmap_valid_bit(pmap);
786 	pdpe = pmap_pdpe(pmap, va);
787 	if (pdpe == NULL || (*pdpe & PG_V) == 0)
788 		return (NULL);
789 	return (pmap_pdpe_to_pde(pdpe, va));
790 }
791 
792 /* Return a pointer to the PT slot that corresponds to a VA */
793 static __inline pt_entry_t *
794 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
795 {
796 	pt_entry_t *pte;
797 
798 	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
799 	return (&pte[pmap_pte_index(va)]);
800 }
801 
802 /* Return a pointer to the PT slot that corresponds to a VA */
803 static __inline pt_entry_t *
804 pmap_pte(pmap_t pmap, vm_offset_t va)
805 {
806 	pd_entry_t *pde;
807 	pt_entry_t PG_V;
808 
809 	PG_V = pmap_valid_bit(pmap);
810 	pde = pmap_pde(pmap, va);
811 	if (pde == NULL || (*pde & PG_V) == 0)
812 		return (NULL);
813 	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
814 		return ((pt_entry_t *)pde);
815 	return (pmap_pde_to_pte(pde, va));
816 }
817 
818 static __inline void
819 pmap_resident_count_inc(pmap_t pmap, int count)
820 {
821 
822 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
823 	pmap->pm_stats.resident_count += count;
824 }
825 
826 static __inline void
827 pmap_resident_count_dec(pmap_t pmap, int count)
828 {
829 
830 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
831 	KASSERT(pmap->pm_stats.resident_count >= count,
832 	    ("pmap %p resident count underflow %ld %d", pmap,
833 	    pmap->pm_stats.resident_count, count));
834 	pmap->pm_stats.resident_count -= count;
835 }
836 
837 PMAP_INLINE pt_entry_t *
838 vtopte(vm_offset_t va)
839 {
840 	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
841 
842 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
843 
844 	return (PTmap + ((va >> PAGE_SHIFT) & mask));
845 }
846 
847 static __inline pd_entry_t *
848 vtopde(vm_offset_t va)
849 {
850 	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
851 
852 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
853 
854 	return (PDmap + ((va >> PDRSHIFT) & mask));
855 }
856 
857 static u_int64_t
858 allocpages(vm_paddr_t *firstaddr, int n)
859 {
860 	u_int64_t ret;
861 
862 	ret = *firstaddr;
863 	bzero((void *)ret, n * PAGE_SIZE);
864 	*firstaddr += n * PAGE_SIZE;
865 	return (ret);
866 }
867 
868 CTASSERT(powerof2(NDMPML4E));
869 
870 /* number of kernel PDP slots */
871 #define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)
872 
873 static void
874 nkpt_init(vm_paddr_t addr)
875 {
876 	int pt_pages;
877 
878 #ifdef NKPT
879 	pt_pages = NKPT;
880 #else
881 	pt_pages = howmany(addr, 1 << PDRSHIFT);
882 	pt_pages += NKPDPE(pt_pages);
883 
884 	/*
885 	 * Add some slop beyond the bare minimum required for bootstrapping
886 	 * the kernel.
887 	 *
888 	 * This is quite important when allocating KVA for kernel modules.
889 	 * The modules are required to be linked in the negative 2GB of
890 	 * the address space.  If we run out of KVA in this region then
891 	 * pmap_growkernel() will need to allocate page table pages to map
892 	 * the entire 512GB of KVA space which is an unnecessary tax on
893 	 * physical memory.
894 	 *
895 	 * Secondly, device memory mapped as part of setting up the low-
896 	 * level console(s) is taken from KVA, starting at virtual_avail.
897 	 * This is because cninit() is called after pmap_bootstrap() but
898 	 * before vm_init() and pmap_init(). 20MB for a frame buffer is
899 	 * not uncommon.
900 	 */
901 	pt_pages += 32;		/* 64MB additional slop. */
902 #endif
903 	nkpt = pt_pages;
904 }
905 
906 /*
907  * Returns the proper write/execute permission for a physical page that is
908  * part of the initial boot allocations.
909  *
910  * If the page has kernel text, it is marked as read-only. If the page has
911  * kernel read-only data, it is marked as read-only/not-executable. If the
912  * page has only read-write data, it is marked as read-write/not-executable.
913  * If the page is below/above the kernel range, it is marked as read-write.
914  *
915  * This function operates on 2M pages, since we map the kernel space that
916  * way.
917  *
918  * Note that this doesn't currently provide any protection for modules.
919  */
920 static inline pt_entry_t
921 bootaddr_rwx(vm_paddr_t pa)
922 {
923 
924 	/*
925 	 * Everything in the same 2M page as the start of the kernel
926 	 * should be static. On the other hand, things in the same 2M
927 	 * page as the end of the kernel could be read-write/executable,
928 	 * as the kernel image is not guaranteed to end on a 2M boundary.
929 	 */
930 	if (pa < trunc_2mpage(btext - KERNBASE) ||
931 	   pa >= trunc_2mpage(_end - KERNBASE))
932 		return (X86_PG_RW);
933 	/*
934 	 * The linker should ensure that the read-only and read-write
935 	 * portions don't share the same 2M page, so this shouldn't
936 	 * impact read-only data. However, in any case, any page with
937 	 * read-write data needs to be read-write.
938 	 */
939 	if (pa >= trunc_2mpage(brwsection - KERNBASE))
940 		return (X86_PG_RW | pg_nx);
941 	/*
942 	 * Mark any 2M page containing kernel text as read-only. Mark
943 	 * other pages with read-only data as read-only and not executable.
944 	 * (It is likely a small portion of the read-only data section will
945 	 * be marked as read-only, but executable. This should be acceptable
946 	 * since the read-only protection will keep the data from changing.)
947 	 * Note that fixups to the .text section will still work until we
948 	 * set CR0.WP.
949 	 */
950 	if (pa < round_2mpage(etext - KERNBASE))
951 		return (0);
952 	return (pg_nx);
953 }
954 
955 static void
956 create_pagetables(vm_paddr_t *firstaddr)
957 {
958 	int i, j, ndm1g, nkpdpe, nkdmpde;
959 	pt_entry_t *pt_p;
960 	pd_entry_t *pd_p;
961 	pdp_entry_t *pdp_p;
962 	pml4_entry_t *p4_p;
963 	uint64_t DMPDkernphys;
964 
965 	/* Allocate page table pages for the direct map */
966 	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
967 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
968 		ndmpdp = 4;
969 	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
970 	if (ndmpdpphys > NDMPML4E) {
971 		/*
972 		 * Each NDMPML4E allows 512 GB, so limit to that,
973 		 * and then readjust ndmpdp and ndmpdpphys.
974 		 */
975 		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
976 		Maxmem = atop(NDMPML4E * NBPML4);
977 		ndmpdpphys = NDMPML4E;
978 		ndmpdp = NDMPML4E * NPDEPG;
979 	}
980 	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
981 	ndm1g = 0;
982 	if ((amd_feature & AMDID_PAGE1GB) != 0) {
983 		/*
984 		 * Calculate the number of 1G pages that will fully fit in
985 		 * Maxmem.
986 		 */
987 		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
988 
989 		/*
990 		 * Allocate 2M pages for the kernel. These will be used in
991 		 * place of the first one or more 1G pages from ndm1g.
992 		 */
993 		nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
994 		DMPDkernphys = allocpages(firstaddr, nkdmpde);
995 	}
996 	if (ndm1g < ndmpdp)
997 		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
998 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
999 
1000 	/* Allocate pages */
1001 	KPML4phys = allocpages(firstaddr, 1);
1002 	KPDPphys = allocpages(firstaddr, NKPML4E);
1003 
1004 	/*
1005 	 * Allocate the initial number of kernel page table pages required to
1006 	 * bootstrap.  We defer this until after all memory-size dependent
1007 	 * allocations are done (e.g. direct map), so that we don't have to
1008 	 * build in too much slop in our estimate.
1009 	 *
1010 	 * Note that when NKPML4E > 1, we have an empty page underneath
1011 	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
1012 	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
1013 	 */
1014 	nkpt_init(*firstaddr);
1015 	nkpdpe = NKPDPE(nkpt);
1016 
1017 	KPTphys = allocpages(firstaddr, nkpt);
1018 	KPDphys = allocpages(firstaddr, nkpdpe);
1019 
1020 	/* Fill in the underlying page table pages */
1021 	/* XXX not fully used, underneath 2M pages */
1022 	pt_p = (pt_entry_t *)KPTphys;
1023 	for (i = 0; ptoa(i) < *firstaddr; i++)
1024 		pt_p[i] = ptoa(i) | X86_PG_V | pg_g | bootaddr_rwx(ptoa(i));
1025 
1026 	/* Now map the page tables at their location within PTmap */
1027 	pd_p = (pd_entry_t *)KPDphys;
1028 	for (i = 0; i < nkpt; i++)
1029 		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1030 
1031 	/* Map from zero to end of allocations under 2M pages */
1032 	/* This replaces some of the KPTphys entries above */
1033 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
1034 		/* Preset PG_M and PG_A because demotion expects it. */
1035 		pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
1036 		    X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
1037 
1038 	/*
1039 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
1040 	 * to record the physical blocks we've actually mapped into kernel
1041 	 * virtual address space.
1042 	 */
1043 	*firstaddr = round_2mpage(*firstaddr);
1044 
1045 	/* And connect up the PD to the PDP (leaving room for L4 pages) */
1046 	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
1047 	for (i = 0; i < nkpdpe; i++)
1048 		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1049 
1050 	/*
1051 	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
1052 	 * the end of physical memory is not aligned to a 1GB page boundary,
1053 	 * then the residual physical memory is mapped with 2MB pages.  Later,
1054 	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
1055 	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
1056 	 * that are partially used.
1057 	 */
1058 	pd_p = (pd_entry_t *)DMPDphys;
1059 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
1060 		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
1061 		/* Preset PG_M and PG_A because demotion expects it. */
1062 		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1063 		    X86_PG_M | X86_PG_A | pg_nx;
1064 	}
1065 	pdp_p = (pdp_entry_t *)DMPDPphys;
1066 	for (i = 0; i < ndm1g; i++) {
1067 		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
1068 		/* Preset PG_M and PG_A because demotion expects it. */
1069 		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1070 		    X86_PG_M | X86_PG_A | pg_nx;
1071 	}
1072 	for (j = 0; i < ndmpdp; i++, j++) {
1073 		pdp_p[i] = DMPDphys + ptoa(j);
1074 		pdp_p[i] |= X86_PG_RW | X86_PG_V;
1075 	}
1076 
1077 	/*
1078 	 * Instead of using a 1G page for the memory containing the kernel,
1079 	 * use 2M pages with appropriate permissions. (If using 1G pages,
1080 	 * this will partially overwrite the PDPEs above.)
1081 	 */
1082 	if (ndm1g) {
1083 		pd_p = (pd_entry_t *)DMPDkernphys;
1084 		for (i = 0; i < (NPDEPG * nkdmpde); i++)
1085 			pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
1086 			    X86_PG_M | X86_PG_A | pg_nx |
1087 			    bootaddr_rwx(i << PDRSHIFT);
1088 		for (i = 0; i < nkdmpde; i++)
1089 			pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
1090 			    X86_PG_V;
1091 	}
1092 
1093 	/* And recursively map PML4 to itself in order to get PTmap */
1094 	p4_p = (pml4_entry_t *)KPML4phys;
1095 	p4_p[PML4PML4I] = KPML4phys;
1096 	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
1097 
1098 	/* Connect the Direct Map slot(s) up to the PML4. */
1099 	for (i = 0; i < ndmpdpphys; i++) {
1100 		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
1101 		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V;
1102 	}
1103 
1104 	/* Connect the KVA slots up to the PML4 */
1105 	for (i = 0; i < NKPML4E; i++) {
1106 		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
1107 		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
1108 	}
1109 }
1110 
1111 /*
1112  *	Bootstrap the system enough to run with virtual memory.
1113  *
1114  *	On amd64 this is called after mapping has already been enabled
1115  *	and just syncs the pmap module with what has already been done.
1116  *	[We can't call it easily with mapping off since the kernel is not
1117  *	mapped with PA == VA, hence we would have to relocate every address
1118  *	from the linked base (virtual) address "KERNBASE" to the actual
1119  *	(physical) address starting relative to 0]
1120  */
1121 void
1122 pmap_bootstrap(vm_paddr_t *firstaddr)
1123 {
1124 	vm_offset_t va;
1125 	pt_entry_t *pte;
1126 	uint64_t cr4;
1127 	u_long res;
1128 	int i;
1129 
1130 	KERNend = *firstaddr;
1131 	res = atop(KERNend - (vm_paddr_t)kernphys);
1132 
1133 	if (!pti)
1134 		pg_g = X86_PG_G;
1135 
1136 	/*
1137 	 * Create an initial set of page tables to run the kernel in.
1138 	 */
1139 	create_pagetables(firstaddr);
1140 
1141 	/*
1142 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
1143 	 * preallocated kernel page table pages so that vm_page structures
1144 	 * representing these pages will be created.  The vm_page structures
1145 	 * are required for promotion of the corresponding kernel virtual
1146 	 * addresses to superpage mappings.
1147 	 */
1148 	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1149 
1150 	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
1151 	virtual_end = VM_MAX_KERNEL_ADDRESS;
1152 
1153 	/*
1154 	 * Enable PG_G global pages, then switch to the kernel page
1155 	 * table from the bootstrap page table.  After the switch, it
1156 	 * is possible to enable SMEP and SMAP since PG_U bits are
1157 	 * correct now.
1158 	 */
1159 	cr4 = rcr4();
1160 	cr4 |= CR4_PGE;
1161 	load_cr4(cr4);
1162 	load_cr3(KPML4phys);
1163 	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
1164 		cr4 |= CR4_SMEP;
1165 	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
1166 		cr4 |= CR4_SMAP;
1167 	load_cr4(cr4);
1168 
1169 	/*
1170 	 * Initialize the kernel pmap (which is statically allocated).
1171 	 * Count bootstrap data as being resident in case any of this data is
1172 	 * later unmapped (using pmap_remove()) and freed.
1173 	 */
1174 	PMAP_LOCK_INIT(kernel_pmap);
1175 	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
1176 	kernel_pmap->pm_cr3 = KPML4phys;
1177 	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
1178 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
1179 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1180 	kernel_pmap->pm_stats.resident_count = res;
1181 	kernel_pmap->pm_flags = pmap_flags;
1182 
1183  	/*
1184 	 * Initialize the TLB invalidations generation number lock.
1185 	 */
1186 	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
1187 
1188 	/*
1189 	 * Reserve some special page table entries/VA space for temporary
1190 	 * mapping of pages.
1191 	 */
1192 #define	SYSMAP(c, p, v, n)	\
1193 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
1194 
1195 	va = virtual_avail;
1196 	pte = vtopte(va);
1197 
1198 	/*
1199 	 * Crashdump maps.  The first page is reused as CMAP1 for the
1200 	 * memory test.
1201 	 */
1202 	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
1203 	CADDR1 = crashdumpmap;
1204 
1205 	virtual_avail = va;
1206 
1207 	/*
1208 	 * Initialize the PAT MSR.
1209 	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
1210 	 * side-effect, invalidates stale PG_G TLB entries that might
1211 	 * have been created in our pre-boot environment.
1212 	 */
1213 	pmap_init_pat();
1214 
1215 	/* Initialize TLB Context Id. */
1216 	if (pmap_pcid_enabled) {
1217 		for (i = 0; i < MAXCPU; i++) {
1218 			kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
1219 			kernel_pmap->pm_pcids[i].pm_gen = 1;
1220 		}
1221 
1222 		/*
1223 		 * PMAP_PCID_KERN + 1 is used for initialization of
1224 		 * proc0 pmap.  The pmap' pcid state might be used by
1225 		 * EFIRT entry before first context switch, so it
1226 		 * needs to be valid.
1227 		 */
1228 		PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
1229 		PCPU_SET(pcid_gen, 1);
1230 
1231 		/*
1232 		 * pcpu area for APs is zeroed during AP startup.
1233 		 * pc_pcid_next and pc_pcid_gen are initialized by AP
1234 		 * during pcpu setup.
1235 		 */
1236 		load_cr4(rcr4() | CR4_PCIDE);
1237 	}
1238 }
1239 
1240 /*
1241  * Setup the PAT MSR.
1242  */
1243 void
1244 pmap_init_pat(void)
1245 {
1246 	uint64_t pat_msr;
1247 	u_long cr0, cr4;
1248 	int i;
1249 
1250 	/* Bail if this CPU doesn't implement PAT. */
1251 	if ((cpu_feature & CPUID_PAT) == 0)
1252 		panic("no PAT??");
1253 
1254 	/* Set default PAT index table. */
1255 	for (i = 0; i < PAT_INDEX_SIZE; i++)
1256 		pat_index[i] = -1;
1257 	pat_index[PAT_WRITE_BACK] = 0;
1258 	pat_index[PAT_WRITE_THROUGH] = 1;
1259 	pat_index[PAT_UNCACHEABLE] = 3;
1260 	pat_index[PAT_WRITE_COMBINING] = 6;
1261 	pat_index[PAT_WRITE_PROTECTED] = 5;
1262 	pat_index[PAT_UNCACHED] = 2;
1263 
1264 	/*
1265 	 * Initialize default PAT entries.
1266 	 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
1267 	 * Program 5 and 6 as WP and WC.
1268 	 *
1269 	 * Leave 4 and 7 as WB and UC.  Note that a recursive page table
1270 	 * mapping for a 2M page uses a PAT value with the bit 3 set due
1271 	 * to its overload with PG_PS.
1272 	 */
1273 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
1274 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
1275 	    PAT_VALUE(2, PAT_UNCACHED) |
1276 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
1277 	    PAT_VALUE(4, PAT_WRITE_BACK) |
1278 	    PAT_VALUE(5, PAT_WRITE_PROTECTED) |
1279 	    PAT_VALUE(6, PAT_WRITE_COMBINING) |
1280 	    PAT_VALUE(7, PAT_UNCACHEABLE);
1281 
1282 	/* Disable PGE. */
1283 	cr4 = rcr4();
1284 	load_cr4(cr4 & ~CR4_PGE);
1285 
1286 	/* Disable caches (CD = 1, NW = 0). */
1287 	cr0 = rcr0();
1288 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1289 
1290 	/* Flushes caches and TLBs. */
1291 	wbinvd();
1292 	invltlb();
1293 
1294 	/* Update PAT and index table. */
1295 	wrmsr(MSR_PAT, pat_msr);
1296 
1297 	/* Flush caches and TLBs again. */
1298 	wbinvd();
1299 	invltlb();
1300 
1301 	/* Restore caches and PGE. */
1302 	load_cr0(cr0);
1303 	load_cr4(cr4);
1304 }
1305 
1306 /*
1307  *	Initialize a vm_page's machine-dependent fields.
1308  */
1309 void
1310 pmap_page_init(vm_page_t m)
1311 {
1312 
1313 	TAILQ_INIT(&m->md.pv_list);
1314 	m->md.pat_mode = PAT_WRITE_BACK;
1315 }
1316 
1317 /*
1318  *	Initialize the pmap module.
1319  *	Called by vm_init, to initialize any structures that the pmap
1320  *	system needs to map virtual memory.
1321  */
1322 void
1323 pmap_init(void)
1324 {
1325 	struct pmap_preinit_mapping *ppim;
1326 	vm_page_t m, mpte;
1327 	vm_size_t s;
1328 	int error, i, pv_npg, ret, skz63;
1329 
1330 	/* L1TF, reserve page @0 unconditionally */
1331 	vm_page_blacklist_add(0, bootverbose);
1332 
1333 	/* Detect bare-metal Skylake Server and Skylake-X. */
1334 	if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
1335 	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
1336 		/*
1337 		 * Skylake-X errata SKZ63. Processor May Hang When
1338 		 * Executing Code In an HLE Transaction Region between
1339 		 * 40000000H and 403FFFFFH.
1340 		 *
1341 		 * Mark the pages in the range as preallocated.  It
1342 		 * seems to be impossible to distinguish between
1343 		 * Skylake Server and Skylake X.
1344 		 */
1345 		skz63 = 1;
1346 		TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
1347 		if (skz63 != 0) {
1348 			if (bootverbose)
1349 				printf("SKZ63: skipping 4M RAM starting "
1350 				    "at physical 1G\n");
1351 			for (i = 0; i < atop(0x400000); i++) {
1352 				ret = vm_page_blacklist_add(0x40000000 +
1353 				    ptoa(i), FALSE);
1354 				if (!ret && bootverbose)
1355 					printf("page at %#lx already used\n",
1356 					    0x40000000 + ptoa(i));
1357 			}
1358 		}
1359 	}
1360 
1361 	/*
1362 	 * Initialize the vm page array entries for the kernel pmap's
1363 	 * page table pages.
1364 	 */
1365 	PMAP_LOCK(kernel_pmap);
1366 	for (i = 0; i < nkpt; i++) {
1367 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1368 		KASSERT(mpte >= vm_page_array &&
1369 		    mpte < &vm_page_array[vm_page_array_size],
1370 		    ("pmap_init: page table page is out of range"));
1371 		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1372 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1373 		mpte->wire_count = 1;
1374 		if (i << PDRSHIFT < KERNend &&
1375 		    pmap_insert_pt_page(kernel_pmap, mpte))
1376 			panic("pmap_init: pmap_insert_pt_page failed");
1377 	}
1378 	PMAP_UNLOCK(kernel_pmap);
1379 	vm_wire_add(nkpt);
1380 
1381 	/*
1382 	 * If the kernel is running on a virtual machine, then it must assume
1383 	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1384 	 * be prepared for the hypervisor changing the vendor and family that
1385 	 * are reported by CPUID.  Consequently, the workaround for AMD Family
1386 	 * 10h Erratum 383 is enabled if the processor's feature set does not
1387 	 * include at least one feature that is only supported by older Intel
1388 	 * or newer AMD processors.
1389 	 */
1390 	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
1391 	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1392 	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1393 	    AMDID2_FMA4)) == 0)
1394 		workaround_erratum383 = 1;
1395 
1396 	/*
1397 	 * Are large page mappings enabled?
1398 	 */
1399 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1400 	if (pg_ps_enabled) {
1401 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1402 		    ("pmap_init: can't assign to pagesizes[1]"));
1403 		pagesizes[1] = NBPDR;
1404 	}
1405 
1406 	/*
1407 	 * Initialize the pv chunk list mutex.
1408 	 */
1409 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1410 
1411 	/*
1412 	 * Initialize the pool of pv list locks.
1413 	 */
1414 	for (i = 0; i < NPV_LIST_LOCKS; i++)
1415 		rw_init(&pv_list_locks[i], "pmap pv list");
1416 
1417 	/*
1418 	 * Calculate the size of the pv head table for superpages.
1419 	 */
1420 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
1421 
1422 	/*
1423 	 * Allocate memory for the pv head table for superpages.
1424 	 */
1425 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1426 	s = round_page(s);
1427 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
1428 	for (i = 0; i < pv_npg; i++)
1429 		TAILQ_INIT(&pv_table[i].pv_list);
1430 	TAILQ_INIT(&pv_dummy.pv_list);
1431 
1432 	pmap_initialized = 1;
1433 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
1434 		ppim = pmap_preinit_mapping + i;
1435 		if (ppim->va == 0)
1436 			continue;
1437 		/* Make the direct map consistent */
1438 		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
1439 			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
1440 			    ppim->sz, ppim->mode);
1441 		}
1442 		if (!bootverbose)
1443 			continue;
1444 		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
1445 		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
1446 	}
1447 
1448 	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
1449 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
1450 	    (vmem_addr_t *)&qframe);
1451 	if (error != 0)
1452 		panic("qframe allocation failed");
1453 
1454 	lm_ents = 8;
1455 	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
1456 	if (lm_ents > LMEPML4I - LMSPML4I + 1)
1457 		lm_ents = LMEPML4I - LMSPML4I + 1;
1458 	if (bootverbose)
1459 		printf("pmap: large map %u PML4 slots (%lu Gb)\n",
1460 		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
1461 	if (lm_ents != 0) {
1462 		large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
1463 		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
1464 		if (large_vmem == NULL) {
1465 			printf("pmap: cannot create large map\n");
1466 			lm_ents = 0;
1467 		}
1468 		for (i = 0; i < lm_ents; i++) {
1469 			m = pmap_large_map_getptp_unlocked();
1470 			kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
1471 			    X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
1472 			    VM_PAGE_TO_PHYS(m);
1473 		}
1474 	}
1475 }
1476 
1477 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1478     "2MB page mapping counters");
1479 
1480 static u_long pmap_pde_demotions;
1481 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1482     &pmap_pde_demotions, 0, "2MB page demotions");
1483 
1484 static u_long pmap_pde_mappings;
1485 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1486     &pmap_pde_mappings, 0, "2MB page mappings");
1487 
1488 static u_long pmap_pde_p_failures;
1489 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1490     &pmap_pde_p_failures, 0, "2MB page promotion failures");
1491 
1492 static u_long pmap_pde_promotions;
1493 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1494     &pmap_pde_promotions, 0, "2MB page promotions");
1495 
1496 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1497     "1GB page mapping counters");
1498 
1499 static u_long pmap_pdpe_demotions;
1500 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1501     &pmap_pdpe_demotions, 0, "1GB page demotions");
1502 
1503 /***************************************************
1504  * Low level helper routines.....
1505  ***************************************************/
1506 
1507 static pt_entry_t
1508 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1509 {
1510 	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1511 
1512 	switch (pmap->pm_type) {
1513 	case PT_X86:
1514 	case PT_RVI:
1515 		/* Verify that both PAT bits are not set at the same time */
1516 		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1517 		    ("Invalid PAT bits in entry %#lx", entry));
1518 
1519 		/* Swap the PAT bits if one of them is set */
1520 		if ((entry & x86_pat_bits) != 0)
1521 			entry ^= x86_pat_bits;
1522 		break;
1523 	case PT_EPT:
1524 		/*
1525 		 * Nothing to do - the memory attributes are represented
1526 		 * the same way for regular pages and superpages.
1527 		 */
1528 		break;
1529 	default:
1530 		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
1531 	}
1532 
1533 	return (entry);
1534 }
1535 
1536 boolean_t
1537 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
1538 {
1539 
1540 	return (mode >= 0 && mode < PAT_INDEX_SIZE &&
1541 	    pat_index[(int)mode] >= 0);
1542 }
1543 
1544 /*
1545  * Determine the appropriate bits to set in a PTE or PDE for a specified
1546  * caching mode.
1547  */
1548 int
1549 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1550 {
1551 	int cache_bits, pat_flag, pat_idx;
1552 
1553 	if (!pmap_is_valid_memattr(pmap, mode))
1554 		panic("Unknown caching mode %d\n", mode);
1555 
1556 	switch (pmap->pm_type) {
1557 	case PT_X86:
1558 	case PT_RVI:
1559 		/* The PAT bit is different for PTE's and PDE's. */
1560 		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1561 
1562 		/* Map the caching mode to a PAT index. */
1563 		pat_idx = pat_index[mode];
1564 
1565 		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1566 		cache_bits = 0;
1567 		if (pat_idx & 0x4)
1568 			cache_bits |= pat_flag;
1569 		if (pat_idx & 0x2)
1570 			cache_bits |= PG_NC_PCD;
1571 		if (pat_idx & 0x1)
1572 			cache_bits |= PG_NC_PWT;
1573 		break;
1574 
1575 	case PT_EPT:
1576 		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1577 		break;
1578 
1579 	default:
1580 		panic("unsupported pmap type %d", pmap->pm_type);
1581 	}
1582 
1583 	return (cache_bits);
1584 }
1585 
1586 static int
1587 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1588 {
1589 	int mask;
1590 
1591 	switch (pmap->pm_type) {
1592 	case PT_X86:
1593 	case PT_RVI:
1594 		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1595 		break;
1596 	case PT_EPT:
1597 		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1598 		break;
1599 	default:
1600 		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1601 	}
1602 
1603 	return (mask);
1604 }
1605 
1606 bool
1607 pmap_ps_enabled(pmap_t pmap)
1608 {
1609 
1610 	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1611 }
1612 
1613 static void
1614 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1615 {
1616 
1617 	switch (pmap->pm_type) {
1618 	case PT_X86:
1619 		break;
1620 	case PT_RVI:
1621 	case PT_EPT:
1622 		/*
1623 		 * XXX
1624 		 * This is a little bogus since the generation number is
1625 		 * supposed to be bumped up when a region of the address
1626 		 * space is invalidated in the page tables.
1627 		 *
1628 		 * In this case the old PDE entry is valid but yet we want
1629 		 * to make sure that any mappings using the old entry are
1630 		 * invalidated in the TLB.
1631 		 *
1632 		 * The reason this works as expected is because we rendezvous
1633 		 * "all" host cpus and force any vcpu context to exit as a
1634 		 * side-effect.
1635 		 */
1636 		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1637 		break;
1638 	default:
1639 		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1640 	}
1641 	pde_store(pde, newpde);
1642 }
1643 
1644 /*
1645  * After changing the page size for the specified virtual address in the page
1646  * table, flush the corresponding entries from the processor's TLB.  Only the
1647  * calling processor's TLB is affected.
1648  *
1649  * The calling thread must be pinned to a processor.
1650  */
1651 static void
1652 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1653 {
1654 	pt_entry_t PG_G;
1655 
1656 	if (pmap_type_guest(pmap))
1657 		return;
1658 
1659 	KASSERT(pmap->pm_type == PT_X86,
1660 	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1661 
1662 	PG_G = pmap_global_bit(pmap);
1663 
1664 	if ((newpde & PG_PS) == 0)
1665 		/* Demotion: flush a specific 2MB page mapping. */
1666 		invlpg(va);
1667 	else if ((newpde & PG_G) == 0)
1668 		/*
1669 		 * Promotion: flush every 4KB page mapping from the TLB
1670 		 * because there are too many to flush individually.
1671 		 */
1672 		invltlb();
1673 	else {
1674 		/*
1675 		 * Promotion: flush every 4KB page mapping from the TLB,
1676 		 * including any global (PG_G) mappings.
1677 		 */
1678 		invltlb_glob();
1679 	}
1680 }
1681 #ifdef SMP
1682 
1683 /*
1684  * For SMP, these functions have to use the IPI mechanism for coherence.
1685  *
1686  * N.B.: Before calling any of the following TLB invalidation functions,
1687  * the calling processor must ensure that all stores updating a non-
1688  * kernel page table are globally performed.  Otherwise, another
1689  * processor could cache an old, pre-update entry without being
1690  * invalidated.  This can happen one of two ways: (1) The pmap becomes
1691  * active on another processor after its pm_active field is checked by
1692  * one of the following functions but before a store updating the page
1693  * table is globally performed. (2) The pmap becomes active on another
1694  * processor before its pm_active field is checked but due to
1695  * speculative loads one of the following functions stills reads the
1696  * pmap as inactive on the other processor.
1697  *
1698  * The kernel page table is exempt because its pm_active field is
1699  * immutable.  The kernel page table is always active on every
1700  * processor.
1701  */
1702 
1703 /*
1704  * Interrupt the cpus that are executing in the guest context.
1705  * This will force the vcpu to exit and the cached EPT mappings
1706  * will be invalidated by the host before the next vmresume.
1707  */
1708 static __inline void
1709 pmap_invalidate_ept(pmap_t pmap)
1710 {
1711 	int ipinum;
1712 
1713 	sched_pin();
1714 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1715 	    ("pmap_invalidate_ept: absurd pm_active"));
1716 
1717 	/*
1718 	 * The TLB mappings associated with a vcpu context are not
1719 	 * flushed each time a different vcpu is chosen to execute.
1720 	 *
1721 	 * This is in contrast with a process's vtop mappings that
1722 	 * are flushed from the TLB on each context switch.
1723 	 *
1724 	 * Therefore we need to do more than just a TLB shootdown on
1725 	 * the active cpus in 'pmap->pm_active'. To do this we keep
1726 	 * track of the number of invalidations performed on this pmap.
1727 	 *
1728 	 * Each vcpu keeps a cache of this counter and compares it
1729 	 * just before a vmresume. If the counter is out-of-date an
1730 	 * invept will be done to flush stale mappings from the TLB.
1731 	 */
1732 	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1733 
1734 	/*
1735 	 * Force the vcpu to exit and trap back into the hypervisor.
1736 	 */
1737 	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1738 	ipi_selected(pmap->pm_active, ipinum);
1739 	sched_unpin();
1740 }
1741 
1742 static cpuset_t
1743 pmap_invalidate_cpu_mask(pmap_t pmap)
1744 {
1745 
1746 	return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
1747 }
1748 
1749 static inline void
1750 pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
1751     const bool invpcid_works1)
1752 {
1753 	struct invpcid_descr d;
1754 	uint64_t kcr3, ucr3;
1755 	uint32_t pcid;
1756 	u_int cpuid, i;
1757 
1758 	cpuid = PCPU_GET(cpuid);
1759 	if (pmap == PCPU_GET(curpmap)) {
1760 		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
1761 			/*
1762 			 * Because pm_pcid is recalculated on a
1763 			 * context switch, we must disable switching.
1764 			 * Otherwise, we might use a stale value
1765 			 * below.
1766 			 */
1767 			critical_enter();
1768 			pcid = pmap->pm_pcids[cpuid].pm_pcid;
1769 			if (invpcid_works1) {
1770 				d.pcid = pcid | PMAP_PCID_USER_PT;
1771 				d.pad = 0;
1772 				d.addr = va;
1773 				invpcid(&d, INVPCID_ADDR);
1774 			} else {
1775 				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
1776 				ucr3 = pmap->pm_ucr3 | pcid |
1777 				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
1778 				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
1779 			}
1780 			critical_exit();
1781 		}
1782 	} else
1783 		pmap->pm_pcids[cpuid].pm_gen = 0;
1784 
1785 	CPU_FOREACH(i) {
1786 		if (cpuid != i)
1787 			pmap->pm_pcids[i].pm_gen = 0;
1788 	}
1789 
1790 	/*
1791 	 * The fence is between stores to pm_gen and the read of the
1792 	 * pm_active mask.  We need to ensure that it is impossible
1793 	 * for us to miss the bit update in pm_active and
1794 	 * simultaneously observe a non-zero pm_gen in
1795 	 * pmap_activate_sw(), otherwise TLB update is missed.
1796 	 * Without the fence, IA32 allows such an outcome.  Note that
1797 	 * pm_active is updated by a locked operation, which provides
1798 	 * the reciprocal fence.
1799 	 */
1800 	atomic_thread_fence_seq_cst();
1801 }
1802 
1803 static void
1804 pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
1805 {
1806 
1807 	pmap_invalidate_page_pcid(pmap, va, true);
1808 }
1809 
1810 static void
1811 pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
1812 {
1813 
1814 	pmap_invalidate_page_pcid(pmap, va, false);
1815 }
1816 
1817 static void
1818 pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
1819 {
1820 }
1821 
1822 DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t),
1823     static)
1824 {
1825 
1826 	if (pmap_pcid_enabled)
1827 		return (invpcid_works ? pmap_invalidate_page_pcid_invpcid :
1828 		    pmap_invalidate_page_pcid_noinvpcid);
1829 	return (pmap_invalidate_page_nopcid);
1830 }
1831 
1832 void
1833 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1834 {
1835 
1836 	if (pmap_type_guest(pmap)) {
1837 		pmap_invalidate_ept(pmap);
1838 		return;
1839 	}
1840 
1841 	KASSERT(pmap->pm_type == PT_X86,
1842 	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1843 
1844 	sched_pin();
1845 	if (pmap == kernel_pmap) {
1846 		invlpg(va);
1847 	} else {
1848 		if (pmap == PCPU_GET(curpmap))
1849 			invlpg(va);
1850 		pmap_invalidate_page_mode(pmap, va);
1851 	}
1852 	smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap);
1853 	sched_unpin();
1854 }
1855 
1856 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
1857 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
1858 
1859 static void
1860 pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1861     const bool invpcid_works1)
1862 {
1863 	struct invpcid_descr d;
1864 	uint64_t kcr3, ucr3;
1865 	uint32_t pcid;
1866 	u_int cpuid, i;
1867 
1868 	cpuid = PCPU_GET(cpuid);
1869 	if (pmap == PCPU_GET(curpmap)) {
1870 		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
1871 			critical_enter();
1872 			pcid = pmap->pm_pcids[cpuid].pm_pcid;
1873 			if (invpcid_works1) {
1874 				d.pcid = pcid | PMAP_PCID_USER_PT;
1875 				d.pad = 0;
1876 				d.addr = sva;
1877 				for (; d.addr < eva; d.addr += PAGE_SIZE)
1878 					invpcid(&d, INVPCID_ADDR);
1879 			} else {
1880 				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
1881 				ucr3 = pmap->pm_ucr3 | pcid |
1882 				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
1883 				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
1884 			}
1885 			critical_exit();
1886 		}
1887 	} else
1888 		pmap->pm_pcids[cpuid].pm_gen = 0;
1889 
1890 	CPU_FOREACH(i) {
1891 		if (cpuid != i)
1892 			pmap->pm_pcids[i].pm_gen = 0;
1893 	}
1894 	/* See the comment in pmap_invalidate_page_pcid(). */
1895 	atomic_thread_fence_seq_cst();
1896 }
1897 
1898 static void
1899 pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva,
1900     vm_offset_t eva)
1901 {
1902 
1903 	pmap_invalidate_range_pcid(pmap, sva, eva, true);
1904 }
1905 
1906 static void
1907 pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva,
1908     vm_offset_t eva)
1909 {
1910 
1911 	pmap_invalidate_range_pcid(pmap, sva, eva, false);
1912 }
1913 
1914 static void
1915 pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1916 {
1917 }
1918 
1919 DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t,
1920     vm_offset_t), static)
1921 {
1922 
1923 	if (pmap_pcid_enabled)
1924 		return (invpcid_works ? pmap_invalidate_range_pcid_invpcid :
1925 		    pmap_invalidate_range_pcid_noinvpcid);
1926 	return (pmap_invalidate_range_nopcid);
1927 }
1928 
1929 void
1930 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1931 {
1932 	vm_offset_t addr;
1933 
1934 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
1935 		pmap_invalidate_all(pmap);
1936 		return;
1937 	}
1938 
1939 	if (pmap_type_guest(pmap)) {
1940 		pmap_invalidate_ept(pmap);
1941 		return;
1942 	}
1943 
1944 	KASSERT(pmap->pm_type == PT_X86,
1945 	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1946 
1947 	sched_pin();
1948 	if (pmap == kernel_pmap) {
1949 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1950 			invlpg(addr);
1951 	} else {
1952 		if (pmap == PCPU_GET(curpmap)) {
1953 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1954 				invlpg(addr);
1955 		}
1956 		pmap_invalidate_range_mode(pmap, sva, eva);
1957 	}
1958 	smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap);
1959 	sched_unpin();
1960 }
1961 
1962 static inline void
1963 pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
1964 {
1965 	struct invpcid_descr d;
1966 	uint64_t kcr3, ucr3;
1967 	uint32_t pcid;
1968 	u_int cpuid, i;
1969 
1970 	if (pmap == kernel_pmap) {
1971 		if (invpcid_works1) {
1972 			bzero(&d, sizeof(d));
1973 			invpcid(&d, INVPCID_CTXGLOB);
1974 		} else {
1975 			invltlb_glob();
1976 		}
1977 	} else {
1978 		cpuid = PCPU_GET(cpuid);
1979 		if (pmap == PCPU_GET(curpmap)) {
1980 			critical_enter();
1981 			pcid = pmap->pm_pcids[cpuid].pm_pcid;
1982 			if (invpcid_works1) {
1983 				d.pcid = pcid;
1984 				d.pad = 0;
1985 				d.addr = 0;
1986 				invpcid(&d, INVPCID_CTX);
1987 				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
1988 					d.pcid |= PMAP_PCID_USER_PT;
1989 					invpcid(&d, INVPCID_CTX);
1990 				}
1991 			} else {
1992 				kcr3 = pmap->pm_cr3 | pcid;
1993 				ucr3 = pmap->pm_ucr3;
1994 				if (ucr3 != PMAP_NO_CR3) {
1995 					ucr3 |= pcid | PMAP_PCID_USER_PT;
1996 					pmap_pti_pcid_invalidate(ucr3, kcr3);
1997 				} else {
1998 					load_cr3(kcr3);
1999 				}
2000 			}
2001 			critical_exit();
2002 		} else
2003 			pmap->pm_pcids[cpuid].pm_gen = 0;
2004 		CPU_FOREACH(i) {
2005 			if (cpuid != i)
2006 				pmap->pm_pcids[i].pm_gen = 0;
2007 		}
2008 	}
2009 	/* See the comment in pmap_invalidate_page_pcid(). */
2010 	atomic_thread_fence_seq_cst();
2011 }
2012 
2013 static void
2014 pmap_invalidate_all_pcid_invpcid(pmap_t pmap)
2015 {
2016 
2017 	pmap_invalidate_all_pcid(pmap, true);
2018 }
2019 
2020 static void
2021 pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap)
2022 {
2023 
2024 	pmap_invalidate_all_pcid(pmap, false);
2025 }
2026 
2027 static void
2028 pmap_invalidate_all_nopcid(pmap_t pmap)
2029 {
2030 
2031 	if (pmap == kernel_pmap)
2032 		invltlb_glob();
2033 	else if (pmap == PCPU_GET(curpmap))
2034 		invltlb();
2035 }
2036 
2037 DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static)
2038 {
2039 
2040 	if (pmap_pcid_enabled)
2041 		return (invpcid_works ? pmap_invalidate_all_pcid_invpcid :
2042 		    pmap_invalidate_all_pcid_noinvpcid);
2043 	return (pmap_invalidate_all_nopcid);
2044 }
2045 
2046 void
2047 pmap_invalidate_all(pmap_t pmap)
2048 {
2049 
2050 	if (pmap_type_guest(pmap)) {
2051 		pmap_invalidate_ept(pmap);
2052 		return;
2053 	}
2054 
2055 	KASSERT(pmap->pm_type == PT_X86,
2056 	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
2057 
2058 	sched_pin();
2059 	pmap_invalidate_all_mode(pmap);
2060 	smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap);
2061 	sched_unpin();
2062 }
2063 
2064 void
2065 pmap_invalidate_cache(void)
2066 {
2067 
2068 	sched_pin();
2069 	wbinvd();
2070 	smp_cache_flush();
2071 	sched_unpin();
2072 }
2073 
2074 struct pde_action {
2075 	cpuset_t invalidate;	/* processors that invalidate their TLB */
2076 	pmap_t pmap;
2077 	vm_offset_t va;
2078 	pd_entry_t *pde;
2079 	pd_entry_t newpde;
2080 	u_int store;		/* processor that updates the PDE */
2081 };
2082 
2083 static void
2084 pmap_update_pde_action(void *arg)
2085 {
2086 	struct pde_action *act = arg;
2087 
2088 	if (act->store == PCPU_GET(cpuid))
2089 		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
2090 }
2091 
2092 static void
2093 pmap_update_pde_teardown(void *arg)
2094 {
2095 	struct pde_action *act = arg;
2096 
2097 	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
2098 		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
2099 }
2100 
2101 /*
2102  * Change the page size for the specified virtual address in a way that
2103  * prevents any possibility of the TLB ever having two entries that map the
2104  * same virtual address using different page sizes.  This is the recommended
2105  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
2106  * machine check exception for a TLB state that is improperly diagnosed as a
2107  * hardware error.
2108  */
2109 static void
2110 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
2111 {
2112 	struct pde_action act;
2113 	cpuset_t active, other_cpus;
2114 	u_int cpuid;
2115 
2116 	sched_pin();
2117 	cpuid = PCPU_GET(cpuid);
2118 	other_cpus = all_cpus;
2119 	CPU_CLR(cpuid, &other_cpus);
2120 	if (pmap == kernel_pmap || pmap_type_guest(pmap))
2121 		active = all_cpus;
2122 	else {
2123 		active = pmap->pm_active;
2124 	}
2125 	if (CPU_OVERLAP(&active, &other_cpus)) {
2126 		act.store = cpuid;
2127 		act.invalidate = active;
2128 		act.va = va;
2129 		act.pmap = pmap;
2130 		act.pde = pde;
2131 		act.newpde = newpde;
2132 		CPU_SET(cpuid, &active);
2133 		smp_rendezvous_cpus(active,
2134 		    smp_no_rendezvous_barrier, pmap_update_pde_action,
2135 		    pmap_update_pde_teardown, &act);
2136 	} else {
2137 		pmap_update_pde_store(pmap, pde, newpde);
2138 		if (CPU_ISSET(cpuid, &active))
2139 			pmap_update_pde_invalidate(pmap, va, newpde);
2140 	}
2141 	sched_unpin();
2142 }
2143 #else /* !SMP */
2144 /*
2145  * Normal, non-SMP, invalidation functions.
2146  */
2147 void
2148 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
2149 {
2150 	struct invpcid_descr d;
2151 	uint64_t kcr3, ucr3;
2152 	uint32_t pcid;
2153 
2154 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2155 		pmap->pm_eptgen++;
2156 		return;
2157 	}
2158 	KASSERT(pmap->pm_type == PT_X86,
2159 	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
2160 
2161 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
2162 		invlpg(va);
2163 		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
2164 		    pmap->pm_ucr3 != PMAP_NO_CR3) {
2165 			critical_enter();
2166 			pcid = pmap->pm_pcids[0].pm_pcid;
2167 			if (invpcid_works) {
2168 				d.pcid = pcid | PMAP_PCID_USER_PT;
2169 				d.pad = 0;
2170 				d.addr = va;
2171 				invpcid(&d, INVPCID_ADDR);
2172 			} else {
2173 				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
2174 				ucr3 = pmap->pm_ucr3 | pcid |
2175 				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2176 				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
2177 			}
2178 			critical_exit();
2179 		}
2180 	} else if (pmap_pcid_enabled)
2181 		pmap->pm_pcids[0].pm_gen = 0;
2182 }
2183 
2184 void
2185 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2186 {
2187 	struct invpcid_descr d;
2188 	vm_offset_t addr;
2189 	uint64_t kcr3, ucr3;
2190 
2191 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2192 		pmap->pm_eptgen++;
2193 		return;
2194 	}
2195 	KASSERT(pmap->pm_type == PT_X86,
2196 	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
2197 
2198 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
2199 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
2200 			invlpg(addr);
2201 		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
2202 		    pmap->pm_ucr3 != PMAP_NO_CR3) {
2203 			critical_enter();
2204 			if (invpcid_works) {
2205 				d.pcid = pmap->pm_pcids[0].pm_pcid |
2206 				    PMAP_PCID_USER_PT;
2207 				d.pad = 0;
2208 				d.addr = sva;
2209 				for (; d.addr < eva; d.addr += PAGE_SIZE)
2210 					invpcid(&d, INVPCID_ADDR);
2211 			} else {
2212 				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
2213 				    pm_pcid | CR3_PCID_SAVE;
2214 				ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
2215 				    pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2216 				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
2217 			}
2218 			critical_exit();
2219 		}
2220 	} else if (pmap_pcid_enabled) {
2221 		pmap->pm_pcids[0].pm_gen = 0;
2222 	}
2223 }
2224 
2225 void
2226 pmap_invalidate_all(pmap_t pmap)
2227 {
2228 	struct invpcid_descr d;
2229 	uint64_t kcr3, ucr3;
2230 
2231 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2232 		pmap->pm_eptgen++;
2233 		return;
2234 	}
2235 	KASSERT(pmap->pm_type == PT_X86,
2236 	    ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
2237 
2238 	if (pmap == kernel_pmap) {
2239 		if (pmap_pcid_enabled && invpcid_works) {
2240 			bzero(&d, sizeof(d));
2241 			invpcid(&d, INVPCID_CTXGLOB);
2242 		} else {
2243 			invltlb_glob();
2244 		}
2245 	} else if (pmap == PCPU_GET(curpmap)) {
2246 		if (pmap_pcid_enabled) {
2247 			critical_enter();
2248 			if (invpcid_works) {
2249 				d.pcid = pmap->pm_pcids[0].pm_pcid;
2250 				d.pad = 0;
2251 				d.addr = 0;
2252 				invpcid(&d, INVPCID_CTX);
2253 				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2254 					d.pcid |= PMAP_PCID_USER_PT;
2255 					invpcid(&d, INVPCID_CTX);
2256 				}
2257 			} else {
2258 				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
2259 				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2260 					ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
2261 					    0].pm_pcid | PMAP_PCID_USER_PT;
2262 					pmap_pti_pcid_invalidate(ucr3, kcr3);
2263 				} else
2264 					load_cr3(kcr3);
2265 			}
2266 			critical_exit();
2267 		} else {
2268 			invltlb();
2269 		}
2270 	} else if (pmap_pcid_enabled) {
2271 		pmap->pm_pcids[0].pm_gen = 0;
2272 	}
2273 }
2274 
2275 PMAP_INLINE void
2276 pmap_invalidate_cache(void)
2277 {
2278 
2279 	wbinvd();
2280 }
2281 
2282 static void
2283 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
2284 {
2285 
2286 	pmap_update_pde_store(pmap, pde, newpde);
2287 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
2288 		pmap_update_pde_invalidate(pmap, va, newpde);
2289 	else
2290 		pmap->pm_pcids[0].pm_gen = 0;
2291 }
2292 #endif /* !SMP */
2293 
2294 static void
2295 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
2296 {
2297 
2298 	/*
2299 	 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
2300 	 * by a promotion that did not invalidate the 512 4KB page mappings
2301 	 * that might exist in the TLB.  Consequently, at this point, the TLB
2302 	 * may hold both 4KB and 2MB page mappings for the address range [va,
2303 	 * va + NBPDR).  Therefore, the entire range must be invalidated here.
2304 	 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
2305 	 * 4KB page mappings for the address range [va, va + NBPDR), and so a
2306 	 * single INVLPG suffices to invalidate the 2MB page mapping from the
2307 	 * TLB.
2308 	 */
2309 	if ((pde & PG_PROMOTED) != 0)
2310 		pmap_invalidate_range(pmap, va, va + NBPDR - 1);
2311 	else
2312 		pmap_invalidate_page(pmap, va);
2313 }
2314 
2315 DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
2316     (vm_offset_t sva, vm_offset_t eva), static)
2317 {
2318 
2319 	if ((cpu_feature & CPUID_SS) != 0)
2320 		return (pmap_invalidate_cache_range_selfsnoop);
2321 	if ((cpu_feature & CPUID_CLFSH) != 0)
2322 		return (pmap_force_invalidate_cache_range);
2323 	return (pmap_invalidate_cache_range_all);
2324 }
2325 
2326 #define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
2327 
2328 static void
2329 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
2330 {
2331 
2332 	KASSERT((sva & PAGE_MASK) == 0,
2333 	    ("pmap_invalidate_cache_range: sva not page-aligned"));
2334 	KASSERT((eva & PAGE_MASK) == 0,
2335 	    ("pmap_invalidate_cache_range: eva not page-aligned"));
2336 }
2337 
2338 static void
2339 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
2340 {
2341 
2342 	pmap_invalidate_cache_range_check_align(sva, eva);
2343 }
2344 
2345 void
2346 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
2347 {
2348 
2349 	sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
2350 
2351 	/*
2352 	 * XXX: Some CPUs fault, hang, or trash the local APIC
2353 	 * registers if we use CLFLUSH on the local APIC range.  The
2354 	 * local APIC is always uncached, so we don't need to flush
2355 	 * for that range anyway.
2356 	 */
2357 	if (pmap_kextract(sva) == lapic_paddr)
2358 		return;
2359 
2360 	if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
2361 		/*
2362 		 * Do per-cache line flush.  Use the sfence
2363 		 * instruction to insure that previous stores are
2364 		 * included in the write-back.  The processor
2365 		 * propagates flush to other processors in the cache
2366 		 * coherence domain.
2367 		 */
2368 		sfence();
2369 		for (; sva < eva; sva += cpu_clflush_line_size)
2370 			clflushopt(sva);
2371 		sfence();
2372 	} else {
2373 		/*
2374 		 * Writes are ordered by CLFLUSH on Intel CPUs.
2375 		 */
2376 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
2377 			mfence();
2378 		for (; sva < eva; sva += cpu_clflush_line_size)
2379 			clflush(sva);
2380 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
2381 			mfence();
2382 	}
2383 }
2384 
2385 static void
2386 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
2387 {
2388 
2389 	pmap_invalidate_cache_range_check_align(sva, eva);
2390 	pmap_invalidate_cache();
2391 }
2392 
2393 /*
2394  * Remove the specified set of pages from the data and instruction caches.
2395  *
2396  * In contrast to pmap_invalidate_cache_range(), this function does not
2397  * rely on the CPU's self-snoop feature, because it is intended for use
2398  * when moving pages into a different cache domain.
2399  */
2400 void
2401 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
2402 {
2403 	vm_offset_t daddr, eva;
2404 	int i;
2405 	bool useclflushopt;
2406 
2407 	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
2408 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
2409 	    ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
2410 		pmap_invalidate_cache();
2411 	else {
2412 		if (useclflushopt)
2413 			sfence();
2414 		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
2415 			mfence();
2416 		for (i = 0; i < count; i++) {
2417 			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
2418 			eva = daddr + PAGE_SIZE;
2419 			for (; daddr < eva; daddr += cpu_clflush_line_size) {
2420 				if (useclflushopt)
2421 					clflushopt(daddr);
2422 				else
2423 					clflush(daddr);
2424 			}
2425 		}
2426 		if (useclflushopt)
2427 			sfence();
2428 		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
2429 			mfence();
2430 	}
2431 }
2432 
2433 void
2434 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
2435 {
2436 
2437 	pmap_invalidate_cache_range_check_align(sva, eva);
2438 
2439 	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
2440 		pmap_force_invalidate_cache_range(sva, eva);
2441 		return;
2442 	}
2443 
2444 	/* See comment in pmap_force_invalidate_cache_range(). */
2445 	if (pmap_kextract(sva) == lapic_paddr)
2446 		return;
2447 
2448 	sfence();
2449 	for (; sva < eva; sva += cpu_clflush_line_size)
2450 		clwb(sva);
2451 	sfence();
2452 }
2453 
2454 void
2455 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
2456 {
2457 	pt_entry_t *pte;
2458 	vm_offset_t vaddr;
2459 	int error, pte_bits;
2460 
2461 	KASSERT((spa & PAGE_MASK) == 0,
2462 	    ("pmap_flush_cache_phys_range: spa not page-aligned"));
2463 	KASSERT((epa & PAGE_MASK) == 0,
2464 	    ("pmap_flush_cache_phys_range: epa not page-aligned"));
2465 
2466 	if (spa < dmaplimit) {
2467 		pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
2468 		    dmaplimit, epa)));
2469 		if (dmaplimit >= epa)
2470 			return;
2471 		spa = dmaplimit;
2472 	}
2473 
2474 	pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
2475 	    X86_PG_V;
2476 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
2477 	    &vaddr);
2478 	KASSERT(error == 0, ("vmem_alloc failed: %d", error));
2479 	pte = vtopte(vaddr);
2480 	for (; spa < epa; spa += PAGE_SIZE) {
2481 		sched_pin();
2482 		pte_store(pte, spa | pte_bits);
2483 		invlpg(vaddr);
2484 		/* XXXKIB sfences inside flush_cache_range are excessive */
2485 		pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
2486 		sched_unpin();
2487 	}
2488 	vmem_free(kernel_arena, vaddr, PAGE_SIZE);
2489 }
2490 
2491 /*
2492  *	Routine:	pmap_extract
2493  *	Function:
2494  *		Extract the physical page address associated
2495  *		with the given map/virtual_address pair.
2496  */
2497 vm_paddr_t
2498 pmap_extract(pmap_t pmap, vm_offset_t va)
2499 {
2500 	pdp_entry_t *pdpe;
2501 	pd_entry_t *pde;
2502 	pt_entry_t *pte, PG_V;
2503 	vm_paddr_t pa;
2504 
2505 	pa = 0;
2506 	PG_V = pmap_valid_bit(pmap);
2507 	PMAP_LOCK(pmap);
2508 	pdpe = pmap_pdpe(pmap, va);
2509 	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2510 		if ((*pdpe & PG_PS) != 0)
2511 			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
2512 		else {
2513 			pde = pmap_pdpe_to_pde(pdpe, va);
2514 			if ((*pde & PG_V) != 0) {
2515 				if ((*pde & PG_PS) != 0) {
2516 					pa = (*pde & PG_PS_FRAME) |
2517 					    (va & PDRMASK);
2518 				} else {
2519 					pte = pmap_pde_to_pte(pde, va);
2520 					pa = (*pte & PG_FRAME) |
2521 					    (va & PAGE_MASK);
2522 				}
2523 			}
2524 		}
2525 	}
2526 	PMAP_UNLOCK(pmap);
2527 	return (pa);
2528 }
2529 
2530 /*
2531  *	Routine:	pmap_extract_and_hold
2532  *	Function:
2533  *		Atomically extract and hold the physical page
2534  *		with the given pmap and virtual address pair
2535  *		if that mapping permits the given protection.
2536  */
2537 vm_page_t
2538 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
2539 {
2540 	pd_entry_t pde, *pdep;
2541 	pt_entry_t pte, PG_RW, PG_V;
2542 	vm_paddr_t pa;
2543 	vm_page_t m;
2544 
2545 	pa = 0;
2546 	m = NULL;
2547 	PG_RW = pmap_rw_bit(pmap);
2548 	PG_V = pmap_valid_bit(pmap);
2549 	PMAP_LOCK(pmap);
2550 retry:
2551 	pdep = pmap_pde(pmap, va);
2552 	if (pdep != NULL && (pde = *pdep)) {
2553 		if (pde & PG_PS) {
2554 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
2555 				if (vm_page_pa_tryrelock(pmap, (pde &
2556 				    PG_PS_FRAME) | (va & PDRMASK), &pa))
2557 					goto retry;
2558 				m = PHYS_TO_VM_PAGE(pa);
2559 			}
2560 		} else {
2561 			pte = *pmap_pde_to_pte(pdep, va);
2562 			if ((pte & PG_V) &&
2563 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
2564 				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
2565 				    &pa))
2566 					goto retry;
2567 				m = PHYS_TO_VM_PAGE(pa);
2568 			}
2569 		}
2570 		if (m != NULL)
2571 			vm_page_hold(m);
2572 	}
2573 	PA_UNLOCK_COND(pa);
2574 	PMAP_UNLOCK(pmap);
2575 	return (m);
2576 }
2577 
2578 vm_paddr_t
2579 pmap_kextract(vm_offset_t va)
2580 {
2581 	pd_entry_t pde;
2582 	vm_paddr_t pa;
2583 
2584 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
2585 		pa = DMAP_TO_PHYS(va);
2586 	} else {
2587 		pde = *vtopde(va);
2588 		if (pde & PG_PS) {
2589 			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
2590 		} else {
2591 			/*
2592 			 * Beware of a concurrent promotion that changes the
2593 			 * PDE at this point!  For example, vtopte() must not
2594 			 * be used to access the PTE because it would use the
2595 			 * new PDE.  It is, however, safe to use the old PDE
2596 			 * because the page table page is preserved by the
2597 			 * promotion.
2598 			 */
2599 			pa = *pmap_pde_to_pte(&pde, va);
2600 			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
2601 		}
2602 	}
2603 	return (pa);
2604 }
2605 
2606 /***************************************************
2607  * Low level mapping routines.....
2608  ***************************************************/
2609 
2610 /*
2611  * Add a wired page to the kva.
2612  * Note: not SMP coherent.
2613  */
2614 PMAP_INLINE void
2615 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
2616 {
2617 	pt_entry_t *pte;
2618 
2619 	pte = vtopte(va);
2620 	pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g);
2621 }
2622 
2623 static __inline void
2624 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
2625 {
2626 	pt_entry_t *pte;
2627 	int cache_bits;
2628 
2629 	pte = vtopte(va);
2630 	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
2631 	pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits);
2632 }
2633 
2634 /*
2635  * Remove a page from the kernel pagetables.
2636  * Note: not SMP coherent.
2637  */
2638 PMAP_INLINE void
2639 pmap_kremove(vm_offset_t va)
2640 {
2641 	pt_entry_t *pte;
2642 
2643 	pte = vtopte(va);
2644 	pte_clear(pte);
2645 }
2646 
2647 /*
2648  *	Used to map a range of physical addresses into kernel
2649  *	virtual address space.
2650  *
2651  *	The value passed in '*virt' is a suggested virtual address for
2652  *	the mapping. Architectures which can support a direct-mapped
2653  *	physical to virtual region can return the appropriate address
2654  *	within that region, leaving '*virt' unchanged. Other
2655  *	architectures should map the pages starting at '*virt' and
2656  *	update '*virt' with the first usable address after the mapped
2657  *	region.
2658  */
2659 vm_offset_t
2660 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2661 {
2662 	return PHYS_TO_DMAP(start);
2663 }
2664 
2665 
2666 /*
2667  * Add a list of wired pages to the kva
2668  * this routine is only used for temporary
2669  * kernel mappings that do not need to have
2670  * page modification or references recorded.
2671  * Note that old mappings are simply written
2672  * over.  The page *must* be wired.
2673  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2674  */
2675 void
2676 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2677 {
2678 	pt_entry_t *endpte, oldpte, pa, *pte;
2679 	vm_page_t m;
2680 	int cache_bits;
2681 
2682 	oldpte = 0;
2683 	pte = vtopte(sva);
2684 	endpte = pte + count;
2685 	while (pte < endpte) {
2686 		m = *ma++;
2687 		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
2688 		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
2689 		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
2690 			oldpte |= *pte;
2691 			pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V);
2692 		}
2693 		pte++;
2694 	}
2695 	if (__predict_false((oldpte & X86_PG_V) != 0))
2696 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
2697 		    PAGE_SIZE);
2698 }
2699 
2700 /*
2701  * This routine tears out page mappings from the
2702  * kernel -- it is meant only for temporary mappings.
2703  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2704  */
2705 void
2706 pmap_qremove(vm_offset_t sva, int count)
2707 {
2708 	vm_offset_t va;
2709 
2710 	va = sva;
2711 	while (count-- > 0) {
2712 		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2713 		pmap_kremove(va);
2714 		va += PAGE_SIZE;
2715 	}
2716 	pmap_invalidate_range(kernel_pmap, sva, va);
2717 }
2718 
2719 /***************************************************
2720  * Page table page management routines.....
2721  ***************************************************/
2722 /*
2723  * Schedule the specified unused page table page to be freed.  Specifically,
2724  * add the page to the specified list of pages that will be released to the
2725  * physical memory manager after the TLB has been updated.
2726  */
2727 static __inline void
2728 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2729     boolean_t set_PG_ZERO)
2730 {
2731 
2732 	if (set_PG_ZERO)
2733 		m->flags |= PG_ZERO;
2734 	else
2735 		m->flags &= ~PG_ZERO;
2736 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2737 }
2738 
2739 /*
2740  * Inserts the specified page table page into the specified pmap's collection
2741  * of idle page table pages.  Each of a pmap's page table pages is responsible
2742  * for mapping a distinct range of virtual addresses.  The pmap's collection is
2743  * ordered by this virtual address range.
2744  */
2745 static __inline int
2746 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2747 {
2748 
2749 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2750 	return (vm_radix_insert(&pmap->pm_root, mpte));
2751 }
2752 
2753 /*
2754  * Removes the page table page mapping the specified virtual address from the
2755  * specified pmap's collection of idle page table pages, and returns it.
2756  * Otherwise, returns NULL if there is no page table page corresponding to the
2757  * specified virtual address.
2758  */
2759 static __inline vm_page_t
2760 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
2761 {
2762 
2763 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2764 	return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
2765 }
2766 
2767 /*
2768  * Decrements a page table page's wire count, which is used to record the
2769  * number of valid page table entries within the page.  If the wire count
2770  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2771  * page table page was unmapped and FALSE otherwise.
2772  */
2773 static inline boolean_t
2774 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2775 {
2776 
2777 	--m->wire_count;
2778 	if (m->wire_count == 0) {
2779 		_pmap_unwire_ptp(pmap, va, m, free);
2780 		return (TRUE);
2781 	} else
2782 		return (FALSE);
2783 }
2784 
2785 static void
2786 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2787 {
2788 
2789 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2790 	/*
2791 	 * unmap the page table page
2792 	 */
2793 	if (m->pindex >= (NUPDE + NUPDPE)) {
2794 		/* PDP page */
2795 		pml4_entry_t *pml4;
2796 		pml4 = pmap_pml4e(pmap, va);
2797 		*pml4 = 0;
2798 		if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
2799 			pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
2800 			*pml4 = 0;
2801 		}
2802 	} else if (m->pindex >= NUPDE) {
2803 		/* PD page */
2804 		pdp_entry_t *pdp;
2805 		pdp = pmap_pdpe(pmap, va);
2806 		*pdp = 0;
2807 	} else {
2808 		/* PTE page */
2809 		pd_entry_t *pd;
2810 		pd = pmap_pde(pmap, va);
2811 		*pd = 0;
2812 	}
2813 	pmap_resident_count_dec(pmap, 1);
2814 	if (m->pindex < NUPDE) {
2815 		/* We just released a PT, unhold the matching PD */
2816 		vm_page_t pdpg;
2817 
2818 		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2819 		pmap_unwire_ptp(pmap, va, pdpg, free);
2820 	}
2821 	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2822 		/* We just released a PD, unhold the matching PDP */
2823 		vm_page_t pdppg;
2824 
2825 		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2826 		pmap_unwire_ptp(pmap, va, pdppg, free);
2827 	}
2828 
2829 	/*
2830 	 * Put page on a list so that it is released after
2831 	 * *ALL* TLB shootdown is done
2832 	 */
2833 	pmap_add_delayed_free_list(m, free, TRUE);
2834 }
2835 
2836 /*
2837  * After removing a page table entry, this routine is used to
2838  * conditionally free the page, and manage the hold/wire counts.
2839  */
2840 static int
2841 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2842     struct spglist *free)
2843 {
2844 	vm_page_t mpte;
2845 
2846 	if (va >= VM_MAXUSER_ADDRESS)
2847 		return (0);
2848 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2849 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2850 	return (pmap_unwire_ptp(pmap, va, mpte, free));
2851 }
2852 
2853 void
2854 pmap_pinit0(pmap_t pmap)
2855 {
2856 	int i;
2857 
2858 	PMAP_LOCK_INIT(pmap);
2859 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2860 	pmap->pm_pml4u = NULL;
2861 	pmap->pm_cr3 = KPML4phys;
2862 	/* hack to keep pmap_pti_pcid_invalidate() alive */
2863 	pmap->pm_ucr3 = PMAP_NO_CR3;
2864 	pmap->pm_root.rt_root = 0;
2865 	CPU_ZERO(&pmap->pm_active);
2866 	TAILQ_INIT(&pmap->pm_pvchunk);
2867 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2868 	pmap->pm_flags = pmap_flags;
2869 	CPU_FOREACH(i) {
2870 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1;
2871 		pmap->pm_pcids[i].pm_gen = 1;
2872 	}
2873 	pmap_activate_boot(pmap);
2874 
2875 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
2876 		pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
2877 		    sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
2878 		    UMA_ALIGN_PTR, 0);
2879 	}
2880 }
2881 
2882 void
2883 pmap_pinit_pml4(vm_page_t pml4pg)
2884 {
2885 	pml4_entry_t *pm_pml4;
2886 	int i;
2887 
2888 	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
2889 
2890 	/* Wire in kernel global address entries. */
2891 	for (i = 0; i < NKPML4E; i++) {
2892 		pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
2893 		    X86_PG_V;
2894 	}
2895 	for (i = 0; i < ndmpdpphys; i++) {
2896 		pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
2897 		    X86_PG_V;
2898 	}
2899 
2900 	/* install self-referential address mapping entry(s) */
2901 	pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
2902 	    X86_PG_A | X86_PG_M;
2903 
2904 	/* install large map entries if configured */
2905 	for (i = 0; i < lm_ents; i++)
2906 		pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
2907 }
2908 
2909 static void
2910 pmap_pinit_pml4_pti(vm_page_t pml4pg)
2911 {
2912 	pml4_entry_t *pm_pml4;
2913 	int i;
2914 
2915 	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
2916 	for (i = 0; i < NPML4EPG; i++)
2917 		pm_pml4[i] = pti_pml4[i];
2918 }
2919 
2920 /*
2921  * Initialize a preallocated and zeroed pmap structure,
2922  * such as one in a vmspace structure.
2923  */
2924 int
2925 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2926 {
2927 	vm_page_t pml4pg, pml4pgu;
2928 	vm_paddr_t pml4phys;
2929 	int i;
2930 
2931 	/*
2932 	 * allocate the page directory page
2933 	 */
2934 	pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2935 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
2936 
2937 	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2938 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2939 	CPU_FOREACH(i) {
2940 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
2941 		pmap->pm_pcids[i].pm_gen = 0;
2942 	}
2943 	pmap->pm_cr3 = PMAP_NO_CR3;	/* initialize to an invalid value */
2944 	pmap->pm_ucr3 = PMAP_NO_CR3;
2945 	pmap->pm_pml4u = NULL;
2946 
2947 	pmap->pm_type = pm_type;
2948 	if ((pml4pg->flags & PG_ZERO) == 0)
2949 		pagezero(pmap->pm_pml4);
2950 
2951 	/*
2952 	 * Do not install the host kernel mappings in the nested page
2953 	 * tables. These mappings are meaningless in the guest physical
2954 	 * address space.
2955 	 * Install minimal kernel mappings in PTI case.
2956 	 */
2957 	if (pm_type == PT_X86) {
2958 		pmap->pm_cr3 = pml4phys;
2959 		pmap_pinit_pml4(pml4pg);
2960 		if (pti) {
2961 			pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2962 			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
2963 			pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
2964 			    VM_PAGE_TO_PHYS(pml4pgu));
2965 			pmap_pinit_pml4_pti(pml4pgu);
2966 			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
2967 		}
2968 		if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
2969 			rangeset_init(&pmap->pm_pkru, pkru_dup_range,
2970 			    pkru_free_range, pmap, M_NOWAIT);
2971 		}
2972 	}
2973 
2974 	pmap->pm_root.rt_root = 0;
2975 	CPU_ZERO(&pmap->pm_active);
2976 	TAILQ_INIT(&pmap->pm_pvchunk);
2977 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2978 	pmap->pm_flags = flags;
2979 	pmap->pm_eptgen = 0;
2980 
2981 	return (1);
2982 }
2983 
2984 int
2985 pmap_pinit(pmap_t pmap)
2986 {
2987 
2988 	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2989 }
2990 
2991 /*
2992  * This routine is called if the desired page table page does not exist.
2993  *
2994  * If page table page allocation fails, this routine may sleep before
2995  * returning NULL.  It sleeps only if a lock pointer was given.
2996  *
2997  * Note: If a page allocation fails at page table level two or three,
2998  * one or two pages may be held during the wait, only to be released
2999  * afterwards.  This conservative approach is easily argued to avoid
3000  * race conditions.
3001  */
3002 static vm_page_t
3003 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
3004 {
3005 	vm_page_t m, pdppg, pdpg;
3006 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
3007 
3008 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3009 
3010 	PG_A = pmap_accessed_bit(pmap);
3011 	PG_M = pmap_modified_bit(pmap);
3012 	PG_V = pmap_valid_bit(pmap);
3013 	PG_RW = pmap_rw_bit(pmap);
3014 
3015 	/*
3016 	 * Allocate a page table page.
3017 	 */
3018 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
3019 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
3020 		if (lockp != NULL) {
3021 			RELEASE_PV_LIST_LOCK(lockp);
3022 			PMAP_UNLOCK(pmap);
3023 			PMAP_ASSERT_NOT_IN_DI();
3024 			vm_wait(NULL);
3025 			PMAP_LOCK(pmap);
3026 		}
3027 
3028 		/*
3029 		 * Indicate the need to retry.  While waiting, the page table
3030 		 * page may have been allocated.
3031 		 */
3032 		return (NULL);
3033 	}
3034 	if ((m->flags & PG_ZERO) == 0)
3035 		pmap_zero_page(m);
3036 
3037 	/*
3038 	 * Map the pagetable page into the process address space, if
3039 	 * it isn't already there.
3040 	 */
3041 
3042 	if (ptepindex >= (NUPDE + NUPDPE)) {
3043 		pml4_entry_t *pml4, *pml4u;
3044 		vm_pindex_t pml4index;
3045 
3046 		/* Wire up a new PDPE page */
3047 		pml4index = ptepindex - (NUPDE + NUPDPE);
3048 		pml4 = &pmap->pm_pml4[pml4index];
3049 		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
3050 		if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
3051 			/*
3052 			 * PTI: Make all user-space mappings in the
3053 			 * kernel-mode page table no-execute so that
3054 			 * we detect any programming errors that leave
3055 			 * the kernel-mode page table active on return
3056 			 * to user space.
3057 			 */
3058 			if (pmap->pm_ucr3 != PMAP_NO_CR3)
3059 				*pml4 |= pg_nx;
3060 
3061 			pml4u = &pmap->pm_pml4u[pml4index];
3062 			*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
3063 			    PG_A | PG_M;
3064 		}
3065 
3066 	} else if (ptepindex >= NUPDE) {
3067 		vm_pindex_t pml4index;
3068 		vm_pindex_t pdpindex;
3069 		pml4_entry_t *pml4;
3070 		pdp_entry_t *pdp;
3071 
3072 		/* Wire up a new PDE page */
3073 		pdpindex = ptepindex - NUPDE;
3074 		pml4index = pdpindex >> NPML4EPGSHIFT;
3075 
3076 		pml4 = &pmap->pm_pml4[pml4index];
3077 		if ((*pml4 & PG_V) == 0) {
3078 			/* Have to allocate a new pdp, recurse */
3079 			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
3080 			    lockp) == NULL) {
3081 				vm_page_unwire_noq(m);
3082 				vm_page_free_zero(m);
3083 				return (NULL);
3084 			}
3085 		} else {
3086 			/* Add reference to pdp page */
3087 			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
3088 			pdppg->wire_count++;
3089 		}
3090 		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
3091 
3092 		/* Now find the pdp page */
3093 		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
3094 		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
3095 
3096 	} else {
3097 		vm_pindex_t pml4index;
3098 		vm_pindex_t pdpindex;
3099 		pml4_entry_t *pml4;
3100 		pdp_entry_t *pdp;
3101 		pd_entry_t *pd;
3102 
3103 		/* Wire up a new PTE page */
3104 		pdpindex = ptepindex >> NPDPEPGSHIFT;
3105 		pml4index = pdpindex >> NPML4EPGSHIFT;
3106 
3107 		/* First, find the pdp and check that its valid. */
3108 		pml4 = &pmap->pm_pml4[pml4index];
3109 		if ((*pml4 & PG_V) == 0) {
3110 			/* Have to allocate a new pd, recurse */
3111 			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
3112 			    lockp) == NULL) {
3113 				vm_page_unwire_noq(m);
3114 				vm_page_free_zero(m);
3115 				return (NULL);
3116 			}
3117 			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
3118 			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
3119 		} else {
3120 			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
3121 			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
3122 			if ((*pdp & PG_V) == 0) {
3123 				/* Have to allocate a new pd, recurse */
3124 				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
3125 				    lockp) == NULL) {
3126 					vm_page_unwire_noq(m);
3127 					vm_page_free_zero(m);
3128 					return (NULL);
3129 				}
3130 			} else {
3131 				/* Add reference to the pd page */
3132 				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
3133 				pdpg->wire_count++;
3134 			}
3135 		}
3136 		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
3137 
3138 		/* Now we know where the page directory page is */
3139 		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
3140 		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
3141 	}
3142 
3143 	pmap_resident_count_inc(pmap, 1);
3144 
3145 	return (m);
3146 }
3147 
3148 static vm_page_t
3149 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
3150 {
3151 	vm_pindex_t pdpindex, ptepindex;
3152 	pdp_entry_t *pdpe, PG_V;
3153 	vm_page_t pdpg;
3154 
3155 	PG_V = pmap_valid_bit(pmap);
3156 
3157 retry:
3158 	pdpe = pmap_pdpe(pmap, va);
3159 	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
3160 		/* Add a reference to the pd page. */
3161 		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
3162 		pdpg->wire_count++;
3163 	} else {
3164 		/* Allocate a pd page. */
3165 		ptepindex = pmap_pde_pindex(va);
3166 		pdpindex = ptepindex >> NPDPEPGSHIFT;
3167 		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
3168 		if (pdpg == NULL && lockp != NULL)
3169 			goto retry;
3170 	}
3171 	return (pdpg);
3172 }
3173 
3174 static vm_page_t
3175 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
3176 {
3177 	vm_pindex_t ptepindex;
3178 	pd_entry_t *pd, PG_V;
3179 	vm_page_t m;
3180 
3181 	PG_V = pmap_valid_bit(pmap);
3182 
3183 	/*
3184 	 * Calculate pagetable page index
3185 	 */
3186 	ptepindex = pmap_pde_pindex(va);
3187 retry:
3188 	/*
3189 	 * Get the page directory entry
3190 	 */
3191 	pd = pmap_pde(pmap, va);
3192 
3193 	/*
3194 	 * This supports switching from a 2MB page to a
3195 	 * normal 4K page.
3196 	 */
3197 	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
3198 		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
3199 			/*
3200 			 * Invalidation of the 2MB page mapping may have caused
3201 			 * the deallocation of the underlying PD page.
3202 			 */
3203 			pd = NULL;
3204 		}
3205 	}
3206 
3207 	/*
3208 	 * If the page table page is mapped, we just increment the
3209 	 * hold count, and activate it.
3210 	 */
3211 	if (pd != NULL && (*pd & PG_V) != 0) {
3212 		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
3213 		m->wire_count++;
3214 	} else {
3215 		/*
3216 		 * Here if the pte page isn't mapped, or if it has been
3217 		 * deallocated.
3218 		 */
3219 		m = _pmap_allocpte(pmap, ptepindex, lockp);
3220 		if (m == NULL && lockp != NULL)
3221 			goto retry;
3222 	}
3223 	return (m);
3224 }
3225 
3226 
3227 /***************************************************
3228  * Pmap allocation/deallocation routines.
3229  ***************************************************/
3230 
3231 /*
3232  * Release any resources held by the given physical map.
3233  * Called when a pmap initialized by pmap_pinit is being released.
3234  * Should only be called if the map contains no valid mappings.
3235  */
3236 void
3237 pmap_release(pmap_t pmap)
3238 {
3239 	vm_page_t m;
3240 	int i;
3241 
3242 	KASSERT(pmap->pm_stats.resident_count == 0,
3243 	    ("pmap_release: pmap resident count %ld != 0",
3244 	    pmap->pm_stats.resident_count));
3245 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
3246 	    ("pmap_release: pmap has reserved page table page(s)"));
3247 	KASSERT(CPU_EMPTY(&pmap->pm_active),
3248 	    ("releasing active pmap %p", pmap));
3249 
3250 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
3251 
3252 	for (i = 0; i < NKPML4E; i++)	/* KVA */
3253 		pmap->pm_pml4[KPML4BASE + i] = 0;
3254 	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
3255 		pmap->pm_pml4[DMPML4I + i] = 0;
3256 	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
3257 	for (i = 0; i < lm_ents; i++)	/* Large Map */
3258 		pmap->pm_pml4[LMSPML4I + i] = 0;
3259 
3260 	vm_page_unwire_noq(m);
3261 	vm_page_free_zero(m);
3262 
3263 	if (pmap->pm_pml4u != NULL) {
3264 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
3265 		vm_page_unwire_noq(m);
3266 		vm_page_free(m);
3267 	}
3268 	if (pmap->pm_type == PT_X86 &&
3269 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
3270 		rangeset_fini(&pmap->pm_pkru);
3271 }
3272 
3273 static int
3274 kvm_size(SYSCTL_HANDLER_ARGS)
3275 {
3276 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3277 
3278 	return sysctl_handle_long(oidp, &ksize, 0, req);
3279 }
3280 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
3281     0, 0, kvm_size, "LU", "Size of KVM");
3282 
3283 static int
3284 kvm_free(SYSCTL_HANDLER_ARGS)
3285 {
3286 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3287 
3288 	return sysctl_handle_long(oidp, &kfree, 0, req);
3289 }
3290 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
3291     0, 0, kvm_free, "LU", "Amount of KVM free");
3292 
3293 /*
3294  * grow the number of kernel page table entries, if needed
3295  */
3296 void
3297 pmap_growkernel(vm_offset_t addr)
3298 {
3299 	vm_paddr_t paddr;
3300 	vm_page_t nkpg;
3301 	pd_entry_t *pde, newpdir;
3302 	pdp_entry_t *pdpe;
3303 
3304 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
3305 
3306 	/*
3307 	 * Return if "addr" is within the range of kernel page table pages
3308 	 * that were preallocated during pmap bootstrap.  Moreover, leave
3309 	 * "kernel_vm_end" and the kernel page table as they were.
3310 	 *
3311 	 * The correctness of this action is based on the following
3312 	 * argument: vm_map_insert() allocates contiguous ranges of the
3313 	 * kernel virtual address space.  It calls this function if a range
3314 	 * ends after "kernel_vm_end".  If the kernel is mapped between
3315 	 * "kernel_vm_end" and "addr", then the range cannot begin at
3316 	 * "kernel_vm_end".  In fact, its beginning address cannot be less
3317 	 * than the kernel.  Thus, there is no immediate need to allocate
3318 	 * any new kernel page table pages between "kernel_vm_end" and
3319 	 * "KERNBASE".
3320 	 */
3321 	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
3322 		return;
3323 
3324 	addr = roundup2(addr, NBPDR);
3325 	if (addr - 1 >= vm_map_max(kernel_map))
3326 		addr = vm_map_max(kernel_map);
3327 	while (kernel_vm_end < addr) {
3328 		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
3329 		if ((*pdpe & X86_PG_V) == 0) {
3330 			/* We need a new PDP entry */
3331 			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
3332 			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
3333 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3334 			if (nkpg == NULL)
3335 				panic("pmap_growkernel: no memory to grow kernel");
3336 			if ((nkpg->flags & PG_ZERO) == 0)
3337 				pmap_zero_page(nkpg);
3338 			paddr = VM_PAGE_TO_PHYS(nkpg);
3339 			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
3340 			    X86_PG_A | X86_PG_M);
3341 			continue; /* try again */
3342 		}
3343 		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
3344 		if ((*pde & X86_PG_V) != 0) {
3345 			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
3346 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3347 				kernel_vm_end = vm_map_max(kernel_map);
3348 				break;
3349 			}
3350 			continue;
3351 		}
3352 
3353 		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
3354 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3355 		    VM_ALLOC_ZERO);
3356 		if (nkpg == NULL)
3357 			panic("pmap_growkernel: no memory to grow kernel");
3358 		if ((nkpg->flags & PG_ZERO) == 0)
3359 			pmap_zero_page(nkpg);
3360 		paddr = VM_PAGE_TO_PHYS(nkpg);
3361 		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
3362 		pde_store(pde, newpdir);
3363 
3364 		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
3365 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3366 			kernel_vm_end = vm_map_max(kernel_map);
3367 			break;
3368 		}
3369 	}
3370 }
3371 
3372 
3373 /***************************************************
3374  * page management routines.
3375  ***************************************************/
3376 
3377 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
3378 CTASSERT(_NPCM == 3);
3379 CTASSERT(_NPCPV == 168);
3380 
3381 static __inline struct pv_chunk *
3382 pv_to_chunk(pv_entry_t pv)
3383 {
3384 
3385 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
3386 }
3387 
3388 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
3389 
3390 #define	PC_FREE0	0xfffffffffffffffful
3391 #define	PC_FREE1	0xfffffffffffffffful
3392 #define	PC_FREE2	0x000000fffffffffful
3393 
3394 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
3395 
3396 #ifdef PV_STATS
3397 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
3398 
3399 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
3400 	"Current number of pv entry chunks");
3401 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
3402 	"Current number of pv entry chunks allocated");
3403 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
3404 	"Current number of pv entry chunks frees");
3405 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
3406 	"Number of times tried to get a chunk page but failed.");
3407 
3408 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
3409 static int pv_entry_spare;
3410 
3411 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
3412 	"Current number of pv entry frees");
3413 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
3414 	"Current number of pv entry allocs");
3415 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
3416 	"Current number of pv entries");
3417 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3418 	"Current number of spare pv entries");
3419 #endif
3420 
3421 static void
3422 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
3423 {
3424 
3425 	if (pmap == NULL)
3426 		return;
3427 	pmap_invalidate_all(pmap);
3428 	if (pmap != locked_pmap)
3429 		PMAP_UNLOCK(pmap);
3430 	if (start_di)
3431 		pmap_delayed_invl_finished();
3432 }
3433 
3434 /*
3435  * We are in a serious low memory condition.  Resort to
3436  * drastic measures to free some pages so we can allocate
3437  * another pv entry chunk.
3438  *
3439  * Returns NULL if PV entries were reclaimed from the specified pmap.
3440  *
3441  * We do not, however, unmap 2mpages because subsequent accesses will
3442  * allocate per-page pv entries until repromotion occurs, thereby
3443  * exacerbating the shortage of free pv entries.
3444  */
3445 static vm_page_t
3446 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3447 {
3448 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3449 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3450 	struct md_page *pvh;
3451 	pd_entry_t *pde;
3452 	pmap_t next_pmap, pmap;
3453 	pt_entry_t *pte, tpte;
3454 	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3455 	pv_entry_t pv;
3456 	vm_offset_t va;
3457 	vm_page_t m, m_pc;
3458 	struct spglist free;
3459 	uint64_t inuse;
3460 	int bit, field, freed;
3461 	bool start_di;
3462 	static int active_reclaims = 0;
3463 
3464 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3465 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3466 	pmap = NULL;
3467 	m_pc = NULL;
3468 	PG_G = PG_A = PG_M = PG_RW = 0;
3469 	SLIST_INIT(&free);
3470 	bzero(&pc_marker_b, sizeof(pc_marker_b));
3471 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3472 	pc_marker = (struct pv_chunk *)&pc_marker_b;
3473 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3474 
3475 	/*
3476 	 * A delayed invalidation block should already be active if
3477 	 * pmap_advise() or pmap_remove() called this function by way
3478 	 * of pmap_demote_pde_locked().
3479 	 */
3480 	start_di = pmap_not_in_di();
3481 
3482 	mtx_lock(&pv_chunks_mutex);
3483 	active_reclaims++;
3484 	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
3485 	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
3486 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3487 	    SLIST_EMPTY(&free)) {
3488 		next_pmap = pc->pc_pmap;
3489 		if (next_pmap == NULL) {
3490 			/*
3491 			 * The next chunk is a marker.  However, it is
3492 			 * not our marker, so active_reclaims must be
3493 			 * > 1.  Consequently, the next_chunk code
3494 			 * will not rotate the pv_chunks list.
3495 			 */
3496 			goto next_chunk;
3497 		}
3498 		mtx_unlock(&pv_chunks_mutex);
3499 
3500 		/*
3501 		 * A pv_chunk can only be removed from the pc_lru list
3502 		 * when both pc_chunks_mutex is owned and the
3503 		 * corresponding pmap is locked.
3504 		 */
3505 		if (pmap != next_pmap) {
3506 			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
3507 			    start_di);
3508 			pmap = next_pmap;
3509 			/* Avoid deadlock and lock recursion. */
3510 			if (pmap > locked_pmap) {
3511 				RELEASE_PV_LIST_LOCK(lockp);
3512 				PMAP_LOCK(pmap);
3513 				if (start_di)
3514 					pmap_delayed_invl_started();
3515 				mtx_lock(&pv_chunks_mutex);
3516 				continue;
3517 			} else if (pmap != locked_pmap) {
3518 				if (PMAP_TRYLOCK(pmap)) {
3519 					if (start_di)
3520 						pmap_delayed_invl_started();
3521 					mtx_lock(&pv_chunks_mutex);
3522 					continue;
3523 				} else {
3524 					pmap = NULL; /* pmap is not locked */
3525 					mtx_lock(&pv_chunks_mutex);
3526 					pc = TAILQ_NEXT(pc_marker, pc_lru);
3527 					if (pc == NULL ||
3528 					    pc->pc_pmap != next_pmap)
3529 						continue;
3530 					goto next_chunk;
3531 				}
3532 			} else if (start_di)
3533 				pmap_delayed_invl_started();
3534 			PG_G = pmap_global_bit(pmap);
3535 			PG_A = pmap_accessed_bit(pmap);
3536 			PG_M = pmap_modified_bit(pmap);
3537 			PG_RW = pmap_rw_bit(pmap);
3538 		}
3539 
3540 		/*
3541 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
3542 		 */
3543 		freed = 0;
3544 		for (field = 0; field < _NPCM; field++) {
3545 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3546 			    inuse != 0; inuse &= ~(1UL << bit)) {
3547 				bit = bsfq(inuse);
3548 				pv = &pc->pc_pventry[field * 64 + bit];
3549 				va = pv->pv_va;
3550 				pde = pmap_pde(pmap, va);
3551 				if ((*pde & PG_PS) != 0)
3552 					continue;
3553 				pte = pmap_pde_to_pte(pde, va);
3554 				if ((*pte & PG_W) != 0)
3555 					continue;
3556 				tpte = pte_load_clear(pte);
3557 				if ((tpte & PG_G) != 0)
3558 					pmap_invalidate_page(pmap, va);
3559 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3560 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3561 					vm_page_dirty(m);
3562 				if ((tpte & PG_A) != 0)
3563 					vm_page_aflag_set(m, PGA_REFERENCED);
3564 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3565 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3566 				m->md.pv_gen++;
3567 				if (TAILQ_EMPTY(&m->md.pv_list) &&
3568 				    (m->flags & PG_FICTITIOUS) == 0) {
3569 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3570 					if (TAILQ_EMPTY(&pvh->pv_list)) {
3571 						vm_page_aflag_clear(m,
3572 						    PGA_WRITEABLE);
3573 					}
3574 				}
3575 				pmap_delayed_invl_page(m);
3576 				pc->pc_map[field] |= 1UL << bit;
3577 				pmap_unuse_pt(pmap, va, *pde, &free);
3578 				freed++;
3579 			}
3580 		}
3581 		if (freed == 0) {
3582 			mtx_lock(&pv_chunks_mutex);
3583 			goto next_chunk;
3584 		}
3585 		/* Every freed mapping is for a 4 KB page. */
3586 		pmap_resident_count_dec(pmap, freed);
3587 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3588 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3589 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3590 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3591 		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
3592 		    pc->pc_map[2] == PC_FREE2) {
3593 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3594 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3595 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3596 			/* Entire chunk is free; return it. */
3597 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3598 			dump_drop_page(m_pc->phys_addr);
3599 			mtx_lock(&pv_chunks_mutex);
3600 			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3601 			break;
3602 		}
3603 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3604 		mtx_lock(&pv_chunks_mutex);
3605 		/* One freed pv entry in locked_pmap is sufficient. */
3606 		if (pmap == locked_pmap)
3607 			break;
3608 next_chunk:
3609 		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
3610 		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
3611 		if (active_reclaims == 1 && pmap != NULL) {
3612 			/*
3613 			 * Rotate the pv chunks list so that we do not
3614 			 * scan the same pv chunks that could not be
3615 			 * freed (because they contained a wired
3616 			 * and/or superpage mapping) on every
3617 			 * invocation of reclaim_pv_chunk().
3618 			 */
3619 			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
3620 				MPASS(pc->pc_pmap != NULL);
3621 				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3622 				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3623 			}
3624 		}
3625 	}
3626 	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
3627 	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
3628 	active_reclaims--;
3629 	mtx_unlock(&pv_chunks_mutex);
3630 	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
3631 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3632 		m_pc = SLIST_FIRST(&free);
3633 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3634 		/* Recycle a freed page table page. */
3635 		m_pc->wire_count = 1;
3636 	}
3637 	vm_page_free_pages_toq(&free, true);
3638 	return (m_pc);
3639 }
3640 
3641 /*
3642  * free the pv_entry back to the free list
3643  */
3644 static void
3645 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3646 {
3647 	struct pv_chunk *pc;
3648 	int idx, field, bit;
3649 
3650 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3651 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3652 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3653 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3654 	pc = pv_to_chunk(pv);
3655 	idx = pv - &pc->pc_pventry[0];
3656 	field = idx / 64;
3657 	bit = idx % 64;
3658 	pc->pc_map[field] |= 1ul << bit;
3659 	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
3660 	    pc->pc_map[2] != PC_FREE2) {
3661 		/* 98% of the time, pc is already at the head of the list. */
3662 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3663 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3664 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3665 		}
3666 		return;
3667 	}
3668 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3669 	free_pv_chunk(pc);
3670 }
3671 
3672 static void
3673 free_pv_chunk(struct pv_chunk *pc)
3674 {
3675 	vm_page_t m;
3676 
3677 	mtx_lock(&pv_chunks_mutex);
3678  	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3679 	mtx_unlock(&pv_chunks_mutex);
3680 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3681 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3682 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3683 	/* entire chunk is free, return it */
3684 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3685 	dump_drop_page(m->phys_addr);
3686 	vm_page_unwire(m, PQ_NONE);
3687 	vm_page_free(m);
3688 }
3689 
3690 /*
3691  * Returns a new PV entry, allocating a new PV chunk from the system when
3692  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3693  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3694  * returned.
3695  *
3696  * The given PV list lock may be released.
3697  */
3698 static pv_entry_t
3699 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3700 {
3701 	int bit, field;
3702 	pv_entry_t pv;
3703 	struct pv_chunk *pc;
3704 	vm_page_t m;
3705 
3706 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3707 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3708 retry:
3709 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3710 	if (pc != NULL) {
3711 		for (field = 0; field < _NPCM; field++) {
3712 			if (pc->pc_map[field]) {
3713 				bit = bsfq(pc->pc_map[field]);
3714 				break;
3715 			}
3716 		}
3717 		if (field < _NPCM) {
3718 			pv = &pc->pc_pventry[field * 64 + bit];
3719 			pc->pc_map[field] &= ~(1ul << bit);
3720 			/* If this was the last item, move it to tail */
3721 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
3722 			    pc->pc_map[2] == 0) {
3723 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3724 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3725 				    pc_list);
3726 			}
3727 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
3728 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3729 			return (pv);
3730 		}
3731 	}
3732 	/* No free items, allocate another chunk */
3733 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3734 	    VM_ALLOC_WIRED);
3735 	if (m == NULL) {
3736 		if (lockp == NULL) {
3737 			PV_STAT(pc_chunk_tryfail++);
3738 			return (NULL);
3739 		}
3740 		m = reclaim_pv_chunk(pmap, lockp);
3741 		if (m == NULL)
3742 			goto retry;
3743 	}
3744 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3745 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3746 	dump_add_page(m->phys_addr);
3747 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3748 	pc->pc_pmap = pmap;
3749 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
3750 	pc->pc_map[1] = PC_FREE1;
3751 	pc->pc_map[2] = PC_FREE2;
3752 	mtx_lock(&pv_chunks_mutex);
3753 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3754 	mtx_unlock(&pv_chunks_mutex);
3755 	pv = &pc->pc_pventry[0];
3756 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3757 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
3758 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3759 	return (pv);
3760 }
3761 
3762 /*
3763  * Returns the number of one bits within the given PV chunk map.
3764  *
3765  * The erratas for Intel processors state that "POPCNT Instruction May
3766  * Take Longer to Execute Than Expected".  It is believed that the
3767  * issue is the spurious dependency on the destination register.
3768  * Provide a hint to the register rename logic that the destination
3769  * value is overwritten, by clearing it, as suggested in the
3770  * optimization manual.  It should be cheap for unaffected processors
3771  * as well.
3772  *
3773  * Reference numbers for erratas are
3774  * 4th Gen Core: HSD146
3775  * 5th Gen Core: BDM85
3776  * 6th Gen Core: SKL029
3777  */
3778 static int
3779 popcnt_pc_map_pq(uint64_t *map)
3780 {
3781 	u_long result, tmp;
3782 
3783 	__asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
3784 	    "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
3785 	    "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
3786 	    : "=&r" (result), "=&r" (tmp)
3787 	    : "m" (map[0]), "m" (map[1]), "m" (map[2]));
3788 	return (result);
3789 }
3790 
3791 /*
3792  * Ensure that the number of spare PV entries in the specified pmap meets or
3793  * exceeds the given count, "needed".
3794  *
3795  * The given PV list lock may be released.
3796  */
3797 static void
3798 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3799 {
3800 	struct pch new_tail;
3801 	struct pv_chunk *pc;
3802 	vm_page_t m;
3803 	int avail, free;
3804 	bool reclaimed;
3805 
3806 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3807 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3808 
3809 	/*
3810 	 * Newly allocated PV chunks must be stored in a private list until
3811 	 * the required number of PV chunks have been allocated.  Otherwise,
3812 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3813 	 * contrast, these chunks must be added to the pmap upon allocation.
3814 	 */
3815 	TAILQ_INIT(&new_tail);
3816 retry:
3817 	avail = 0;
3818 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3819 #ifndef __POPCNT__
3820 		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
3821 			bit_count((bitstr_t *)pc->pc_map, 0,
3822 			    sizeof(pc->pc_map) * NBBY, &free);
3823 		else
3824 #endif
3825 		free = popcnt_pc_map_pq(pc->pc_map);
3826 		if (free == 0)
3827 			break;
3828 		avail += free;
3829 		if (avail >= needed)
3830 			break;
3831 	}
3832 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
3833 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3834 		    VM_ALLOC_WIRED);
3835 		if (m == NULL) {
3836 			m = reclaim_pv_chunk(pmap, lockp);
3837 			if (m == NULL)
3838 				goto retry;
3839 			reclaimed = true;
3840 		}
3841 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3842 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3843 		dump_add_page(m->phys_addr);
3844 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3845 		pc->pc_pmap = pmap;
3846 		pc->pc_map[0] = PC_FREE0;
3847 		pc->pc_map[1] = PC_FREE1;
3848 		pc->pc_map[2] = PC_FREE2;
3849 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3850 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3851 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3852 
3853 		/*
3854 		 * The reclaim might have freed a chunk from the current pmap.
3855 		 * If that chunk contained available entries, we need to
3856 		 * re-count the number of available entries.
3857 		 */
3858 		if (reclaimed)
3859 			goto retry;
3860 	}
3861 	if (!TAILQ_EMPTY(&new_tail)) {
3862 		mtx_lock(&pv_chunks_mutex);
3863 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3864 		mtx_unlock(&pv_chunks_mutex);
3865 	}
3866 }
3867 
3868 /*
3869  * First find and then remove the pv entry for the specified pmap and virtual
3870  * address from the specified pv list.  Returns the pv entry if found and NULL
3871  * otherwise.  This operation can be performed on pv lists for either 4KB or
3872  * 2MB page mappings.
3873  */
3874 static __inline pv_entry_t
3875 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3876 {
3877 	pv_entry_t pv;
3878 
3879 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3880 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3881 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3882 			pvh->pv_gen++;
3883 			break;
3884 		}
3885 	}
3886 	return (pv);
3887 }
3888 
3889 /*
3890  * After demotion from a 2MB page mapping to 512 4KB page mappings,
3891  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3892  * entries for each of the 4KB page mappings.
3893  */
3894 static void
3895 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3896     struct rwlock **lockp)
3897 {
3898 	struct md_page *pvh;
3899 	struct pv_chunk *pc;
3900 	pv_entry_t pv;
3901 	vm_offset_t va_last;
3902 	vm_page_t m;
3903 	int bit, field;
3904 
3905 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3906 	KASSERT((pa & PDRMASK) == 0,
3907 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3908 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3909 
3910 	/*
3911 	 * Transfer the 2mpage's pv entry for this mapping to the first
3912 	 * page's pv list.  Once this transfer begins, the pv list lock
3913 	 * must not be released until the last pv entry is reinstantiated.
3914 	 */
3915 	pvh = pa_to_pvh(pa);
3916 	va = trunc_2mpage(va);
3917 	pv = pmap_pvh_remove(pvh, pmap, va);
3918 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3919 	m = PHYS_TO_VM_PAGE(pa);
3920 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3921 	m->md.pv_gen++;
3922 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3923 	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3924 	va_last = va + NBPDR - PAGE_SIZE;
3925 	for (;;) {
3926 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3927 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3928 		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3929 		for (field = 0; field < _NPCM; field++) {
3930 			while (pc->pc_map[field]) {
3931 				bit = bsfq(pc->pc_map[field]);
3932 				pc->pc_map[field] &= ~(1ul << bit);
3933 				pv = &pc->pc_pventry[field * 64 + bit];
3934 				va += PAGE_SIZE;
3935 				pv->pv_va = va;
3936 				m++;
3937 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3938 			    ("pmap_pv_demote_pde: page %p is not managed", m));
3939 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3940 				m->md.pv_gen++;
3941 				if (va == va_last)
3942 					goto out;
3943 			}
3944 		}
3945 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3946 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3947 	}
3948 out:
3949 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3950 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3951 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3952 	}
3953 	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3954 	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3955 }
3956 
3957 #if VM_NRESERVLEVEL > 0
3958 /*
3959  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3960  * replace the many pv entries for the 4KB page mappings by a single pv entry
3961  * for the 2MB page mapping.
3962  */
3963 static void
3964 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3965     struct rwlock **lockp)
3966 {
3967 	struct md_page *pvh;
3968 	pv_entry_t pv;
3969 	vm_offset_t va_last;
3970 	vm_page_t m;
3971 
3972 	KASSERT((pa & PDRMASK) == 0,
3973 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3974 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3975 
3976 	/*
3977 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3978 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3979 	 * a transfer avoids the possibility that get_pv_entry() calls
3980 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3981 	 * mappings that is being promoted.
3982 	 */
3983 	m = PHYS_TO_VM_PAGE(pa);
3984 	va = trunc_2mpage(va);
3985 	pv = pmap_pvh_remove(&m->md, pmap, va);
3986 	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3987 	pvh = pa_to_pvh(pa);
3988 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3989 	pvh->pv_gen++;
3990 	/* Free the remaining NPTEPG - 1 pv entries. */
3991 	va_last = va + NBPDR - PAGE_SIZE;
3992 	do {
3993 		m++;
3994 		va += PAGE_SIZE;
3995 		pmap_pvh_free(&m->md, pmap, va);
3996 	} while (va < va_last);
3997 }
3998 #endif /* VM_NRESERVLEVEL > 0 */
3999 
4000 /*
4001  * First find and then destroy the pv entry for the specified pmap and virtual
4002  * address.  This operation can be performed on pv lists for either 4KB or 2MB
4003  * page mappings.
4004  */
4005 static void
4006 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
4007 {
4008 	pv_entry_t pv;
4009 
4010 	pv = pmap_pvh_remove(pvh, pmap, va);
4011 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
4012 	free_pv_entry(pmap, pv);
4013 }
4014 
4015 /*
4016  * Conditionally create the PV entry for a 4KB page mapping if the required
4017  * memory can be allocated without resorting to reclamation.
4018  */
4019 static boolean_t
4020 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
4021     struct rwlock **lockp)
4022 {
4023 	pv_entry_t pv;
4024 
4025 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4026 	/* Pass NULL instead of the lock pointer to disable reclamation. */
4027 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
4028 		pv->pv_va = va;
4029 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4030 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4031 		m->md.pv_gen++;
4032 		return (TRUE);
4033 	} else
4034 		return (FALSE);
4035 }
4036 
4037 /*
4038  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
4039  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
4040  * false if the PV entry cannot be allocated without resorting to reclamation.
4041  */
4042 static bool
4043 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
4044     struct rwlock **lockp)
4045 {
4046 	struct md_page *pvh;
4047 	pv_entry_t pv;
4048 	vm_paddr_t pa;
4049 
4050 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4051 	/* Pass NULL instead of the lock pointer to disable reclamation. */
4052 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
4053 	    NULL : lockp)) == NULL)
4054 		return (false);
4055 	pv->pv_va = va;
4056 	pa = pde & PG_PS_FRAME;
4057 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4058 	pvh = pa_to_pvh(pa);
4059 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4060 	pvh->pv_gen++;
4061 	return (true);
4062 }
4063 
4064 /*
4065  * Fills a page table page with mappings to consecutive physical pages.
4066  */
4067 static void
4068 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
4069 {
4070 	pt_entry_t *pte;
4071 
4072 	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
4073 		*pte = newpte;
4074 		newpte += PAGE_SIZE;
4075 	}
4076 }
4077 
4078 /*
4079  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
4080  * mapping is invalidated.
4081  */
4082 static boolean_t
4083 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
4084 {
4085 	struct rwlock *lock;
4086 	boolean_t rv;
4087 
4088 	lock = NULL;
4089 	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
4090 	if (lock != NULL)
4091 		rw_wunlock(lock);
4092 	return (rv);
4093 }
4094 
4095 static boolean_t
4096 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4097     struct rwlock **lockp)
4098 {
4099 	pd_entry_t newpde, oldpde;
4100 	pt_entry_t *firstpte, newpte;
4101 	pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
4102 	vm_paddr_t mptepa;
4103 	vm_page_t mpte;
4104 	struct spglist free;
4105 	vm_offset_t sva;
4106 	int PG_PTE_CACHE;
4107 
4108 	PG_G = pmap_global_bit(pmap);
4109 	PG_A = pmap_accessed_bit(pmap);
4110 	PG_M = pmap_modified_bit(pmap);
4111 	PG_RW = pmap_rw_bit(pmap);
4112 	PG_V = pmap_valid_bit(pmap);
4113 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4114 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
4115 
4116 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4117 	oldpde = *pde;
4118 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
4119 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
4120 	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
4121 	    NULL) {
4122 		KASSERT((oldpde & PG_W) == 0,
4123 		    ("pmap_demote_pde: page table page for a wired mapping"
4124 		    " is missing"));
4125 
4126 		/*
4127 		 * Invalidate the 2MB page mapping and return "failure" if the
4128 		 * mapping was never accessed or the allocation of the new
4129 		 * page table page fails.  If the 2MB page mapping belongs to
4130 		 * the direct map region of the kernel's address space, then
4131 		 * the page allocation request specifies the highest possible
4132 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
4133 		 * normal.  Page table pages are preallocated for every other
4134 		 * part of the kernel address space, so the direct map region
4135 		 * is the only part of the kernel address space that must be
4136 		 * handled here.
4137 		 */
4138 		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
4139 		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
4140 		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
4141 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
4142 			SLIST_INIT(&free);
4143 			sva = trunc_2mpage(va);
4144 			pmap_remove_pde(pmap, pde, sva, &free, lockp);
4145 			if ((oldpde & PG_G) == 0)
4146 				pmap_invalidate_pde_page(pmap, sva, oldpde);
4147 			vm_page_free_pages_toq(&free, true);
4148 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
4149 			    " in pmap %p", va, pmap);
4150 			return (FALSE);
4151 		}
4152 		if (va < VM_MAXUSER_ADDRESS)
4153 			pmap_resident_count_inc(pmap, 1);
4154 	}
4155 	mptepa = VM_PAGE_TO_PHYS(mpte);
4156 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
4157 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
4158 	KASSERT((oldpde & PG_A) != 0,
4159 	    ("pmap_demote_pde: oldpde is missing PG_A"));
4160 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
4161 	    ("pmap_demote_pde: oldpde is missing PG_M"));
4162 	newpte = oldpde & ~PG_PS;
4163 	newpte = pmap_swap_pat(pmap, newpte);
4164 
4165 	/*
4166 	 * If the page table page is new, initialize it.
4167 	 */
4168 	if (mpte->wire_count == 1) {
4169 		mpte->wire_count = NPTEPG;
4170 		pmap_fill_ptp(firstpte, newpte);
4171 	}
4172 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
4173 	    ("pmap_demote_pde: firstpte and newpte map different physical"
4174 	    " addresses"));
4175 
4176 	/*
4177 	 * If the mapping has changed attributes, update the page table
4178 	 * entries.
4179 	 */
4180 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
4181 		pmap_fill_ptp(firstpte, newpte);
4182 
4183 	/*
4184 	 * The spare PV entries must be reserved prior to demoting the
4185 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
4186 	 * of the PDE and the PV lists will be inconsistent, which can result
4187 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
4188 	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
4189 	 * PV entry for the 2MB page mapping that is being demoted.
4190 	 */
4191 	if ((oldpde & PG_MANAGED) != 0)
4192 		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
4193 
4194 	/*
4195 	 * Demote the mapping.  This pmap is locked.  The old PDE has
4196 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
4197 	 * set.  Thus, there is no danger of a race with another
4198 	 * processor changing the setting of PG_A and/or PG_M between
4199 	 * the read above and the store below.
4200 	 */
4201 	if (workaround_erratum383)
4202 		pmap_update_pde(pmap, va, pde, newpde);
4203 	else
4204 		pde_store(pde, newpde);
4205 
4206 	/*
4207 	 * Invalidate a stale recursive mapping of the page table page.
4208 	 */
4209 	if (va >= VM_MAXUSER_ADDRESS)
4210 		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
4211 
4212 	/*
4213 	 * Demote the PV entry.
4214 	 */
4215 	if ((oldpde & PG_MANAGED) != 0)
4216 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
4217 
4218 	atomic_add_long(&pmap_pde_demotions, 1);
4219 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
4220 	    " in pmap %p", va, pmap);
4221 	return (TRUE);
4222 }
4223 
4224 /*
4225  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
4226  */
4227 static void
4228 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
4229 {
4230 	pd_entry_t newpde;
4231 	vm_paddr_t mptepa;
4232 	vm_page_t mpte;
4233 
4234 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
4235 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4236 	mpte = pmap_remove_pt_page(pmap, va);
4237 	if (mpte == NULL)
4238 		panic("pmap_remove_kernel_pde: Missing pt page.");
4239 
4240 	mptepa = VM_PAGE_TO_PHYS(mpte);
4241 	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
4242 
4243 	/*
4244 	 * Initialize the page table page.
4245 	 */
4246 	pagezero((void *)PHYS_TO_DMAP(mptepa));
4247 
4248 	/*
4249 	 * Demote the mapping.
4250 	 */
4251 	if (workaround_erratum383)
4252 		pmap_update_pde(pmap, va, pde, newpde);
4253 	else
4254 		pde_store(pde, newpde);
4255 
4256 	/*
4257 	 * Invalidate a stale recursive mapping of the page table page.
4258 	 */
4259 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
4260 }
4261 
4262 /*
4263  * pmap_remove_pde: do the things to unmap a superpage in a process
4264  */
4265 static int
4266 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
4267     struct spglist *free, struct rwlock **lockp)
4268 {
4269 	struct md_page *pvh;
4270 	pd_entry_t oldpde;
4271 	vm_offset_t eva, va;
4272 	vm_page_t m, mpte;
4273 	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
4274 
4275 	PG_G = pmap_global_bit(pmap);
4276 	PG_A = pmap_accessed_bit(pmap);
4277 	PG_M = pmap_modified_bit(pmap);
4278 	PG_RW = pmap_rw_bit(pmap);
4279 
4280 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4281 	KASSERT((sva & PDRMASK) == 0,
4282 	    ("pmap_remove_pde: sva is not 2mpage aligned"));
4283 	oldpde = pte_load_clear(pdq);
4284 	if (oldpde & PG_W)
4285 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
4286 	if ((oldpde & PG_G) != 0)
4287 		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
4288 	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
4289 	if (oldpde & PG_MANAGED) {
4290 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
4291 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
4292 		pmap_pvh_free(pvh, pmap, sva);
4293 		eva = sva + NBPDR;
4294 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4295 		    va < eva; va += PAGE_SIZE, m++) {
4296 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
4297 				vm_page_dirty(m);
4298 			if (oldpde & PG_A)
4299 				vm_page_aflag_set(m, PGA_REFERENCED);
4300 			if (TAILQ_EMPTY(&m->md.pv_list) &&
4301 			    TAILQ_EMPTY(&pvh->pv_list))
4302 				vm_page_aflag_clear(m, PGA_WRITEABLE);
4303 			pmap_delayed_invl_page(m);
4304 		}
4305 	}
4306 	if (pmap == kernel_pmap) {
4307 		pmap_remove_kernel_pde(pmap, pdq, sva);
4308 	} else {
4309 		mpte = pmap_remove_pt_page(pmap, sva);
4310 		if (mpte != NULL) {
4311 			pmap_resident_count_dec(pmap, 1);
4312 			KASSERT(mpte->wire_count == NPTEPG,
4313 			    ("pmap_remove_pde: pte page wire count error"));
4314 			mpte->wire_count = 0;
4315 			pmap_add_delayed_free_list(mpte, free, FALSE);
4316 		}
4317 	}
4318 	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
4319 }
4320 
4321 /*
4322  * pmap_remove_pte: do the things to unmap a page in a process
4323  */
4324 static int
4325 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
4326     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
4327 {
4328 	struct md_page *pvh;
4329 	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
4330 	vm_page_t m;
4331 
4332 	PG_A = pmap_accessed_bit(pmap);
4333 	PG_M = pmap_modified_bit(pmap);
4334 	PG_RW = pmap_rw_bit(pmap);
4335 
4336 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4337 	oldpte = pte_load_clear(ptq);
4338 	if (oldpte & PG_W)
4339 		pmap->pm_stats.wired_count -= 1;
4340 	pmap_resident_count_dec(pmap, 1);
4341 	if (oldpte & PG_MANAGED) {
4342 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
4343 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4344 			vm_page_dirty(m);
4345 		if (oldpte & PG_A)
4346 			vm_page_aflag_set(m, PGA_REFERENCED);
4347 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4348 		pmap_pvh_free(&m->md, pmap, va);
4349 		if (TAILQ_EMPTY(&m->md.pv_list) &&
4350 		    (m->flags & PG_FICTITIOUS) == 0) {
4351 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4352 			if (TAILQ_EMPTY(&pvh->pv_list))
4353 				vm_page_aflag_clear(m, PGA_WRITEABLE);
4354 		}
4355 		pmap_delayed_invl_page(m);
4356 	}
4357 	return (pmap_unuse_pt(pmap, va, ptepde, free));
4358 }
4359 
4360 /*
4361  * Remove a single page from a process address space
4362  */
4363 static void
4364 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
4365     struct spglist *free)
4366 {
4367 	struct rwlock *lock;
4368 	pt_entry_t *pte, PG_V;
4369 
4370 	PG_V = pmap_valid_bit(pmap);
4371 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4372 	if ((*pde & PG_V) == 0)
4373 		return;
4374 	pte = pmap_pde_to_pte(pde, va);
4375 	if ((*pte & PG_V) == 0)
4376 		return;
4377 	lock = NULL;
4378 	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
4379 	if (lock != NULL)
4380 		rw_wunlock(lock);
4381 	pmap_invalidate_page(pmap, va);
4382 }
4383 
4384 /*
4385  * Removes the specified range of addresses from the page table page.
4386  */
4387 static bool
4388 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
4389     pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
4390 {
4391 	pt_entry_t PG_G, *pte;
4392 	vm_offset_t va;
4393 	bool anyvalid;
4394 
4395 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4396 	PG_G = pmap_global_bit(pmap);
4397 	anyvalid = false;
4398 	va = eva;
4399 	for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
4400 	    sva += PAGE_SIZE) {
4401 		if (*pte == 0) {
4402 			if (va != eva) {
4403 				pmap_invalidate_range(pmap, va, sva);
4404 				va = eva;
4405 			}
4406 			continue;
4407 		}
4408 		if ((*pte & PG_G) == 0)
4409 			anyvalid = true;
4410 		else if (va == eva)
4411 			va = sva;
4412 		if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
4413 			sva += PAGE_SIZE;
4414 			break;
4415 		}
4416 	}
4417 	if (va != eva)
4418 		pmap_invalidate_range(pmap, va, sva);
4419 	return (anyvalid);
4420 }
4421 
4422 /*
4423  *	Remove the given range of addresses from the specified map.
4424  *
4425  *	It is assumed that the start and end are properly
4426  *	rounded to the page size.
4427  */
4428 void
4429 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4430 {
4431 	struct rwlock *lock;
4432 	vm_offset_t va_next;
4433 	pml4_entry_t *pml4e;
4434 	pdp_entry_t *pdpe;
4435 	pd_entry_t ptpaddr, *pde;
4436 	pt_entry_t PG_G, PG_V;
4437 	struct spglist free;
4438 	int anyvalid;
4439 
4440 	PG_G = pmap_global_bit(pmap);
4441 	PG_V = pmap_valid_bit(pmap);
4442 
4443 	/*
4444 	 * Perform an unsynchronized read.  This is, however, safe.
4445 	 */
4446 	if (pmap->pm_stats.resident_count == 0)
4447 		return;
4448 
4449 	anyvalid = 0;
4450 	SLIST_INIT(&free);
4451 
4452 	pmap_delayed_invl_started();
4453 	PMAP_LOCK(pmap);
4454 
4455 	/*
4456 	 * special handling of removing one page.  a very
4457 	 * common operation and easy to short circuit some
4458 	 * code.
4459 	 */
4460 	if (sva + PAGE_SIZE == eva) {
4461 		pde = pmap_pde(pmap, sva);
4462 		if (pde && (*pde & PG_PS) == 0) {
4463 			pmap_remove_page(pmap, sva, pde, &free);
4464 			goto out;
4465 		}
4466 	}
4467 
4468 	lock = NULL;
4469 	for (; sva < eva; sva = va_next) {
4470 
4471 		if (pmap->pm_stats.resident_count == 0)
4472 			break;
4473 
4474 		pml4e = pmap_pml4e(pmap, sva);
4475 		if ((*pml4e & PG_V) == 0) {
4476 			va_next = (sva + NBPML4) & ~PML4MASK;
4477 			if (va_next < sva)
4478 				va_next = eva;
4479 			continue;
4480 		}
4481 
4482 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4483 		if ((*pdpe & PG_V) == 0) {
4484 			va_next = (sva + NBPDP) & ~PDPMASK;
4485 			if (va_next < sva)
4486 				va_next = eva;
4487 			continue;
4488 		}
4489 
4490 		/*
4491 		 * Calculate index for next page table.
4492 		 */
4493 		va_next = (sva + NBPDR) & ~PDRMASK;
4494 		if (va_next < sva)
4495 			va_next = eva;
4496 
4497 		pde = pmap_pdpe_to_pde(pdpe, sva);
4498 		ptpaddr = *pde;
4499 
4500 		/*
4501 		 * Weed out invalid mappings.
4502 		 */
4503 		if (ptpaddr == 0)
4504 			continue;
4505 
4506 		/*
4507 		 * Check for large page.
4508 		 */
4509 		if ((ptpaddr & PG_PS) != 0) {
4510 			/*
4511 			 * Are we removing the entire large page?  If not,
4512 			 * demote the mapping and fall through.
4513 			 */
4514 			if (sva + NBPDR == va_next && eva >= va_next) {
4515 				/*
4516 				 * The TLB entry for a PG_G mapping is
4517 				 * invalidated by pmap_remove_pde().
4518 				 */
4519 				if ((ptpaddr & PG_G) == 0)
4520 					anyvalid = 1;
4521 				pmap_remove_pde(pmap, pde, sva, &free, &lock);
4522 				continue;
4523 			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
4524 			    &lock)) {
4525 				/* The large page mapping was destroyed. */
4526 				continue;
4527 			} else
4528 				ptpaddr = *pde;
4529 		}
4530 
4531 		/*
4532 		 * Limit our scan to either the end of the va represented
4533 		 * by the current page table page, or to the end of the
4534 		 * range being removed.
4535 		 */
4536 		if (va_next > eva)
4537 			va_next = eva;
4538 
4539 		if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
4540 			anyvalid = 1;
4541 	}
4542 	if (lock != NULL)
4543 		rw_wunlock(lock);
4544 out:
4545 	if (anyvalid)
4546 		pmap_invalidate_all(pmap);
4547 	pmap_pkru_on_remove(pmap, sva, eva);
4548 	PMAP_UNLOCK(pmap);
4549 	pmap_delayed_invl_finished();
4550 	vm_page_free_pages_toq(&free, true);
4551 }
4552 
4553 /*
4554  *	Routine:	pmap_remove_all
4555  *	Function:
4556  *		Removes this physical page from
4557  *		all physical maps in which it resides.
4558  *		Reflects back modify bits to the pager.
4559  *
4560  *	Notes:
4561  *		Original versions of this routine were very
4562  *		inefficient because they iteratively called
4563  *		pmap_remove (slow...)
4564  */
4565 
4566 void
4567 pmap_remove_all(vm_page_t m)
4568 {
4569 	struct md_page *pvh;
4570 	pv_entry_t pv;
4571 	pmap_t pmap;
4572 	struct rwlock *lock;
4573 	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
4574 	pd_entry_t *pde;
4575 	vm_offset_t va;
4576 	struct spglist free;
4577 	int pvh_gen, md_gen;
4578 
4579 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4580 	    ("pmap_remove_all: page %p is not managed", m));
4581 	SLIST_INIT(&free);
4582 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4583 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4584 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4585 retry:
4586 	rw_wlock(lock);
4587 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4588 		pmap = PV_PMAP(pv);
4589 		if (!PMAP_TRYLOCK(pmap)) {
4590 			pvh_gen = pvh->pv_gen;
4591 			rw_wunlock(lock);
4592 			PMAP_LOCK(pmap);
4593 			rw_wlock(lock);
4594 			if (pvh_gen != pvh->pv_gen) {
4595 				rw_wunlock(lock);
4596 				PMAP_UNLOCK(pmap);
4597 				goto retry;
4598 			}
4599 		}
4600 		va = pv->pv_va;
4601 		pde = pmap_pde(pmap, va);
4602 		(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
4603 		PMAP_UNLOCK(pmap);
4604 	}
4605 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4606 		pmap = PV_PMAP(pv);
4607 		if (!PMAP_TRYLOCK(pmap)) {
4608 			pvh_gen = pvh->pv_gen;
4609 			md_gen = m->md.pv_gen;
4610 			rw_wunlock(lock);
4611 			PMAP_LOCK(pmap);
4612 			rw_wlock(lock);
4613 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4614 				rw_wunlock(lock);
4615 				PMAP_UNLOCK(pmap);
4616 				goto retry;
4617 			}
4618 		}
4619 		PG_A = pmap_accessed_bit(pmap);
4620 		PG_M = pmap_modified_bit(pmap);
4621 		PG_RW = pmap_rw_bit(pmap);
4622 		pmap_resident_count_dec(pmap, 1);
4623 		pde = pmap_pde(pmap, pv->pv_va);
4624 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
4625 		    " a 2mpage in page %p's pv list", m));
4626 		pte = pmap_pde_to_pte(pde, pv->pv_va);
4627 		tpte = pte_load_clear(pte);
4628 		if (tpte & PG_W)
4629 			pmap->pm_stats.wired_count--;
4630 		if (tpte & PG_A)
4631 			vm_page_aflag_set(m, PGA_REFERENCED);
4632 
4633 		/*
4634 		 * Update the vm_page_t clean and reference bits.
4635 		 */
4636 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4637 			vm_page_dirty(m);
4638 		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
4639 		pmap_invalidate_page(pmap, pv->pv_va);
4640 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4641 		m->md.pv_gen++;
4642 		free_pv_entry(pmap, pv);
4643 		PMAP_UNLOCK(pmap);
4644 	}
4645 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4646 	rw_wunlock(lock);
4647 	pmap_delayed_invl_wait(m);
4648 	vm_page_free_pages_toq(&free, true);
4649 }
4650 
4651 /*
4652  * pmap_protect_pde: do the things to protect a 2mpage in a process
4653  */
4654 static boolean_t
4655 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
4656 {
4657 	pd_entry_t newpde, oldpde;
4658 	vm_offset_t eva, va;
4659 	vm_page_t m;
4660 	boolean_t anychanged;
4661 	pt_entry_t PG_G, PG_M, PG_RW;
4662 
4663 	PG_G = pmap_global_bit(pmap);
4664 	PG_M = pmap_modified_bit(pmap);
4665 	PG_RW = pmap_rw_bit(pmap);
4666 
4667 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4668 	KASSERT((sva & PDRMASK) == 0,
4669 	    ("pmap_protect_pde: sva is not 2mpage aligned"));
4670 	anychanged = FALSE;
4671 retry:
4672 	oldpde = newpde = *pde;
4673 	if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4674 	    (PG_MANAGED | PG_M | PG_RW)) {
4675 		eva = sva + NBPDR;
4676 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4677 		    va < eva; va += PAGE_SIZE, m++)
4678 			vm_page_dirty(m);
4679 	}
4680 	if ((prot & VM_PROT_WRITE) == 0)
4681 		newpde &= ~(PG_RW | PG_M);
4682 	if ((prot & VM_PROT_EXECUTE) == 0)
4683 		newpde |= pg_nx;
4684 	if (newpde != oldpde) {
4685 		/*
4686 		 * As an optimization to future operations on this PDE, clear
4687 		 * PG_PROMOTED.  The impending invalidation will remove any
4688 		 * lingering 4KB page mappings from the TLB.
4689 		 */
4690 		if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
4691 			goto retry;
4692 		if ((oldpde & PG_G) != 0)
4693 			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
4694 		else
4695 			anychanged = TRUE;
4696 	}
4697 	return (anychanged);
4698 }
4699 
4700 /*
4701  *	Set the physical protection on the
4702  *	specified range of this map as requested.
4703  */
4704 void
4705 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4706 {
4707 	vm_offset_t va_next;
4708 	pml4_entry_t *pml4e;
4709 	pdp_entry_t *pdpe;
4710 	pd_entry_t ptpaddr, *pde;
4711 	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
4712 	boolean_t anychanged;
4713 
4714 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4715 	if (prot == VM_PROT_NONE) {
4716 		pmap_remove(pmap, sva, eva);
4717 		return;
4718 	}
4719 
4720 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4721 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
4722 		return;
4723 
4724 	PG_G = pmap_global_bit(pmap);
4725 	PG_M = pmap_modified_bit(pmap);
4726 	PG_V = pmap_valid_bit(pmap);
4727 	PG_RW = pmap_rw_bit(pmap);
4728 	anychanged = FALSE;
4729 
4730 	/*
4731 	 * Although this function delays and batches the invalidation
4732 	 * of stale TLB entries, it does not need to call
4733 	 * pmap_delayed_invl_started() and
4734 	 * pmap_delayed_invl_finished(), because it does not
4735 	 * ordinarily destroy mappings.  Stale TLB entries from
4736 	 * protection-only changes need only be invalidated before the
4737 	 * pmap lock is released, because protection-only changes do
4738 	 * not destroy PV entries.  Even operations that iterate over
4739 	 * a physical page's PV list of mappings, like
4740 	 * pmap_remove_write(), acquire the pmap lock for each
4741 	 * mapping.  Consequently, for protection-only changes, the
4742 	 * pmap lock suffices to synchronize both page table and TLB
4743 	 * updates.
4744 	 *
4745 	 * This function only destroys a mapping if pmap_demote_pde()
4746 	 * fails.  In that case, stale TLB entries are immediately
4747 	 * invalidated.
4748 	 */
4749 
4750 	PMAP_LOCK(pmap);
4751 	for (; sva < eva; sva = va_next) {
4752 
4753 		pml4e = pmap_pml4e(pmap, sva);
4754 		if ((*pml4e & PG_V) == 0) {
4755 			va_next = (sva + NBPML4) & ~PML4MASK;
4756 			if (va_next < sva)
4757 				va_next = eva;
4758 			continue;
4759 		}
4760 
4761 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4762 		if ((*pdpe & PG_V) == 0) {
4763 			va_next = (sva + NBPDP) & ~PDPMASK;
4764 			if (va_next < sva)
4765 				va_next = eva;
4766 			continue;
4767 		}
4768 
4769 		va_next = (sva + NBPDR) & ~PDRMASK;
4770 		if (va_next < sva)
4771 			va_next = eva;
4772 
4773 		pde = pmap_pdpe_to_pde(pdpe, sva);
4774 		ptpaddr = *pde;
4775 
4776 		/*
4777 		 * Weed out invalid mappings.
4778 		 */
4779 		if (ptpaddr == 0)
4780 			continue;
4781 
4782 		/*
4783 		 * Check for large page.
4784 		 */
4785 		if ((ptpaddr & PG_PS) != 0) {
4786 			/*
4787 			 * Are we protecting the entire large page?  If not,
4788 			 * demote the mapping and fall through.
4789 			 */
4790 			if (sva + NBPDR == va_next && eva >= va_next) {
4791 				/*
4792 				 * The TLB entry for a PG_G mapping is
4793 				 * invalidated by pmap_protect_pde().
4794 				 */
4795 				if (pmap_protect_pde(pmap, pde, sva, prot))
4796 					anychanged = TRUE;
4797 				continue;
4798 			} else if (!pmap_demote_pde(pmap, pde, sva)) {
4799 				/*
4800 				 * The large page mapping was destroyed.
4801 				 */
4802 				continue;
4803 			}
4804 		}
4805 
4806 		if (va_next > eva)
4807 			va_next = eva;
4808 
4809 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4810 		    sva += PAGE_SIZE) {
4811 			pt_entry_t obits, pbits;
4812 			vm_page_t m;
4813 
4814 retry:
4815 			obits = pbits = *pte;
4816 			if ((pbits & PG_V) == 0)
4817 				continue;
4818 
4819 			if ((prot & VM_PROT_WRITE) == 0) {
4820 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4821 				    (PG_MANAGED | PG_M | PG_RW)) {
4822 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4823 					vm_page_dirty(m);
4824 				}
4825 				pbits &= ~(PG_RW | PG_M);
4826 			}
4827 			if ((prot & VM_PROT_EXECUTE) == 0)
4828 				pbits |= pg_nx;
4829 
4830 			if (pbits != obits) {
4831 				if (!atomic_cmpset_long(pte, obits, pbits))
4832 					goto retry;
4833 				if (obits & PG_G)
4834 					pmap_invalidate_page(pmap, sva);
4835 				else
4836 					anychanged = TRUE;
4837 			}
4838 		}
4839 	}
4840 	if (anychanged)
4841 		pmap_invalidate_all(pmap);
4842 	PMAP_UNLOCK(pmap);
4843 }
4844 
4845 #if VM_NRESERVLEVEL > 0
4846 /*
4847  * Tries to promote the 512, contiguous 4KB page mappings that are within a
4848  * single page table page (PTP) to a single 2MB page mapping.  For promotion
4849  * to occur, two conditions must be met: (1) the 4KB page mappings must map
4850  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4851  * identical characteristics.
4852  */
4853 static void
4854 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4855     struct rwlock **lockp)
4856 {
4857 	pd_entry_t newpde;
4858 	pt_entry_t *firstpte, oldpte, pa, *pte;
4859 	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
4860 	vm_page_t mpte;
4861 	int PG_PTE_CACHE;
4862 
4863 	PG_A = pmap_accessed_bit(pmap);
4864 	PG_G = pmap_global_bit(pmap);
4865 	PG_M = pmap_modified_bit(pmap);
4866 	PG_V = pmap_valid_bit(pmap);
4867 	PG_RW = pmap_rw_bit(pmap);
4868 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
4869 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4870 
4871 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4872 
4873 	/*
4874 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
4875 	 * either invalid, unused, or does not map the first 4KB physical page
4876 	 * within a 2MB page.
4877 	 */
4878 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4879 setpde:
4880 	newpde = *firstpte;
4881 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4882 		atomic_add_long(&pmap_pde_p_failures, 1);
4883 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4884 		    " in pmap %p", va, pmap);
4885 		return;
4886 	}
4887 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4888 		/*
4889 		 * When PG_M is already clear, PG_RW can be cleared without
4890 		 * a TLB invalidation.
4891 		 */
4892 		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4893 			goto setpde;
4894 		newpde &= ~PG_RW;
4895 	}
4896 
4897 	/*
4898 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4899 	 * PTE maps an unexpected 4KB physical page or does not have identical
4900 	 * characteristics to the first PTE.
4901 	 */
4902 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4903 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4904 setpte:
4905 		oldpte = *pte;
4906 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4907 			atomic_add_long(&pmap_pde_p_failures, 1);
4908 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4909 			    " in pmap %p", va, pmap);
4910 			return;
4911 		}
4912 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4913 			/*
4914 			 * When PG_M is already clear, PG_RW can be cleared
4915 			 * without a TLB invalidation.
4916 			 */
4917 			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4918 				goto setpte;
4919 			oldpte &= ~PG_RW;
4920 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4921 			    " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
4922 			    (va & ~PDRMASK), pmap);
4923 		}
4924 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4925 			atomic_add_long(&pmap_pde_p_failures, 1);
4926 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4927 			    " in pmap %p", va, pmap);
4928 			return;
4929 		}
4930 		pa -= PAGE_SIZE;
4931 	}
4932 
4933 	/*
4934 	 * Save the page table page in its current state until the PDE
4935 	 * mapping the superpage is demoted by pmap_demote_pde() or
4936 	 * destroyed by pmap_remove_pde().
4937 	 */
4938 	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4939 	KASSERT(mpte >= vm_page_array &&
4940 	    mpte < &vm_page_array[vm_page_array_size],
4941 	    ("pmap_promote_pde: page table page is out of range"));
4942 	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4943 	    ("pmap_promote_pde: page table page's pindex is wrong"));
4944 	if (pmap_insert_pt_page(pmap, mpte)) {
4945 		atomic_add_long(&pmap_pde_p_failures, 1);
4946 		CTR2(KTR_PMAP,
4947 		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4948 		    pmap);
4949 		return;
4950 	}
4951 
4952 	/*
4953 	 * Promote the pv entries.
4954 	 */
4955 	if ((newpde & PG_MANAGED) != 0)
4956 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4957 
4958 	/*
4959 	 * Propagate the PAT index to its proper position.
4960 	 */
4961 	newpde = pmap_swap_pat(pmap, newpde);
4962 
4963 	/*
4964 	 * Map the superpage.
4965 	 */
4966 	if (workaround_erratum383)
4967 		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4968 	else
4969 		pde_store(pde, PG_PROMOTED | PG_PS | newpde);
4970 
4971 	atomic_add_long(&pmap_pde_promotions, 1);
4972 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4973 	    " in pmap %p", va, pmap);
4974 }
4975 #endif /* VM_NRESERVLEVEL > 0 */
4976 
4977 /*
4978  *	Insert the given physical page (p) at
4979  *	the specified virtual address (v) in the
4980  *	target physical map with the protection requested.
4981  *
4982  *	If specified, the page will be wired down, meaning
4983  *	that the related pte can not be reclaimed.
4984  *
4985  *	NB:  This is the only routine which MAY NOT lazy-evaluate
4986  *	or lose information.  That is, this routine must actually
4987  *	insert this page into the given map NOW.
4988  *
4989  *	When destroying both a page table and PV entry, this function
4990  *	performs the TLB invalidation before releasing the PV list
4991  *	lock, so we do not need pmap_delayed_invl_page() calls here.
4992  */
4993 int
4994 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4995     u_int flags, int8_t psind)
4996 {
4997 	struct rwlock *lock;
4998 	pd_entry_t *pde;
4999 	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
5000 	pt_entry_t newpte, origpte;
5001 	pv_entry_t pv;
5002 	vm_paddr_t opa, pa;
5003 	vm_page_t mpte, om;
5004 	int rv;
5005 	boolean_t nosleep;
5006 
5007 	PG_A = pmap_accessed_bit(pmap);
5008 	PG_G = pmap_global_bit(pmap);
5009 	PG_M = pmap_modified_bit(pmap);
5010 	PG_V = pmap_valid_bit(pmap);
5011 	PG_RW = pmap_rw_bit(pmap);
5012 
5013 	va = trunc_page(va);
5014 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
5015 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
5016 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
5017 	    va));
5018 	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
5019 	    va >= kmi.clean_eva,
5020 	    ("pmap_enter: managed mapping within the clean submap"));
5021 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
5022 		VM_OBJECT_ASSERT_LOCKED(m->object);
5023 	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
5024 	    ("pmap_enter: flags %u has reserved bits set", flags));
5025 	pa = VM_PAGE_TO_PHYS(m);
5026 	newpte = (pt_entry_t)(pa | PG_A | PG_V);
5027 	if ((flags & VM_PROT_WRITE) != 0)
5028 		newpte |= PG_M;
5029 	if ((prot & VM_PROT_WRITE) != 0)
5030 		newpte |= PG_RW;
5031 	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
5032 	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
5033 	if ((prot & VM_PROT_EXECUTE) == 0)
5034 		newpte |= pg_nx;
5035 	if ((flags & PMAP_ENTER_WIRED) != 0)
5036 		newpte |= PG_W;
5037 	if (va < VM_MAXUSER_ADDRESS)
5038 		newpte |= PG_U;
5039 	if (pmap == kernel_pmap)
5040 		newpte |= PG_G;
5041 	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
5042 
5043 	/*
5044 	 * Set modified bit gratuitously for writeable mappings if
5045 	 * the page is unmanaged. We do not want to take a fault
5046 	 * to do the dirty bit accounting for these mappings.
5047 	 */
5048 	if ((m->oflags & VPO_UNMANAGED) != 0) {
5049 		if ((newpte & PG_RW) != 0)
5050 			newpte |= PG_M;
5051 	} else
5052 		newpte |= PG_MANAGED;
5053 
5054 	lock = NULL;
5055 	PMAP_LOCK(pmap);
5056 	if (psind == 1) {
5057 		/* Assert the required virtual and physical alignment. */
5058 		KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
5059 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5060 		rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
5061 		goto out;
5062 	}
5063 	mpte = NULL;
5064 
5065 	/*
5066 	 * In the case that a page table page is not
5067 	 * resident, we are creating it here.
5068 	 */
5069 retry:
5070 	pde = pmap_pde(pmap, va);
5071 	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
5072 	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
5073 		pte = pmap_pde_to_pte(pde, va);
5074 		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
5075 			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
5076 			mpte->wire_count++;
5077 		}
5078 	} else if (va < VM_MAXUSER_ADDRESS) {
5079 		/*
5080 		 * Here if the pte page isn't mapped, or if it has been
5081 		 * deallocated.
5082 		 */
5083 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5084 		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
5085 		    nosleep ? NULL : &lock);
5086 		if (mpte == NULL && nosleep) {
5087 			rv = KERN_RESOURCE_SHORTAGE;
5088 			goto out;
5089 		}
5090 		goto retry;
5091 	} else
5092 		panic("pmap_enter: invalid page directory va=%#lx", va);
5093 
5094 	origpte = *pte;
5095 	pv = NULL;
5096 	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
5097 		newpte |= pmap_pkru_get(pmap, va);
5098 
5099 	/*
5100 	 * Is the specified virtual address already mapped?
5101 	 */
5102 	if ((origpte & PG_V) != 0) {
5103 		/*
5104 		 * Wiring change, just update stats. We don't worry about
5105 		 * wiring PT pages as they remain resident as long as there
5106 		 * are valid mappings in them. Hence, if a user page is wired,
5107 		 * the PT page will be also.
5108 		 */
5109 		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
5110 			pmap->pm_stats.wired_count++;
5111 		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
5112 			pmap->pm_stats.wired_count--;
5113 
5114 		/*
5115 		 * Remove the extra PT page reference.
5116 		 */
5117 		if (mpte != NULL) {
5118 			mpte->wire_count--;
5119 			KASSERT(mpte->wire_count > 0,
5120 			    ("pmap_enter: missing reference to page table page,"
5121 			     " va: 0x%lx", va));
5122 		}
5123 
5124 		/*
5125 		 * Has the physical page changed?
5126 		 */
5127 		opa = origpte & PG_FRAME;
5128 		if (opa == pa) {
5129 			/*
5130 			 * No, might be a protection or wiring change.
5131 			 */
5132 			if ((origpte & PG_MANAGED) != 0 &&
5133 			    (newpte & PG_RW) != 0)
5134 				vm_page_aflag_set(m, PGA_WRITEABLE);
5135 			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
5136 				goto unchanged;
5137 			goto validate;
5138 		}
5139 
5140 		/*
5141 		 * The physical page has changed.  Temporarily invalidate
5142 		 * the mapping.  This ensures that all threads sharing the
5143 		 * pmap keep a consistent view of the mapping, which is
5144 		 * necessary for the correct handling of COW faults.  It
5145 		 * also permits reuse of the old mapping's PV entry,
5146 		 * avoiding an allocation.
5147 		 *
5148 		 * For consistency, handle unmanaged mappings the same way.
5149 		 */
5150 		origpte = pte_load_clear(pte);
5151 		KASSERT((origpte & PG_FRAME) == opa,
5152 		    ("pmap_enter: unexpected pa update for %#lx", va));
5153 		if ((origpte & PG_MANAGED) != 0) {
5154 			om = PHYS_TO_VM_PAGE(opa);
5155 
5156 			/*
5157 			 * The pmap lock is sufficient to synchronize with
5158 			 * concurrent calls to pmap_page_test_mappings() and
5159 			 * pmap_ts_referenced().
5160 			 */
5161 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5162 				vm_page_dirty(om);
5163 			if ((origpte & PG_A) != 0)
5164 				vm_page_aflag_set(om, PGA_REFERENCED);
5165 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
5166 			pv = pmap_pvh_remove(&om->md, pmap, va);
5167 			KASSERT(pv != NULL,
5168 			    ("pmap_enter: no PV entry for %#lx", va));
5169 			if ((newpte & PG_MANAGED) == 0)
5170 				free_pv_entry(pmap, pv);
5171 			if ((om->aflags & PGA_WRITEABLE) != 0 &&
5172 			    TAILQ_EMPTY(&om->md.pv_list) &&
5173 			    ((om->flags & PG_FICTITIOUS) != 0 ||
5174 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
5175 				vm_page_aflag_clear(om, PGA_WRITEABLE);
5176 		}
5177 		if ((origpte & PG_A) != 0)
5178 			pmap_invalidate_page(pmap, va);
5179 		origpte = 0;
5180 	} else {
5181 		/*
5182 		 * Increment the counters.
5183 		 */
5184 		if ((newpte & PG_W) != 0)
5185 			pmap->pm_stats.wired_count++;
5186 		pmap_resident_count_inc(pmap, 1);
5187 	}
5188 
5189 	/*
5190 	 * Enter on the PV list if part of our managed memory.
5191 	 */
5192 	if ((newpte & PG_MANAGED) != 0) {
5193 		if (pv == NULL) {
5194 			pv = get_pv_entry(pmap, &lock);
5195 			pv->pv_va = va;
5196 		}
5197 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
5198 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5199 		m->md.pv_gen++;
5200 		if ((newpte & PG_RW) != 0)
5201 			vm_page_aflag_set(m, PGA_WRITEABLE);
5202 	}
5203 
5204 	/*
5205 	 * Update the PTE.
5206 	 */
5207 	if ((origpte & PG_V) != 0) {
5208 validate:
5209 		origpte = pte_load_store(pte, newpte);
5210 		KASSERT((origpte & PG_FRAME) == pa,
5211 		    ("pmap_enter: unexpected pa update for %#lx", va));
5212 		if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
5213 		    (PG_M | PG_RW)) {
5214 			if ((origpte & PG_MANAGED) != 0)
5215 				vm_page_dirty(m);
5216 
5217 			/*
5218 			 * Although the PTE may still have PG_RW set, TLB
5219 			 * invalidation may nonetheless be required because
5220 			 * the PTE no longer has PG_M set.
5221 			 */
5222 		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
5223 			/*
5224 			 * This PTE change does not require TLB invalidation.
5225 			 */
5226 			goto unchanged;
5227 		}
5228 		if ((origpte & PG_A) != 0)
5229 			pmap_invalidate_page(pmap, va);
5230 	} else
5231 		pte_store(pte, newpte);
5232 
5233 unchanged:
5234 
5235 #if VM_NRESERVLEVEL > 0
5236 	/*
5237 	 * If both the page table page and the reservation are fully
5238 	 * populated, then attempt promotion.
5239 	 */
5240 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
5241 	    pmap_ps_enabled(pmap) &&
5242 	    (m->flags & PG_FICTITIOUS) == 0 &&
5243 	    vm_reserv_level_iffullpop(m) == 0)
5244 		pmap_promote_pde(pmap, pde, va, &lock);
5245 #endif
5246 
5247 	rv = KERN_SUCCESS;
5248 out:
5249 	if (lock != NULL)
5250 		rw_wunlock(lock);
5251 	PMAP_UNLOCK(pmap);
5252 	return (rv);
5253 }
5254 
5255 /*
5256  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
5257  * if successful.  Returns false if (1) a page table page cannot be allocated
5258  * without sleeping, (2) a mapping already exists at the specified virtual
5259  * address, or (3) a PV entry cannot be allocated without reclaiming another
5260  * PV entry.
5261  */
5262 static bool
5263 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5264     struct rwlock **lockp)
5265 {
5266 	pd_entry_t newpde;
5267 	pt_entry_t PG_V;
5268 
5269 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5270 	PG_V = pmap_valid_bit(pmap);
5271 	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
5272 	    PG_PS | PG_V;
5273 	if ((m->oflags & VPO_UNMANAGED) == 0)
5274 		newpde |= PG_MANAGED;
5275 	if ((prot & VM_PROT_EXECUTE) == 0)
5276 		newpde |= pg_nx;
5277 	if (va < VM_MAXUSER_ADDRESS)
5278 		newpde |= PG_U;
5279 	return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
5280 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
5281 	    KERN_SUCCESS);
5282 }
5283 
5284 /*
5285  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
5286  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
5287  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
5288  * a mapping already exists at the specified virtual address.  Returns
5289  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
5290  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
5291  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
5292  *
5293  * The parameter "m" is only used when creating a managed, writeable mapping.
5294  */
5295 static int
5296 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
5297     vm_page_t m, struct rwlock **lockp)
5298 {
5299 	struct spglist free;
5300 	pd_entry_t oldpde, *pde;
5301 	pt_entry_t PG_G, PG_RW, PG_V;
5302 	vm_page_t mt, pdpg;
5303 
5304 	PG_G = pmap_global_bit(pmap);
5305 	PG_RW = pmap_rw_bit(pmap);
5306 	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
5307 	    ("pmap_enter_pde: newpde is missing PG_M"));
5308 	PG_V = pmap_valid_bit(pmap);
5309 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5310 
5311 	if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
5312 	    NULL : lockp)) == NULL) {
5313 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5314 		    " in pmap %p", va, pmap);
5315 		return (KERN_RESOURCE_SHORTAGE);
5316 	}
5317 
5318 	/*
5319 	 * If pkru is not same for the whole pde range, return failure
5320 	 * and let vm_fault() cope.  Check after pde allocation, since
5321 	 * it could sleep.
5322 	 */
5323 	if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
5324 		SLIST_INIT(&free);
5325 		if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
5326 			pmap_invalidate_page(pmap, va);
5327 			vm_page_free_pages_toq(&free, true);
5328 		}
5329 		return (KERN_FAILURE);
5330 	}
5331 	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
5332 		newpde &= ~X86_PG_PKU_MASK;
5333 		newpde |= pmap_pkru_get(pmap, va);
5334 	}
5335 
5336 	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
5337 	pde = &pde[pmap_pde_index(va)];
5338 	oldpde = *pde;
5339 	if ((oldpde & PG_V) != 0) {
5340 		KASSERT(pdpg->wire_count > 1,
5341 		    ("pmap_enter_pde: pdpg's wire count is too low"));
5342 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5343 			pdpg->wire_count--;
5344 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5345 			    " in pmap %p", va, pmap);
5346 			return (KERN_FAILURE);
5347 		}
5348 		/* Break the existing mapping(s). */
5349 		SLIST_INIT(&free);
5350 		if ((oldpde & PG_PS) != 0) {
5351 			/*
5352 			 * The reference to the PD page that was acquired by
5353 			 * pmap_allocpde() ensures that it won't be freed.
5354 			 * However, if the PDE resulted from a promotion, then
5355 			 * a reserved PT page could be freed.
5356 			 */
5357 			(void)pmap_remove_pde(pmap, pde, va, &free, lockp);
5358 			if ((oldpde & PG_G) == 0)
5359 				pmap_invalidate_pde_page(pmap, va, oldpde);
5360 		} else {
5361 			pmap_delayed_invl_started();
5362 			if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
5363 			    lockp))
5364 		               pmap_invalidate_all(pmap);
5365 			pmap_delayed_invl_finished();
5366 		}
5367 		vm_page_free_pages_toq(&free, true);
5368 		if (va >= VM_MAXUSER_ADDRESS) {
5369 			mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
5370 			if (pmap_insert_pt_page(pmap, mt)) {
5371 				/*
5372 				 * XXX Currently, this can't happen because
5373 				 * we do not perform pmap_enter(psind == 1)
5374 				 * on the kernel pmap.
5375 				 */
5376 				panic("pmap_enter_pde: trie insert failed");
5377 			}
5378 		} else
5379 			KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
5380 			    pde));
5381 	}
5382 	if ((newpde & PG_MANAGED) != 0) {
5383 		/*
5384 		 * Abort this mapping if its PV entry could not be created.
5385 		 */
5386 		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
5387 			SLIST_INIT(&free);
5388 			if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
5389 				/*
5390 				 * Although "va" is not mapped, paging-
5391 				 * structure caches could nonetheless have
5392 				 * entries that refer to the freed page table
5393 				 * pages.  Invalidate those entries.
5394 				 */
5395 				pmap_invalidate_page(pmap, va);
5396 				vm_page_free_pages_toq(&free, true);
5397 			}
5398 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5399 			    " in pmap %p", va, pmap);
5400 			return (KERN_RESOURCE_SHORTAGE);
5401 		}
5402 		if ((newpde & PG_RW) != 0) {
5403 			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5404 				vm_page_aflag_set(mt, PGA_WRITEABLE);
5405 		}
5406 	}
5407 
5408 	/*
5409 	 * Increment counters.
5410 	 */
5411 	if ((newpde & PG_W) != 0)
5412 		pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
5413 	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
5414 
5415 	/*
5416 	 * Map the superpage.  (This is not a promoted mapping; there will not
5417 	 * be any lingering 4KB page mappings in the TLB.)
5418 	 */
5419 	pde_store(pde, newpde);
5420 
5421 	atomic_add_long(&pmap_pde_mappings, 1);
5422 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
5423 	    " in pmap %p", va, pmap);
5424 	return (KERN_SUCCESS);
5425 }
5426 
5427 /*
5428  * Maps a sequence of resident pages belonging to the same object.
5429  * The sequence begins with the given page m_start.  This page is
5430  * mapped at the given virtual address start.  Each subsequent page is
5431  * mapped at a virtual address that is offset from start by the same
5432  * amount as the page is offset from m_start within the object.  The
5433  * last page in the sequence is the page with the largest offset from
5434  * m_start that can be mapped at a virtual address less than the given
5435  * virtual address end.  Not every virtual page between start and end
5436  * is mapped; only those for which a resident page exists with the
5437  * corresponding offset from m_start are mapped.
5438  */
5439 void
5440 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
5441     vm_page_t m_start, vm_prot_t prot)
5442 {
5443 	struct rwlock *lock;
5444 	vm_offset_t va;
5445 	vm_page_t m, mpte;
5446 	vm_pindex_t diff, psize;
5447 
5448 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
5449 
5450 	psize = atop(end - start);
5451 	mpte = NULL;
5452 	m = m_start;
5453 	lock = NULL;
5454 	PMAP_LOCK(pmap);
5455 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
5456 		va = start + ptoa(diff);
5457 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
5458 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
5459 		    pmap_enter_2mpage(pmap, va, m, prot, &lock))
5460 			m = &m[NBPDR / PAGE_SIZE - 1];
5461 		else
5462 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
5463 			    mpte, &lock);
5464 		m = TAILQ_NEXT(m, listq);
5465 	}
5466 	if (lock != NULL)
5467 		rw_wunlock(lock);
5468 	PMAP_UNLOCK(pmap);
5469 }
5470 
5471 /*
5472  * this code makes some *MAJOR* assumptions:
5473  * 1. Current pmap & pmap exists.
5474  * 2. Not wired.
5475  * 3. Read access.
5476  * 4. No page table pages.
5477  * but is *MUCH* faster than pmap_enter...
5478  */
5479 
5480 void
5481 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
5482 {
5483 	struct rwlock *lock;
5484 
5485 	lock = NULL;
5486 	PMAP_LOCK(pmap);
5487 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5488 	if (lock != NULL)
5489 		rw_wunlock(lock);
5490 	PMAP_UNLOCK(pmap);
5491 }
5492 
5493 static vm_page_t
5494 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5495     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5496 {
5497 	struct spglist free;
5498 	pt_entry_t newpte, *pte, PG_V;
5499 
5500 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
5501 	    (m->oflags & VPO_UNMANAGED) != 0,
5502 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
5503 	PG_V = pmap_valid_bit(pmap);
5504 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5505 
5506 	/*
5507 	 * In the case that a page table page is not
5508 	 * resident, we are creating it here.
5509 	 */
5510 	if (va < VM_MAXUSER_ADDRESS) {
5511 		vm_pindex_t ptepindex;
5512 		pd_entry_t *ptepa;
5513 
5514 		/*
5515 		 * Calculate pagetable page index
5516 		 */
5517 		ptepindex = pmap_pde_pindex(va);
5518 		if (mpte && (mpte->pindex == ptepindex)) {
5519 			mpte->wire_count++;
5520 		} else {
5521 			/*
5522 			 * Get the page directory entry
5523 			 */
5524 			ptepa = pmap_pde(pmap, va);
5525 
5526 			/*
5527 			 * If the page table page is mapped, we just increment
5528 			 * the hold count, and activate it.  Otherwise, we
5529 			 * attempt to allocate a page table page.  If this
5530 			 * attempt fails, we don't retry.  Instead, we give up.
5531 			 */
5532 			if (ptepa && (*ptepa & PG_V) != 0) {
5533 				if (*ptepa & PG_PS)
5534 					return (NULL);
5535 				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
5536 				mpte->wire_count++;
5537 			} else {
5538 				/*
5539 				 * Pass NULL instead of the PV list lock
5540 				 * pointer, because we don't intend to sleep.
5541 				 */
5542 				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
5543 				if (mpte == NULL)
5544 					return (mpte);
5545 			}
5546 		}
5547 		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5548 		pte = &pte[pmap_pte_index(va)];
5549 	} else {
5550 		mpte = NULL;
5551 		pte = vtopte(va);
5552 	}
5553 	if (*pte) {
5554 		if (mpte != NULL) {
5555 			mpte->wire_count--;
5556 			mpte = NULL;
5557 		}
5558 		return (mpte);
5559 	}
5560 
5561 	/*
5562 	 * Enter on the PV list if part of our managed memory.
5563 	 */
5564 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
5565 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5566 		if (mpte != NULL) {
5567 			SLIST_INIT(&free);
5568 			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
5569 				/*
5570 				 * Although "va" is not mapped, paging-
5571 				 * structure caches could nonetheless have
5572 				 * entries that refer to the freed page table
5573 				 * pages.  Invalidate those entries.
5574 				 */
5575 				pmap_invalidate_page(pmap, va);
5576 				vm_page_free_pages_toq(&free, true);
5577 			}
5578 			mpte = NULL;
5579 		}
5580 		return (mpte);
5581 	}
5582 
5583 	/*
5584 	 * Increment counters
5585 	 */
5586 	pmap_resident_count_inc(pmap, 1);
5587 
5588 	newpte = VM_PAGE_TO_PHYS(m) | PG_V |
5589 	    pmap_cache_bits(pmap, m->md.pat_mode, 0);
5590 	if ((m->oflags & VPO_UNMANAGED) == 0)
5591 		newpte |= PG_MANAGED;
5592 	if ((prot & VM_PROT_EXECUTE) == 0)
5593 		newpte |= pg_nx;
5594 	if (va < VM_MAXUSER_ADDRESS)
5595 		newpte |= PG_U | pmap_pkru_get(pmap, va);
5596 	pte_store(pte, newpte);
5597 	return (mpte);
5598 }
5599 
5600 /*
5601  * Make a temporary mapping for a physical address.  This is only intended
5602  * to be used for panic dumps.
5603  */
5604 void *
5605 pmap_kenter_temporary(vm_paddr_t pa, int i)
5606 {
5607 	vm_offset_t va;
5608 
5609 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
5610 	pmap_kenter(va, pa);
5611 	invlpg(va);
5612 	return ((void *)crashdumpmap);
5613 }
5614 
5615 /*
5616  * This code maps large physical mmap regions into the
5617  * processor address space.  Note that some shortcuts
5618  * are taken, but the code works.
5619  */
5620 void
5621 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
5622     vm_pindex_t pindex, vm_size_t size)
5623 {
5624 	pd_entry_t *pde;
5625 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5626 	vm_paddr_t pa, ptepa;
5627 	vm_page_t p, pdpg;
5628 	int pat_mode;
5629 
5630 	PG_A = pmap_accessed_bit(pmap);
5631 	PG_M = pmap_modified_bit(pmap);
5632 	PG_V = pmap_valid_bit(pmap);
5633 	PG_RW = pmap_rw_bit(pmap);
5634 
5635 	VM_OBJECT_ASSERT_WLOCKED(object);
5636 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
5637 	    ("pmap_object_init_pt: non-device object"));
5638 	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
5639 		if (!pmap_ps_enabled(pmap))
5640 			return;
5641 		if (!vm_object_populate(object, pindex, pindex + atop(size)))
5642 			return;
5643 		p = vm_page_lookup(object, pindex);
5644 		KASSERT(p->valid == VM_PAGE_BITS_ALL,
5645 		    ("pmap_object_init_pt: invalid page %p", p));
5646 		pat_mode = p->md.pat_mode;
5647 
5648 		/*
5649 		 * Abort the mapping if the first page is not physically
5650 		 * aligned to a 2MB page boundary.
5651 		 */
5652 		ptepa = VM_PAGE_TO_PHYS(p);
5653 		if (ptepa & (NBPDR - 1))
5654 			return;
5655 
5656 		/*
5657 		 * Skip the first page.  Abort the mapping if the rest of
5658 		 * the pages are not physically contiguous or have differing
5659 		 * memory attributes.
5660 		 */
5661 		p = TAILQ_NEXT(p, listq);
5662 		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
5663 		    pa += PAGE_SIZE) {
5664 			KASSERT(p->valid == VM_PAGE_BITS_ALL,
5665 			    ("pmap_object_init_pt: invalid page %p", p));
5666 			if (pa != VM_PAGE_TO_PHYS(p) ||
5667 			    pat_mode != p->md.pat_mode)
5668 				return;
5669 			p = TAILQ_NEXT(p, listq);
5670 		}
5671 
5672 		/*
5673 		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
5674 		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
5675 		 * will not affect the termination of this loop.
5676 		 */
5677 		PMAP_LOCK(pmap);
5678 		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
5679 		    pa < ptepa + size; pa += NBPDR) {
5680 			pdpg = pmap_allocpde(pmap, addr, NULL);
5681 			if (pdpg == NULL) {
5682 				/*
5683 				 * The creation of mappings below is only an
5684 				 * optimization.  If a page directory page
5685 				 * cannot be allocated without blocking,
5686 				 * continue on to the next mapping rather than
5687 				 * blocking.
5688 				 */
5689 				addr += NBPDR;
5690 				continue;
5691 			}
5692 			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
5693 			pde = &pde[pmap_pde_index(addr)];
5694 			if ((*pde & PG_V) == 0) {
5695 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
5696 				    PG_U | PG_RW | PG_V);
5697 				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
5698 				atomic_add_long(&pmap_pde_mappings, 1);
5699 			} else {
5700 				/* Continue on if the PDE is already valid. */
5701 				pdpg->wire_count--;
5702 				KASSERT(pdpg->wire_count > 0,
5703 				    ("pmap_object_init_pt: missing reference "
5704 				    "to page directory page, va: 0x%lx", addr));
5705 			}
5706 			addr += NBPDR;
5707 		}
5708 		PMAP_UNLOCK(pmap);
5709 	}
5710 }
5711 
5712 /*
5713  *	Clear the wired attribute from the mappings for the specified range of
5714  *	addresses in the given pmap.  Every valid mapping within that range
5715  *	must have the wired attribute set.  In contrast, invalid mappings
5716  *	cannot have the wired attribute set, so they are ignored.
5717  *
5718  *	The wired attribute of the page table entry is not a hardware
5719  *	feature, so there is no need to invalidate any TLB entries.
5720  *	Since pmap_demote_pde() for the wired entry must never fail,
5721  *	pmap_delayed_invl_started()/finished() calls around the
5722  *	function are not needed.
5723  */
5724 void
5725 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5726 {
5727 	vm_offset_t va_next;
5728 	pml4_entry_t *pml4e;
5729 	pdp_entry_t *pdpe;
5730 	pd_entry_t *pde;
5731 	pt_entry_t *pte, PG_V;
5732 
5733 	PG_V = pmap_valid_bit(pmap);
5734 	PMAP_LOCK(pmap);
5735 	for (; sva < eva; sva = va_next) {
5736 		pml4e = pmap_pml4e(pmap, sva);
5737 		if ((*pml4e & PG_V) == 0) {
5738 			va_next = (sva + NBPML4) & ~PML4MASK;
5739 			if (va_next < sva)
5740 				va_next = eva;
5741 			continue;
5742 		}
5743 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5744 		if ((*pdpe & PG_V) == 0) {
5745 			va_next = (sva + NBPDP) & ~PDPMASK;
5746 			if (va_next < sva)
5747 				va_next = eva;
5748 			continue;
5749 		}
5750 		va_next = (sva + NBPDR) & ~PDRMASK;
5751 		if (va_next < sva)
5752 			va_next = eva;
5753 		pde = pmap_pdpe_to_pde(pdpe, sva);
5754 		if ((*pde & PG_V) == 0)
5755 			continue;
5756 		if ((*pde & PG_PS) != 0) {
5757 			if ((*pde & PG_W) == 0)
5758 				panic("pmap_unwire: pde %#jx is missing PG_W",
5759 				    (uintmax_t)*pde);
5760 
5761 			/*
5762 			 * Are we unwiring the entire large page?  If not,
5763 			 * demote the mapping and fall through.
5764 			 */
5765 			if (sva + NBPDR == va_next && eva >= va_next) {
5766 				atomic_clear_long(pde, PG_W);
5767 				pmap->pm_stats.wired_count -= NBPDR /
5768 				    PAGE_SIZE;
5769 				continue;
5770 			} else if (!pmap_demote_pde(pmap, pde, sva))
5771 				panic("pmap_unwire: demotion failed");
5772 		}
5773 		if (va_next > eva)
5774 			va_next = eva;
5775 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5776 		    sva += PAGE_SIZE) {
5777 			if ((*pte & PG_V) == 0)
5778 				continue;
5779 			if ((*pte & PG_W) == 0)
5780 				panic("pmap_unwire: pte %#jx is missing PG_W",
5781 				    (uintmax_t)*pte);
5782 
5783 			/*
5784 			 * PG_W must be cleared atomically.  Although the pmap
5785 			 * lock synchronizes access to PG_W, another processor
5786 			 * could be setting PG_M and/or PG_A concurrently.
5787 			 */
5788 			atomic_clear_long(pte, PG_W);
5789 			pmap->pm_stats.wired_count--;
5790 		}
5791 	}
5792 	PMAP_UNLOCK(pmap);
5793 }
5794 
5795 /*
5796  *	Copy the range specified by src_addr/len
5797  *	from the source map to the range dst_addr/len
5798  *	in the destination map.
5799  *
5800  *	This routine is only advisory and need not do anything.
5801  */
5802 
5803 void
5804 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
5805     vm_offset_t src_addr)
5806 {
5807 	struct rwlock *lock;
5808 	struct spglist free;
5809 	vm_offset_t addr;
5810 	vm_offset_t end_addr = src_addr + len;
5811 	vm_offset_t va_next;
5812 	vm_page_t dst_pdpg, dstmpte, srcmpte;
5813 	pt_entry_t PG_A, PG_M, PG_V;
5814 
5815 	if (dst_addr != src_addr)
5816 		return;
5817 
5818 	if (dst_pmap->pm_type != src_pmap->pm_type)
5819 		return;
5820 
5821 	/*
5822 	 * EPT page table entries that require emulation of A/D bits are
5823 	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
5824 	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
5825 	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
5826 	 * implementations flag an EPT misconfiguration for exec-only
5827 	 * mappings we skip this function entirely for emulated pmaps.
5828 	 */
5829 	if (pmap_emulate_ad_bits(dst_pmap))
5830 		return;
5831 
5832 	lock = NULL;
5833 	if (dst_pmap < src_pmap) {
5834 		PMAP_LOCK(dst_pmap);
5835 		PMAP_LOCK(src_pmap);
5836 	} else {
5837 		PMAP_LOCK(src_pmap);
5838 		PMAP_LOCK(dst_pmap);
5839 	}
5840 
5841 	PG_A = pmap_accessed_bit(dst_pmap);
5842 	PG_M = pmap_modified_bit(dst_pmap);
5843 	PG_V = pmap_valid_bit(dst_pmap);
5844 
5845 	for (addr = src_addr; addr < end_addr; addr = va_next) {
5846 		pt_entry_t *src_pte, *dst_pte;
5847 		pml4_entry_t *pml4e;
5848 		pdp_entry_t *pdpe;
5849 		pd_entry_t srcptepaddr, *pde;
5850 
5851 		KASSERT(addr < UPT_MIN_ADDRESS,
5852 		    ("pmap_copy: invalid to pmap_copy page tables"));
5853 
5854 		pml4e = pmap_pml4e(src_pmap, addr);
5855 		if ((*pml4e & PG_V) == 0) {
5856 			va_next = (addr + NBPML4) & ~PML4MASK;
5857 			if (va_next < addr)
5858 				va_next = end_addr;
5859 			continue;
5860 		}
5861 
5862 		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
5863 		if ((*pdpe & PG_V) == 0) {
5864 			va_next = (addr + NBPDP) & ~PDPMASK;
5865 			if (va_next < addr)
5866 				va_next = end_addr;
5867 			continue;
5868 		}
5869 
5870 		va_next = (addr + NBPDR) & ~PDRMASK;
5871 		if (va_next < addr)
5872 			va_next = end_addr;
5873 
5874 		pde = pmap_pdpe_to_pde(pdpe, addr);
5875 		srcptepaddr = *pde;
5876 		if (srcptepaddr == 0)
5877 			continue;
5878 
5879 		if (srcptepaddr & PG_PS) {
5880 			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
5881 				continue;
5882 			dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL);
5883 			if (dst_pdpg == NULL)
5884 				break;
5885 			pde = (pd_entry_t *)
5886 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
5887 			pde = &pde[pmap_pde_index(addr)];
5888 			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
5889 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
5890 			    PMAP_ENTER_NORECLAIM, &lock))) {
5891 				*pde = srcptepaddr & ~PG_W;
5892 				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
5893 				atomic_add_long(&pmap_pde_mappings, 1);
5894 			} else
5895 				dst_pdpg->wire_count--;
5896 			continue;
5897 		}
5898 
5899 		srcptepaddr &= PG_FRAME;
5900 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
5901 		KASSERT(srcmpte->wire_count > 0,
5902 		    ("pmap_copy: source page table page is unused"));
5903 
5904 		if (va_next > end_addr)
5905 			va_next = end_addr;
5906 
5907 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
5908 		src_pte = &src_pte[pmap_pte_index(addr)];
5909 		dstmpte = NULL;
5910 		while (addr < va_next) {
5911 			pt_entry_t ptetemp;
5912 			ptetemp = *src_pte;
5913 			/*
5914 			 * we only virtual copy managed pages
5915 			 */
5916 			if ((ptetemp & PG_MANAGED) != 0) {
5917 				if (dstmpte != NULL &&
5918 				    dstmpte->pindex == pmap_pde_pindex(addr))
5919 					dstmpte->wire_count++;
5920 				else if ((dstmpte = pmap_allocpte(dst_pmap,
5921 				    addr, NULL)) == NULL)
5922 					goto out;
5923 				dst_pte = (pt_entry_t *)
5924 				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
5925 				dst_pte = &dst_pte[pmap_pte_index(addr)];
5926 				if (*dst_pte == 0 &&
5927 				    pmap_try_insert_pv_entry(dst_pmap, addr,
5928 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
5929 				    &lock)) {
5930 					/*
5931 					 * Clear the wired, modified, and
5932 					 * accessed (referenced) bits
5933 					 * during the copy.
5934 					 */
5935 					*dst_pte = ptetemp & ~(PG_W | PG_M |
5936 					    PG_A);
5937 					pmap_resident_count_inc(dst_pmap, 1);
5938 				} else {
5939 					SLIST_INIT(&free);
5940 					if (pmap_unwire_ptp(dst_pmap, addr,
5941 					    dstmpte, &free)) {
5942 						/*
5943 						 * Although "addr" is not
5944 						 * mapped, paging-structure
5945 						 * caches could nonetheless
5946 						 * have entries that refer to
5947 						 * the freed page table pages.
5948 						 * Invalidate those entries.
5949 						 */
5950 						pmap_invalidate_page(dst_pmap,
5951 						    addr);
5952 						vm_page_free_pages_toq(&free,
5953 						    true);
5954 					}
5955 					goto out;
5956 				}
5957 				if (dstmpte->wire_count >= srcmpte->wire_count)
5958 					break;
5959 			}
5960 			addr += PAGE_SIZE;
5961 			src_pte++;
5962 		}
5963 	}
5964 out:
5965 	if (lock != NULL)
5966 		rw_wunlock(lock);
5967 	PMAP_UNLOCK(src_pmap);
5968 	PMAP_UNLOCK(dst_pmap);
5969 }
5970 
5971 int
5972 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
5973 {
5974 	int error;
5975 
5976 	if (dst_pmap->pm_type != src_pmap->pm_type ||
5977 	    dst_pmap->pm_type != PT_X86 ||
5978 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
5979 		return (0);
5980 	for (;;) {
5981 		if (dst_pmap < src_pmap) {
5982 			PMAP_LOCK(dst_pmap);
5983 			PMAP_LOCK(src_pmap);
5984 		} else {
5985 			PMAP_LOCK(src_pmap);
5986 			PMAP_LOCK(dst_pmap);
5987 		}
5988 		error = pmap_pkru_copy(dst_pmap, src_pmap);
5989 		/* Clean up partial copy on failure due to no memory. */
5990 		if (error == ENOMEM)
5991 			pmap_pkru_deassign_all(dst_pmap);
5992 		PMAP_UNLOCK(src_pmap);
5993 		PMAP_UNLOCK(dst_pmap);
5994 		if (error != ENOMEM)
5995 			break;
5996 		vm_wait(NULL);
5997 	}
5998 	return (error);
5999 }
6000 
6001 /*
6002  * Zero the specified hardware page.
6003  */
6004 void
6005 pmap_zero_page(vm_page_t m)
6006 {
6007 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6008 
6009 	pagezero((void *)va);
6010 }
6011 
6012 /*
6013  * Zero an an area within a single hardware page.  off and size must not
6014  * cover an area beyond a single hardware page.
6015  */
6016 void
6017 pmap_zero_page_area(vm_page_t m, int off, int size)
6018 {
6019 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6020 
6021 	if (off == 0 && size == PAGE_SIZE)
6022 		pagezero((void *)va);
6023 	else
6024 		bzero((char *)va + off, size);
6025 }
6026 
6027 /*
6028  * Copy 1 specified hardware page to another.
6029  */
6030 void
6031 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6032 {
6033 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6034 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6035 
6036 	pagecopy((void *)src, (void *)dst);
6037 }
6038 
6039 int unmapped_buf_allowed = 1;
6040 
6041 void
6042 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6043     vm_offset_t b_offset, int xfersize)
6044 {
6045 	void *a_cp, *b_cp;
6046 	vm_page_t pages[2];
6047 	vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
6048 	int cnt;
6049 	boolean_t mapped;
6050 
6051 	while (xfersize > 0) {
6052 		a_pg_offset = a_offset & PAGE_MASK;
6053 		pages[0] = ma[a_offset >> PAGE_SHIFT];
6054 		b_pg_offset = b_offset & PAGE_MASK;
6055 		pages[1] = mb[b_offset >> PAGE_SHIFT];
6056 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6057 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6058 		mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
6059 		a_cp = (char *)vaddr[0] + a_pg_offset;
6060 		b_cp = (char *)vaddr[1] + b_pg_offset;
6061 		bcopy(a_cp, b_cp, cnt);
6062 		if (__predict_false(mapped))
6063 			pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
6064 		a_offset += cnt;
6065 		b_offset += cnt;
6066 		xfersize -= cnt;
6067 	}
6068 }
6069 
6070 /*
6071  * Returns true if the pmap's pv is one of the first
6072  * 16 pvs linked to from this page.  This count may
6073  * be changed upwards or downwards in the future; it
6074  * is only necessary that true be returned for a small
6075  * subset of pmaps for proper page aging.
6076  */
6077 boolean_t
6078 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6079 {
6080 	struct md_page *pvh;
6081 	struct rwlock *lock;
6082 	pv_entry_t pv;
6083 	int loops = 0;
6084 	boolean_t rv;
6085 
6086 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6087 	    ("pmap_page_exists_quick: page %p is not managed", m));
6088 	rv = FALSE;
6089 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6090 	rw_rlock(lock);
6091 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6092 		if (PV_PMAP(pv) == pmap) {
6093 			rv = TRUE;
6094 			break;
6095 		}
6096 		loops++;
6097 		if (loops >= 16)
6098 			break;
6099 	}
6100 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6101 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6102 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6103 			if (PV_PMAP(pv) == pmap) {
6104 				rv = TRUE;
6105 				break;
6106 			}
6107 			loops++;
6108 			if (loops >= 16)
6109 				break;
6110 		}
6111 	}
6112 	rw_runlock(lock);
6113 	return (rv);
6114 }
6115 
6116 /*
6117  *	pmap_page_wired_mappings:
6118  *
6119  *	Return the number of managed mappings to the given physical page
6120  *	that are wired.
6121  */
6122 int
6123 pmap_page_wired_mappings(vm_page_t m)
6124 {
6125 	struct rwlock *lock;
6126 	struct md_page *pvh;
6127 	pmap_t pmap;
6128 	pt_entry_t *pte;
6129 	pv_entry_t pv;
6130 	int count, md_gen, pvh_gen;
6131 
6132 	if ((m->oflags & VPO_UNMANAGED) != 0)
6133 		return (0);
6134 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6135 	rw_rlock(lock);
6136 restart:
6137 	count = 0;
6138 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6139 		pmap = PV_PMAP(pv);
6140 		if (!PMAP_TRYLOCK(pmap)) {
6141 			md_gen = m->md.pv_gen;
6142 			rw_runlock(lock);
6143 			PMAP_LOCK(pmap);
6144 			rw_rlock(lock);
6145 			if (md_gen != m->md.pv_gen) {
6146 				PMAP_UNLOCK(pmap);
6147 				goto restart;
6148 			}
6149 		}
6150 		pte = pmap_pte(pmap, pv->pv_va);
6151 		if ((*pte & PG_W) != 0)
6152 			count++;
6153 		PMAP_UNLOCK(pmap);
6154 	}
6155 	if ((m->flags & PG_FICTITIOUS) == 0) {
6156 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6157 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6158 			pmap = PV_PMAP(pv);
6159 			if (!PMAP_TRYLOCK(pmap)) {
6160 				md_gen = m->md.pv_gen;
6161 				pvh_gen = pvh->pv_gen;
6162 				rw_runlock(lock);
6163 				PMAP_LOCK(pmap);
6164 				rw_rlock(lock);
6165 				if (md_gen != m->md.pv_gen ||
6166 				    pvh_gen != pvh->pv_gen) {
6167 					PMAP_UNLOCK(pmap);
6168 					goto restart;
6169 				}
6170 			}
6171 			pte = pmap_pde(pmap, pv->pv_va);
6172 			if ((*pte & PG_W) != 0)
6173 				count++;
6174 			PMAP_UNLOCK(pmap);
6175 		}
6176 	}
6177 	rw_runlock(lock);
6178 	return (count);
6179 }
6180 
6181 /*
6182  * Returns TRUE if the given page is mapped individually or as part of
6183  * a 2mpage.  Otherwise, returns FALSE.
6184  */
6185 boolean_t
6186 pmap_page_is_mapped(vm_page_t m)
6187 {
6188 	struct rwlock *lock;
6189 	boolean_t rv;
6190 
6191 	if ((m->oflags & VPO_UNMANAGED) != 0)
6192 		return (FALSE);
6193 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6194 	rw_rlock(lock);
6195 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6196 	    ((m->flags & PG_FICTITIOUS) == 0 &&
6197 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
6198 	rw_runlock(lock);
6199 	return (rv);
6200 }
6201 
6202 /*
6203  * Destroy all managed, non-wired mappings in the given user-space
6204  * pmap.  This pmap cannot be active on any processor besides the
6205  * caller.
6206  *
6207  * This function cannot be applied to the kernel pmap.  Moreover, it
6208  * is not intended for general use.  It is only to be used during
6209  * process termination.  Consequently, it can be implemented in ways
6210  * that make it faster than pmap_remove().  First, it can more quickly
6211  * destroy mappings by iterating over the pmap's collection of PV
6212  * entries, rather than searching the page table.  Second, it doesn't
6213  * have to test and clear the page table entries atomically, because
6214  * no processor is currently accessing the user address space.  In
6215  * particular, a page table entry's dirty bit won't change state once
6216  * this function starts.
6217  *
6218  * Although this function destroys all of the pmap's managed,
6219  * non-wired mappings, it can delay and batch the invalidation of TLB
6220  * entries without calling pmap_delayed_invl_started() and
6221  * pmap_delayed_invl_finished().  Because the pmap is not active on
6222  * any other processor, none of these TLB entries will ever be used
6223  * before their eventual invalidation.  Consequently, there is no need
6224  * for either pmap_remove_all() or pmap_remove_write() to wait for
6225  * that eventual TLB invalidation.
6226  */
6227 void
6228 pmap_remove_pages(pmap_t pmap)
6229 {
6230 	pd_entry_t ptepde;
6231 	pt_entry_t *pte, tpte;
6232 	pt_entry_t PG_M, PG_RW, PG_V;
6233 	struct spglist free;
6234 	vm_page_t m, mpte, mt;
6235 	pv_entry_t pv;
6236 	struct md_page *pvh;
6237 	struct pv_chunk *pc, *npc;
6238 	struct rwlock *lock;
6239 	int64_t bit;
6240 	uint64_t inuse, bitmask;
6241 	int allfree, field, freed, idx;
6242 	boolean_t superpage;
6243 	vm_paddr_t pa;
6244 
6245 	/*
6246 	 * Assert that the given pmap is only active on the current
6247 	 * CPU.  Unfortunately, we cannot block another CPU from
6248 	 * activating the pmap while this function is executing.
6249 	 */
6250 	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
6251 #ifdef INVARIANTS
6252 	{
6253 		cpuset_t other_cpus;
6254 
6255 		other_cpus = all_cpus;
6256 		critical_enter();
6257 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
6258 		CPU_AND(&other_cpus, &pmap->pm_active);
6259 		critical_exit();
6260 		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
6261 	}
6262 #endif
6263 
6264 	lock = NULL;
6265 	PG_M = pmap_modified_bit(pmap);
6266 	PG_V = pmap_valid_bit(pmap);
6267 	PG_RW = pmap_rw_bit(pmap);
6268 
6269 	SLIST_INIT(&free);
6270 	PMAP_LOCK(pmap);
6271 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6272 		allfree = 1;
6273 		freed = 0;
6274 		for (field = 0; field < _NPCM; field++) {
6275 			inuse = ~pc->pc_map[field] & pc_freemask[field];
6276 			while (inuse != 0) {
6277 				bit = bsfq(inuse);
6278 				bitmask = 1UL << bit;
6279 				idx = field * 64 + bit;
6280 				pv = &pc->pc_pventry[idx];
6281 				inuse &= ~bitmask;
6282 
6283 				pte = pmap_pdpe(pmap, pv->pv_va);
6284 				ptepde = *pte;
6285 				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
6286 				tpte = *pte;
6287 				if ((tpte & (PG_PS | PG_V)) == PG_V) {
6288 					superpage = FALSE;
6289 					ptepde = tpte;
6290 					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
6291 					    PG_FRAME);
6292 					pte = &pte[pmap_pte_index(pv->pv_va)];
6293 					tpte = *pte;
6294 				} else {
6295 					/*
6296 					 * Keep track whether 'tpte' is a
6297 					 * superpage explicitly instead of
6298 					 * relying on PG_PS being set.
6299 					 *
6300 					 * This is because PG_PS is numerically
6301 					 * identical to PG_PTE_PAT and thus a
6302 					 * regular page could be mistaken for
6303 					 * a superpage.
6304 					 */
6305 					superpage = TRUE;
6306 				}
6307 
6308 				if ((tpte & PG_V) == 0) {
6309 					panic("bad pte va %lx pte %lx",
6310 					    pv->pv_va, tpte);
6311 				}
6312 
6313 /*
6314  * We cannot remove wired pages from a process' mapping at this time
6315  */
6316 				if (tpte & PG_W) {
6317 					allfree = 0;
6318 					continue;
6319 				}
6320 
6321 				if (superpage)
6322 					pa = tpte & PG_PS_FRAME;
6323 				else
6324 					pa = tpte & PG_FRAME;
6325 
6326 				m = PHYS_TO_VM_PAGE(pa);
6327 				KASSERT(m->phys_addr == pa,
6328 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6329 				    m, (uintmax_t)m->phys_addr,
6330 				    (uintmax_t)tpte));
6331 
6332 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6333 				    m < &vm_page_array[vm_page_array_size],
6334 				    ("pmap_remove_pages: bad tpte %#jx",
6335 				    (uintmax_t)tpte));
6336 
6337 				pte_clear(pte);
6338 
6339 				/*
6340 				 * Update the vm_page_t clean/reference bits.
6341 				 */
6342 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6343 					if (superpage) {
6344 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6345 							vm_page_dirty(mt);
6346 					} else
6347 						vm_page_dirty(m);
6348 				}
6349 
6350 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
6351 
6352 				/* Mark free */
6353 				pc->pc_map[field] |= bitmask;
6354 				if (superpage) {
6355 					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
6356 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
6357 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6358 					pvh->pv_gen++;
6359 					if (TAILQ_EMPTY(&pvh->pv_list)) {
6360 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6361 							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
6362 							    TAILQ_EMPTY(&mt->md.pv_list))
6363 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
6364 					}
6365 					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
6366 					if (mpte != NULL) {
6367 						pmap_resident_count_dec(pmap, 1);
6368 						KASSERT(mpte->wire_count == NPTEPG,
6369 						    ("pmap_remove_pages: pte page wire count error"));
6370 						mpte->wire_count = 0;
6371 						pmap_add_delayed_free_list(mpte, &free, FALSE);
6372 					}
6373 				} else {
6374 					pmap_resident_count_dec(pmap, 1);
6375 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6376 					m->md.pv_gen++;
6377 					if ((m->aflags & PGA_WRITEABLE) != 0 &&
6378 					    TAILQ_EMPTY(&m->md.pv_list) &&
6379 					    (m->flags & PG_FICTITIOUS) == 0) {
6380 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6381 						if (TAILQ_EMPTY(&pvh->pv_list))
6382 							vm_page_aflag_clear(m, PGA_WRITEABLE);
6383 					}
6384 				}
6385 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
6386 				freed++;
6387 			}
6388 		}
6389 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
6390 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
6391 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
6392 		if (allfree) {
6393 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
6394 			free_pv_chunk(pc);
6395 		}
6396 	}
6397 	if (lock != NULL)
6398 		rw_wunlock(lock);
6399 	pmap_invalidate_all(pmap);
6400 	pmap_pkru_deassign_all(pmap);
6401 	PMAP_UNLOCK(pmap);
6402 	vm_page_free_pages_toq(&free, true);
6403 }
6404 
6405 static boolean_t
6406 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
6407 {
6408 	struct rwlock *lock;
6409 	pv_entry_t pv;
6410 	struct md_page *pvh;
6411 	pt_entry_t *pte, mask;
6412 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6413 	pmap_t pmap;
6414 	int md_gen, pvh_gen;
6415 	boolean_t rv;
6416 
6417 	rv = FALSE;
6418 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6419 	rw_rlock(lock);
6420 restart:
6421 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6422 		pmap = PV_PMAP(pv);
6423 		if (!PMAP_TRYLOCK(pmap)) {
6424 			md_gen = m->md.pv_gen;
6425 			rw_runlock(lock);
6426 			PMAP_LOCK(pmap);
6427 			rw_rlock(lock);
6428 			if (md_gen != m->md.pv_gen) {
6429 				PMAP_UNLOCK(pmap);
6430 				goto restart;
6431 			}
6432 		}
6433 		pte = pmap_pte(pmap, pv->pv_va);
6434 		mask = 0;
6435 		if (modified) {
6436 			PG_M = pmap_modified_bit(pmap);
6437 			PG_RW = pmap_rw_bit(pmap);
6438 			mask |= PG_RW | PG_M;
6439 		}
6440 		if (accessed) {
6441 			PG_A = pmap_accessed_bit(pmap);
6442 			PG_V = pmap_valid_bit(pmap);
6443 			mask |= PG_V | PG_A;
6444 		}
6445 		rv = (*pte & mask) == mask;
6446 		PMAP_UNLOCK(pmap);
6447 		if (rv)
6448 			goto out;
6449 	}
6450 	if ((m->flags & PG_FICTITIOUS) == 0) {
6451 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6452 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6453 			pmap = PV_PMAP(pv);
6454 			if (!PMAP_TRYLOCK(pmap)) {
6455 				md_gen = m->md.pv_gen;
6456 				pvh_gen = pvh->pv_gen;
6457 				rw_runlock(lock);
6458 				PMAP_LOCK(pmap);
6459 				rw_rlock(lock);
6460 				if (md_gen != m->md.pv_gen ||
6461 				    pvh_gen != pvh->pv_gen) {
6462 					PMAP_UNLOCK(pmap);
6463 					goto restart;
6464 				}
6465 			}
6466 			pte = pmap_pde(pmap, pv->pv_va);
6467 			mask = 0;
6468 			if (modified) {
6469 				PG_M = pmap_modified_bit(pmap);
6470 				PG_RW = pmap_rw_bit(pmap);
6471 				mask |= PG_RW | PG_M;
6472 			}
6473 			if (accessed) {
6474 				PG_A = pmap_accessed_bit(pmap);
6475 				PG_V = pmap_valid_bit(pmap);
6476 				mask |= PG_V | PG_A;
6477 			}
6478 			rv = (*pte & mask) == mask;
6479 			PMAP_UNLOCK(pmap);
6480 			if (rv)
6481 				goto out;
6482 		}
6483 	}
6484 out:
6485 	rw_runlock(lock);
6486 	return (rv);
6487 }
6488 
6489 /*
6490  *	pmap_is_modified:
6491  *
6492  *	Return whether or not the specified physical page was modified
6493  *	in any physical maps.
6494  */
6495 boolean_t
6496 pmap_is_modified(vm_page_t m)
6497 {
6498 
6499 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6500 	    ("pmap_is_modified: page %p is not managed", m));
6501 
6502 	/*
6503 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
6504 	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
6505 	 * is clear, no PTEs can have PG_M set.
6506 	 */
6507 	VM_OBJECT_ASSERT_WLOCKED(m->object);
6508 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
6509 		return (FALSE);
6510 	return (pmap_page_test_mappings(m, FALSE, TRUE));
6511 }
6512 
6513 /*
6514  *	pmap_is_prefaultable:
6515  *
6516  *	Return whether or not the specified virtual address is eligible
6517  *	for prefault.
6518  */
6519 boolean_t
6520 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
6521 {
6522 	pd_entry_t *pde;
6523 	pt_entry_t *pte, PG_V;
6524 	boolean_t rv;
6525 
6526 	PG_V = pmap_valid_bit(pmap);
6527 	rv = FALSE;
6528 	PMAP_LOCK(pmap);
6529 	pde = pmap_pde(pmap, addr);
6530 	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
6531 		pte = pmap_pde_to_pte(pde, addr);
6532 		rv = (*pte & PG_V) == 0;
6533 	}
6534 	PMAP_UNLOCK(pmap);
6535 	return (rv);
6536 }
6537 
6538 /*
6539  *	pmap_is_referenced:
6540  *
6541  *	Return whether or not the specified physical page was referenced
6542  *	in any physical maps.
6543  */
6544 boolean_t
6545 pmap_is_referenced(vm_page_t m)
6546 {
6547 
6548 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6549 	    ("pmap_is_referenced: page %p is not managed", m));
6550 	return (pmap_page_test_mappings(m, TRUE, FALSE));
6551 }
6552 
6553 /*
6554  * Clear the write and modified bits in each of the given page's mappings.
6555  */
6556 void
6557 pmap_remove_write(vm_page_t m)
6558 {
6559 	struct md_page *pvh;
6560 	pmap_t pmap;
6561 	struct rwlock *lock;
6562 	pv_entry_t next_pv, pv;
6563 	pd_entry_t *pde;
6564 	pt_entry_t oldpte, *pte, PG_M, PG_RW;
6565 	vm_offset_t va;
6566 	int pvh_gen, md_gen;
6567 
6568 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6569 	    ("pmap_remove_write: page %p is not managed", m));
6570 
6571 	/*
6572 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
6573 	 * set by another thread while the object is locked.  Thus,
6574 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
6575 	 */
6576 	VM_OBJECT_ASSERT_WLOCKED(m->object);
6577 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
6578 		return;
6579 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6580 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6581 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
6582 retry_pv_loop:
6583 	rw_wlock(lock);
6584 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6585 		pmap = PV_PMAP(pv);
6586 		if (!PMAP_TRYLOCK(pmap)) {
6587 			pvh_gen = pvh->pv_gen;
6588 			rw_wunlock(lock);
6589 			PMAP_LOCK(pmap);
6590 			rw_wlock(lock);
6591 			if (pvh_gen != pvh->pv_gen) {
6592 				PMAP_UNLOCK(pmap);
6593 				rw_wunlock(lock);
6594 				goto retry_pv_loop;
6595 			}
6596 		}
6597 		PG_RW = pmap_rw_bit(pmap);
6598 		va = pv->pv_va;
6599 		pde = pmap_pde(pmap, va);
6600 		if ((*pde & PG_RW) != 0)
6601 			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
6602 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6603 		    ("inconsistent pv lock %p %p for page %p",
6604 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6605 		PMAP_UNLOCK(pmap);
6606 	}
6607 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6608 		pmap = PV_PMAP(pv);
6609 		if (!PMAP_TRYLOCK(pmap)) {
6610 			pvh_gen = pvh->pv_gen;
6611 			md_gen = m->md.pv_gen;
6612 			rw_wunlock(lock);
6613 			PMAP_LOCK(pmap);
6614 			rw_wlock(lock);
6615 			if (pvh_gen != pvh->pv_gen ||
6616 			    md_gen != m->md.pv_gen) {
6617 				PMAP_UNLOCK(pmap);
6618 				rw_wunlock(lock);
6619 				goto retry_pv_loop;
6620 			}
6621 		}
6622 		PG_M = pmap_modified_bit(pmap);
6623 		PG_RW = pmap_rw_bit(pmap);
6624 		pde = pmap_pde(pmap, pv->pv_va);
6625 		KASSERT((*pde & PG_PS) == 0,
6626 		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
6627 		    m));
6628 		pte = pmap_pde_to_pte(pde, pv->pv_va);
6629 retry:
6630 		oldpte = *pte;
6631 		if (oldpte & PG_RW) {
6632 			if (!atomic_cmpset_long(pte, oldpte, oldpte &
6633 			    ~(PG_RW | PG_M)))
6634 				goto retry;
6635 			if ((oldpte & PG_M) != 0)
6636 				vm_page_dirty(m);
6637 			pmap_invalidate_page(pmap, pv->pv_va);
6638 		}
6639 		PMAP_UNLOCK(pmap);
6640 	}
6641 	rw_wunlock(lock);
6642 	vm_page_aflag_clear(m, PGA_WRITEABLE);
6643 	pmap_delayed_invl_wait(m);
6644 }
6645 
6646 static __inline boolean_t
6647 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
6648 {
6649 
6650 	if (!pmap_emulate_ad_bits(pmap))
6651 		return (TRUE);
6652 
6653 	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
6654 
6655 	/*
6656 	 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
6657 	 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
6658 	 * if the EPT_PG_WRITE bit is set.
6659 	 */
6660 	if ((pte & EPT_PG_WRITE) != 0)
6661 		return (FALSE);
6662 
6663 	/*
6664 	 * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
6665 	 */
6666 	if ((pte & EPT_PG_EXECUTE) == 0 ||
6667 	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
6668 		return (TRUE);
6669 	else
6670 		return (FALSE);
6671 }
6672 
6673 /*
6674  *	pmap_ts_referenced:
6675  *
6676  *	Return a count of reference bits for a page, clearing those bits.
6677  *	It is not necessary for every reference bit to be cleared, but it
6678  *	is necessary that 0 only be returned when there are truly no
6679  *	reference bits set.
6680  *
6681  *	As an optimization, update the page's dirty field if a modified bit is
6682  *	found while counting reference bits.  This opportunistic update can be
6683  *	performed at low cost and can eliminate the need for some future calls
6684  *	to pmap_is_modified().  However, since this function stops after
6685  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
6686  *	dirty pages.  Those dirty pages will only be detected by a future call
6687  *	to pmap_is_modified().
6688  *
6689  *	A DI block is not needed within this function, because
6690  *	invalidations are performed before the PV list lock is
6691  *	released.
6692  */
6693 int
6694 pmap_ts_referenced(vm_page_t m)
6695 {
6696 	struct md_page *pvh;
6697 	pv_entry_t pv, pvf;
6698 	pmap_t pmap;
6699 	struct rwlock *lock;
6700 	pd_entry_t oldpde, *pde;
6701 	pt_entry_t *pte, PG_A, PG_M, PG_RW;
6702 	vm_offset_t va;
6703 	vm_paddr_t pa;
6704 	int cleared, md_gen, not_cleared, pvh_gen;
6705 	struct spglist free;
6706 	boolean_t demoted;
6707 
6708 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6709 	    ("pmap_ts_referenced: page %p is not managed", m));
6710 	SLIST_INIT(&free);
6711 	cleared = 0;
6712 	pa = VM_PAGE_TO_PHYS(m);
6713 	lock = PHYS_TO_PV_LIST_LOCK(pa);
6714 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
6715 	rw_wlock(lock);
6716 retry:
6717 	not_cleared = 0;
6718 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
6719 		goto small_mappings;
6720 	pv = pvf;
6721 	do {
6722 		if (pvf == NULL)
6723 			pvf = pv;
6724 		pmap = PV_PMAP(pv);
6725 		if (!PMAP_TRYLOCK(pmap)) {
6726 			pvh_gen = pvh->pv_gen;
6727 			rw_wunlock(lock);
6728 			PMAP_LOCK(pmap);
6729 			rw_wlock(lock);
6730 			if (pvh_gen != pvh->pv_gen) {
6731 				PMAP_UNLOCK(pmap);
6732 				goto retry;
6733 			}
6734 		}
6735 		PG_A = pmap_accessed_bit(pmap);
6736 		PG_M = pmap_modified_bit(pmap);
6737 		PG_RW = pmap_rw_bit(pmap);
6738 		va = pv->pv_va;
6739 		pde = pmap_pde(pmap, pv->pv_va);
6740 		oldpde = *pde;
6741 		if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6742 			/*
6743 			 * Although "oldpde" is mapping a 2MB page, because
6744 			 * this function is called at a 4KB page granularity,
6745 			 * we only update the 4KB page under test.
6746 			 */
6747 			vm_page_dirty(m);
6748 		}
6749 		if ((oldpde & PG_A) != 0) {
6750 			/*
6751 			 * Since this reference bit is shared by 512 4KB
6752 			 * pages, it should not be cleared every time it is
6753 			 * tested.  Apply a simple "hash" function on the
6754 			 * physical page number, the virtual superpage number,
6755 			 * and the pmap address to select one 4KB page out of
6756 			 * the 512 on which testing the reference bit will
6757 			 * result in clearing that reference bit.  This
6758 			 * function is designed to avoid the selection of the
6759 			 * same 4KB page for every 2MB page mapping.
6760 			 *
6761 			 * On demotion, a mapping that hasn't been referenced
6762 			 * is simply destroyed.  To avoid the possibility of a
6763 			 * subsequent page fault on a demoted wired mapping,
6764 			 * always leave its reference bit set.  Moreover,
6765 			 * since the superpage is wired, the current state of
6766 			 * its reference bit won't affect page replacement.
6767 			 */
6768 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
6769 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
6770 			    (oldpde & PG_W) == 0) {
6771 				if (safe_to_clear_referenced(pmap, oldpde)) {
6772 					atomic_clear_long(pde, PG_A);
6773 					pmap_invalidate_page(pmap, pv->pv_va);
6774 					demoted = FALSE;
6775 				} else if (pmap_demote_pde_locked(pmap, pde,
6776 				    pv->pv_va, &lock)) {
6777 					/*
6778 					 * Remove the mapping to a single page
6779 					 * so that a subsequent access may
6780 					 * repromote.  Since the underlying
6781 					 * page table page is fully populated,
6782 					 * this removal never frees a page
6783 					 * table page.
6784 					 */
6785 					demoted = TRUE;
6786 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6787 					    PG_PS_FRAME);
6788 					pte = pmap_pde_to_pte(pde, va);
6789 					pmap_remove_pte(pmap, pte, va, *pde,
6790 					    NULL, &lock);
6791 					pmap_invalidate_page(pmap, va);
6792 				} else
6793 					demoted = TRUE;
6794 
6795 				if (demoted) {
6796 					/*
6797 					 * The superpage mapping was removed
6798 					 * entirely and therefore 'pv' is no
6799 					 * longer valid.
6800 					 */
6801 					if (pvf == pv)
6802 						pvf = NULL;
6803 					pv = NULL;
6804 				}
6805 				cleared++;
6806 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6807 				    ("inconsistent pv lock %p %p for page %p",
6808 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6809 			} else
6810 				not_cleared++;
6811 		}
6812 		PMAP_UNLOCK(pmap);
6813 		/* Rotate the PV list if it has more than one entry. */
6814 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6815 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6816 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
6817 			pvh->pv_gen++;
6818 		}
6819 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
6820 			goto out;
6821 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
6822 small_mappings:
6823 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
6824 		goto out;
6825 	pv = pvf;
6826 	do {
6827 		if (pvf == NULL)
6828 			pvf = pv;
6829 		pmap = PV_PMAP(pv);
6830 		if (!PMAP_TRYLOCK(pmap)) {
6831 			pvh_gen = pvh->pv_gen;
6832 			md_gen = m->md.pv_gen;
6833 			rw_wunlock(lock);
6834 			PMAP_LOCK(pmap);
6835 			rw_wlock(lock);
6836 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6837 				PMAP_UNLOCK(pmap);
6838 				goto retry;
6839 			}
6840 		}
6841 		PG_A = pmap_accessed_bit(pmap);
6842 		PG_M = pmap_modified_bit(pmap);
6843 		PG_RW = pmap_rw_bit(pmap);
6844 		pde = pmap_pde(pmap, pv->pv_va);
6845 		KASSERT((*pde & PG_PS) == 0,
6846 		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
6847 		    m));
6848 		pte = pmap_pde_to_pte(pde, pv->pv_va);
6849 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6850 			vm_page_dirty(m);
6851 		if ((*pte & PG_A) != 0) {
6852 			if (safe_to_clear_referenced(pmap, *pte)) {
6853 				atomic_clear_long(pte, PG_A);
6854 				pmap_invalidate_page(pmap, pv->pv_va);
6855 				cleared++;
6856 			} else if ((*pte & PG_W) == 0) {
6857 				/*
6858 				 * Wired pages cannot be paged out so
6859 				 * doing accessed bit emulation for
6860 				 * them is wasted effort. We do the
6861 				 * hard work for unwired pages only.
6862 				 */
6863 				pmap_remove_pte(pmap, pte, pv->pv_va,
6864 				    *pde, &free, &lock);
6865 				pmap_invalidate_page(pmap, pv->pv_va);
6866 				cleared++;
6867 				if (pvf == pv)
6868 					pvf = NULL;
6869 				pv = NULL;
6870 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6871 				    ("inconsistent pv lock %p %p for page %p",
6872 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6873 			} else
6874 				not_cleared++;
6875 		}
6876 		PMAP_UNLOCK(pmap);
6877 		/* Rotate the PV list if it has more than one entry. */
6878 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6879 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6880 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
6881 			m->md.pv_gen++;
6882 		}
6883 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
6884 	    not_cleared < PMAP_TS_REFERENCED_MAX);
6885 out:
6886 	rw_wunlock(lock);
6887 	vm_page_free_pages_toq(&free, true);
6888 	return (cleared + not_cleared);
6889 }
6890 
6891 /*
6892  *	Apply the given advice to the specified range of addresses within the
6893  *	given pmap.  Depending on the advice, clear the referenced and/or
6894  *	modified flags in each mapping and set the mapped page's dirty field.
6895  */
6896 void
6897 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
6898 {
6899 	struct rwlock *lock;
6900 	pml4_entry_t *pml4e;
6901 	pdp_entry_t *pdpe;
6902 	pd_entry_t oldpde, *pde;
6903 	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
6904 	vm_offset_t va, va_next;
6905 	vm_page_t m;
6906 	boolean_t anychanged;
6907 
6908 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
6909 		return;
6910 
6911 	/*
6912 	 * A/D bit emulation requires an alternate code path when clearing
6913 	 * the modified and accessed bits below. Since this function is
6914 	 * advisory in nature we skip it entirely for pmaps that require
6915 	 * A/D bit emulation.
6916 	 */
6917 	if (pmap_emulate_ad_bits(pmap))
6918 		return;
6919 
6920 	PG_A = pmap_accessed_bit(pmap);
6921 	PG_G = pmap_global_bit(pmap);
6922 	PG_M = pmap_modified_bit(pmap);
6923 	PG_V = pmap_valid_bit(pmap);
6924 	PG_RW = pmap_rw_bit(pmap);
6925 	anychanged = FALSE;
6926 	pmap_delayed_invl_started();
6927 	PMAP_LOCK(pmap);
6928 	for (; sva < eva; sva = va_next) {
6929 		pml4e = pmap_pml4e(pmap, sva);
6930 		if ((*pml4e & PG_V) == 0) {
6931 			va_next = (sva + NBPML4) & ~PML4MASK;
6932 			if (va_next < sva)
6933 				va_next = eva;
6934 			continue;
6935 		}
6936 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6937 		if ((*pdpe & PG_V) == 0) {
6938 			va_next = (sva + NBPDP) & ~PDPMASK;
6939 			if (va_next < sva)
6940 				va_next = eva;
6941 			continue;
6942 		}
6943 		va_next = (sva + NBPDR) & ~PDRMASK;
6944 		if (va_next < sva)
6945 			va_next = eva;
6946 		pde = pmap_pdpe_to_pde(pdpe, sva);
6947 		oldpde = *pde;
6948 		if ((oldpde & PG_V) == 0)
6949 			continue;
6950 		else if ((oldpde & PG_PS) != 0) {
6951 			if ((oldpde & PG_MANAGED) == 0)
6952 				continue;
6953 			lock = NULL;
6954 			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
6955 				if (lock != NULL)
6956 					rw_wunlock(lock);
6957 
6958 				/*
6959 				 * The large page mapping was destroyed.
6960 				 */
6961 				continue;
6962 			}
6963 
6964 			/*
6965 			 * Unless the page mappings are wired, remove the
6966 			 * mapping to a single page so that a subsequent
6967 			 * access may repromote.  Since the underlying page
6968 			 * table page is fully populated, this removal never
6969 			 * frees a page table page.
6970 			 */
6971 			if ((oldpde & PG_W) == 0) {
6972 				pte = pmap_pde_to_pte(pde, sva);
6973 				KASSERT((*pte & PG_V) != 0,
6974 				    ("pmap_advise: invalid PTE"));
6975 				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
6976 				    &lock);
6977 				anychanged = TRUE;
6978 			}
6979 			if (lock != NULL)
6980 				rw_wunlock(lock);
6981 		}
6982 		if (va_next > eva)
6983 			va_next = eva;
6984 		va = va_next;
6985 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6986 		    sva += PAGE_SIZE) {
6987 			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
6988 				goto maybe_invlrng;
6989 			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6990 				if (advice == MADV_DONTNEED) {
6991 					/*
6992 					 * Future calls to pmap_is_modified()
6993 					 * can be avoided by making the page
6994 					 * dirty now.
6995 					 */
6996 					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6997 					vm_page_dirty(m);
6998 				}
6999 				atomic_clear_long(pte, PG_M | PG_A);
7000 			} else if ((*pte & PG_A) != 0)
7001 				atomic_clear_long(pte, PG_A);
7002 			else
7003 				goto maybe_invlrng;
7004 
7005 			if ((*pte & PG_G) != 0) {
7006 				if (va == va_next)
7007 					va = sva;
7008 			} else
7009 				anychanged = TRUE;
7010 			continue;
7011 maybe_invlrng:
7012 			if (va != va_next) {
7013 				pmap_invalidate_range(pmap, va, sva);
7014 				va = va_next;
7015 			}
7016 		}
7017 		if (va != va_next)
7018 			pmap_invalidate_range(pmap, va, sva);
7019 	}
7020 	if (anychanged)
7021 		pmap_invalidate_all(pmap);
7022 	PMAP_UNLOCK(pmap);
7023 	pmap_delayed_invl_finished();
7024 }
7025 
7026 /*
7027  *	Clear the modify bits on the specified physical page.
7028  */
7029 void
7030 pmap_clear_modify(vm_page_t m)
7031 {
7032 	struct md_page *pvh;
7033 	pmap_t pmap;
7034 	pv_entry_t next_pv, pv;
7035 	pd_entry_t oldpde, *pde;
7036 	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
7037 	struct rwlock *lock;
7038 	vm_offset_t va;
7039 	int md_gen, pvh_gen;
7040 
7041 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7042 	    ("pmap_clear_modify: page %p is not managed", m));
7043 	VM_OBJECT_ASSERT_WLOCKED(m->object);
7044 	KASSERT(!vm_page_xbusied(m),
7045 	    ("pmap_clear_modify: page %p is exclusive busied", m));
7046 
7047 	/*
7048 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
7049 	 * If the object containing the page is locked and the page is not
7050 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
7051 	 */
7052 	if ((m->aflags & PGA_WRITEABLE) == 0)
7053 		return;
7054 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
7055 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
7056 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7057 	rw_wlock(lock);
7058 restart:
7059 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7060 		pmap = PV_PMAP(pv);
7061 		if (!PMAP_TRYLOCK(pmap)) {
7062 			pvh_gen = pvh->pv_gen;
7063 			rw_wunlock(lock);
7064 			PMAP_LOCK(pmap);
7065 			rw_wlock(lock);
7066 			if (pvh_gen != pvh->pv_gen) {
7067 				PMAP_UNLOCK(pmap);
7068 				goto restart;
7069 			}
7070 		}
7071 		PG_M = pmap_modified_bit(pmap);
7072 		PG_V = pmap_valid_bit(pmap);
7073 		PG_RW = pmap_rw_bit(pmap);
7074 		va = pv->pv_va;
7075 		pde = pmap_pde(pmap, va);
7076 		oldpde = *pde;
7077 		if ((oldpde & PG_RW) != 0) {
7078 			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
7079 				if ((oldpde & PG_W) == 0) {
7080 					/*
7081 					 * Write protect the mapping to a
7082 					 * single page so that a subsequent
7083 					 * write access may repromote.
7084 					 */
7085 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
7086 					    PG_PS_FRAME);
7087 					pte = pmap_pde_to_pte(pde, va);
7088 					oldpte = *pte;
7089 					if ((oldpte & PG_V) != 0) {
7090 						while (!atomic_cmpset_long(pte,
7091 						    oldpte,
7092 						    oldpte & ~(PG_M | PG_RW)))
7093 							oldpte = *pte;
7094 						vm_page_dirty(m);
7095 						pmap_invalidate_page(pmap, va);
7096 					}
7097 				}
7098 			}
7099 		}
7100 		PMAP_UNLOCK(pmap);
7101 	}
7102 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7103 		pmap = PV_PMAP(pv);
7104 		if (!PMAP_TRYLOCK(pmap)) {
7105 			md_gen = m->md.pv_gen;
7106 			pvh_gen = pvh->pv_gen;
7107 			rw_wunlock(lock);
7108 			PMAP_LOCK(pmap);
7109 			rw_wlock(lock);
7110 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7111 				PMAP_UNLOCK(pmap);
7112 				goto restart;
7113 			}
7114 		}
7115 		PG_M = pmap_modified_bit(pmap);
7116 		PG_RW = pmap_rw_bit(pmap);
7117 		pde = pmap_pde(pmap, pv->pv_va);
7118 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
7119 		    " a 2mpage in page %p's pv list", m));
7120 		pte = pmap_pde_to_pte(pde, pv->pv_va);
7121 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
7122 			atomic_clear_long(pte, PG_M);
7123 			pmap_invalidate_page(pmap, pv->pv_va);
7124 		}
7125 		PMAP_UNLOCK(pmap);
7126 	}
7127 	rw_wunlock(lock);
7128 }
7129 
7130 /*
7131  * Miscellaneous support routines follow
7132  */
7133 
7134 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
7135 static __inline void
7136 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
7137 {
7138 	u_int opte, npte;
7139 
7140 	/*
7141 	 * The cache mode bits are all in the low 32-bits of the
7142 	 * PTE, so we can just spin on updating the low 32-bits.
7143 	 */
7144 	do {
7145 		opte = *(u_int *)pte;
7146 		npte = opte & ~mask;
7147 		npte |= cache_bits;
7148 	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
7149 }
7150 
7151 /* Adjust the cache mode for a 2MB page mapped via a PDE. */
7152 static __inline void
7153 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
7154 {
7155 	u_int opde, npde;
7156 
7157 	/*
7158 	 * The cache mode bits are all in the low 32-bits of the
7159 	 * PDE, so we can just spin on updating the low 32-bits.
7160 	 */
7161 	do {
7162 		opde = *(u_int *)pde;
7163 		npde = opde & ~mask;
7164 		npde |= cache_bits;
7165 	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
7166 }
7167 
7168 /*
7169  * Map a set of physical memory pages into the kernel virtual
7170  * address space. Return a pointer to where it is mapped. This
7171  * routine is intended to be used for mapping device memory,
7172  * NOT real memory.
7173  */
7174 static void *
7175 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, bool noflush)
7176 {
7177 	struct pmap_preinit_mapping *ppim;
7178 	vm_offset_t va, offset;
7179 	vm_size_t tmpsize;
7180 	int i;
7181 
7182 	offset = pa & PAGE_MASK;
7183 	size = round_page(offset + size);
7184 	pa = trunc_page(pa);
7185 
7186 	if (!pmap_initialized) {
7187 		va = 0;
7188 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7189 			ppim = pmap_preinit_mapping + i;
7190 			if (ppim->va == 0) {
7191 				ppim->pa = pa;
7192 				ppim->sz = size;
7193 				ppim->mode = mode;
7194 				ppim->va = virtual_avail;
7195 				virtual_avail += size;
7196 				va = ppim->va;
7197 				break;
7198 			}
7199 		}
7200 		if (va == 0)
7201 			panic("%s: too many preinit mappings", __func__);
7202 	} else {
7203 		/*
7204 		 * If we have a preinit mapping, re-use it.
7205 		 */
7206 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7207 			ppim = pmap_preinit_mapping + i;
7208 			if (ppim->pa == pa && ppim->sz == size &&
7209 			    ppim->mode == mode)
7210 				return ((void *)(ppim->va + offset));
7211 		}
7212 		/*
7213 		 * If the specified range of physical addresses fits within
7214 		 * the direct map window, use the direct map.
7215 		 */
7216 		if (pa < dmaplimit && pa + size <= dmaplimit) {
7217 			va = PHYS_TO_DMAP(pa);
7218 			PMAP_LOCK(kernel_pmap);
7219 			i = pmap_change_attr_locked(va, size, mode, noflush);
7220 			PMAP_UNLOCK(kernel_pmap);
7221 			if (!i)
7222 				return ((void *)(va + offset));
7223 		}
7224 		va = kva_alloc(size);
7225 		if (va == 0)
7226 			panic("%s: Couldn't allocate KVA", __func__);
7227 	}
7228 	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
7229 		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
7230 	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
7231 	if (!noflush)
7232 		pmap_invalidate_cache_range(va, va + tmpsize);
7233 	return ((void *)(va + offset));
7234 }
7235 
7236 void *
7237 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
7238 {
7239 
7240 	return (pmap_mapdev_internal(pa, size, mode, false));
7241 }
7242 
7243 void *
7244 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
7245 {
7246 
7247 	return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, false));
7248 }
7249 
7250 void *
7251 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
7252 {
7253 
7254 	return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, true));
7255 }
7256 
7257 void *
7258 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7259 {
7260 
7261 	return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, false));
7262 }
7263 
7264 void
7265 pmap_unmapdev(vm_offset_t va, vm_size_t size)
7266 {
7267 	struct pmap_preinit_mapping *ppim;
7268 	vm_offset_t offset;
7269 	int i;
7270 
7271 	/* If we gave a direct map region in pmap_mapdev, do nothing */
7272 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
7273 		return;
7274 	offset = va & PAGE_MASK;
7275 	size = round_page(offset + size);
7276 	va = trunc_page(va);
7277 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7278 		ppim = pmap_preinit_mapping + i;
7279 		if (ppim->va == va && ppim->sz == size) {
7280 			if (pmap_initialized)
7281 				return;
7282 			ppim->pa = 0;
7283 			ppim->va = 0;
7284 			ppim->sz = 0;
7285 			ppim->mode = 0;
7286 			if (va + size == virtual_avail)
7287 				virtual_avail = va;
7288 			return;
7289 		}
7290 	}
7291 	if (pmap_initialized)
7292 		kva_free(va, size);
7293 }
7294 
7295 /*
7296  * Tries to demote a 1GB page mapping.
7297  */
7298 static boolean_t
7299 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
7300 {
7301 	pdp_entry_t newpdpe, oldpdpe;
7302 	pd_entry_t *firstpde, newpde, *pde;
7303 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
7304 	vm_paddr_t pdpgpa;
7305 	vm_page_t pdpg;
7306 
7307 	PG_A = pmap_accessed_bit(pmap);
7308 	PG_M = pmap_modified_bit(pmap);
7309 	PG_V = pmap_valid_bit(pmap);
7310 	PG_RW = pmap_rw_bit(pmap);
7311 
7312 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7313 	oldpdpe = *pdpe;
7314 	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
7315 	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
7316 	if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
7317 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
7318 		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
7319 		    " in pmap %p", va, pmap);
7320 		return (FALSE);
7321 	}
7322 	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
7323 	firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
7324 	newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
7325 	KASSERT((oldpdpe & PG_A) != 0,
7326 	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
7327 	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
7328 	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
7329 	newpde = oldpdpe;
7330 
7331 	/*
7332 	 * Initialize the page directory page.
7333 	 */
7334 	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
7335 		*pde = newpde;
7336 		newpde += NBPDR;
7337 	}
7338 
7339 	/*
7340 	 * Demote the mapping.
7341 	 */
7342 	*pdpe = newpdpe;
7343 
7344 	/*
7345 	 * Invalidate a stale recursive mapping of the page directory page.
7346 	 */
7347 	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
7348 
7349 	pmap_pdpe_demotions++;
7350 	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
7351 	    " in pmap %p", va, pmap);
7352 	return (TRUE);
7353 }
7354 
7355 /*
7356  * Sets the memory attribute for the specified page.
7357  */
7358 void
7359 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7360 {
7361 
7362 	m->md.pat_mode = ma;
7363 
7364 	/*
7365 	 * If "m" is a normal page, update its direct mapping.  This update
7366 	 * can be relied upon to perform any cache operations that are
7367 	 * required for data coherence.
7368 	 */
7369 	if ((m->flags & PG_FICTITIOUS) == 0 &&
7370 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7371 	    m->md.pat_mode))
7372 		panic("memory attribute change on the direct map failed");
7373 }
7374 
7375 /*
7376  * Changes the specified virtual address range's memory type to that given by
7377  * the parameter "mode".  The specified virtual address range must be
7378  * completely contained within either the direct map or the kernel map.  If
7379  * the virtual address range is contained within the kernel map, then the
7380  * memory type for each of the corresponding ranges of the direct map is also
7381  * changed.  (The corresponding ranges of the direct map are those ranges that
7382  * map the same physical pages as the specified virtual address range.)  These
7383  * changes to the direct map are necessary because Intel describes the
7384  * behavior of their processors as "undefined" if two or more mappings to the
7385  * same physical page have different memory types.
7386  *
7387  * Returns zero if the change completed successfully, and either EINVAL or
7388  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
7389  * of the virtual address range was not mapped, and ENOMEM is returned if
7390  * there was insufficient memory available to complete the change.  In the
7391  * latter case, the memory type may have been changed on some part of the
7392  * virtual address range or the direct map.
7393  */
7394 int
7395 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7396 {
7397 	int error;
7398 
7399 	PMAP_LOCK(kernel_pmap);
7400 	error = pmap_change_attr_locked(va, size, mode, false);
7401 	PMAP_UNLOCK(kernel_pmap);
7402 	return (error);
7403 }
7404 
7405 static int
7406 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool noflush)
7407 {
7408 	vm_offset_t base, offset, tmpva;
7409 	vm_paddr_t pa_start, pa_end, pa_end1;
7410 	pdp_entry_t *pdpe;
7411 	pd_entry_t *pde;
7412 	pt_entry_t *pte;
7413 	int cache_bits_pte, cache_bits_pde, error;
7414 	boolean_t changed;
7415 
7416 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7417 	base = trunc_page(va);
7418 	offset = va & PAGE_MASK;
7419 	size = round_page(offset + size);
7420 
7421 	/*
7422 	 * Only supported on kernel virtual addresses, including the direct
7423 	 * map but excluding the recursive map.
7424 	 */
7425 	if (base < DMAP_MIN_ADDRESS)
7426 		return (EINVAL);
7427 
7428 	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
7429 	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
7430 	changed = FALSE;
7431 
7432 	/*
7433 	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
7434 	 * into 4KB pages if required.
7435 	 */
7436 	for (tmpva = base; tmpva < base + size; ) {
7437 		pdpe = pmap_pdpe(kernel_pmap, tmpva);
7438 		if (pdpe == NULL || *pdpe == 0)
7439 			return (EINVAL);
7440 		if (*pdpe & PG_PS) {
7441 			/*
7442 			 * If the current 1GB page already has the required
7443 			 * memory type, then we need not demote this page. Just
7444 			 * increment tmpva to the next 1GB page frame.
7445 			 */
7446 			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
7447 				tmpva = trunc_1gpage(tmpva) + NBPDP;
7448 				continue;
7449 			}
7450 
7451 			/*
7452 			 * If the current offset aligns with a 1GB page frame
7453 			 * and there is at least 1GB left within the range, then
7454 			 * we need not break down this page into 2MB pages.
7455 			 */
7456 			if ((tmpva & PDPMASK) == 0 &&
7457 			    tmpva + PDPMASK < base + size) {
7458 				tmpva += NBPDP;
7459 				continue;
7460 			}
7461 			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
7462 				return (ENOMEM);
7463 		}
7464 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
7465 		if (*pde == 0)
7466 			return (EINVAL);
7467 		if (*pde & PG_PS) {
7468 			/*
7469 			 * If the current 2MB page already has the required
7470 			 * memory type, then we need not demote this page. Just
7471 			 * increment tmpva to the next 2MB page frame.
7472 			 */
7473 			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
7474 				tmpva = trunc_2mpage(tmpva) + NBPDR;
7475 				continue;
7476 			}
7477 
7478 			/*
7479 			 * If the current offset aligns with a 2MB page frame
7480 			 * and there is at least 2MB left within the range, then
7481 			 * we need not break down this page into 4KB pages.
7482 			 */
7483 			if ((tmpva & PDRMASK) == 0 &&
7484 			    tmpva + PDRMASK < base + size) {
7485 				tmpva += NBPDR;
7486 				continue;
7487 			}
7488 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
7489 				return (ENOMEM);
7490 		}
7491 		pte = pmap_pde_to_pte(pde, tmpva);
7492 		if (*pte == 0)
7493 			return (EINVAL);
7494 		tmpva += PAGE_SIZE;
7495 	}
7496 	error = 0;
7497 
7498 	/*
7499 	 * Ok, all the pages exist, so run through them updating their
7500 	 * cache mode if required.
7501 	 */
7502 	pa_start = pa_end = 0;
7503 	for (tmpva = base; tmpva < base + size; ) {
7504 		pdpe = pmap_pdpe(kernel_pmap, tmpva);
7505 		if (*pdpe & PG_PS) {
7506 			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
7507 				pmap_pde_attr(pdpe, cache_bits_pde,
7508 				    X86_PG_PDE_CACHE);
7509 				changed = TRUE;
7510 			}
7511 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7512 			    (*pdpe & PG_PS_FRAME) < dmaplimit) {
7513 				if (pa_start == pa_end) {
7514 					/* Start physical address run. */
7515 					pa_start = *pdpe & PG_PS_FRAME;
7516 					pa_end = pa_start + NBPDP;
7517 				} else if (pa_end == (*pdpe & PG_PS_FRAME))
7518 					pa_end += NBPDP;
7519 				else {
7520 					/* Run ended, update direct map. */
7521 					error = pmap_change_attr_locked(
7522 					    PHYS_TO_DMAP(pa_start),
7523 					    pa_end - pa_start, mode, noflush);
7524 					if (error != 0)
7525 						break;
7526 					/* Start physical address run. */
7527 					pa_start = *pdpe & PG_PS_FRAME;
7528 					pa_end = pa_start + NBPDP;
7529 				}
7530 			}
7531 			tmpva = trunc_1gpage(tmpva) + NBPDP;
7532 			continue;
7533 		}
7534 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
7535 		if (*pde & PG_PS) {
7536 			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
7537 				pmap_pde_attr(pde, cache_bits_pde,
7538 				    X86_PG_PDE_CACHE);
7539 				changed = TRUE;
7540 			}
7541 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7542 			    (*pde & PG_PS_FRAME) < dmaplimit) {
7543 				if (pa_start == pa_end) {
7544 					/* Start physical address run. */
7545 					pa_start = *pde & PG_PS_FRAME;
7546 					pa_end = pa_start + NBPDR;
7547 				} else if (pa_end == (*pde & PG_PS_FRAME))
7548 					pa_end += NBPDR;
7549 				else {
7550 					/* Run ended, update direct map. */
7551 					error = pmap_change_attr_locked(
7552 					    PHYS_TO_DMAP(pa_start),
7553 					    pa_end - pa_start, mode, noflush);
7554 					if (error != 0)
7555 						break;
7556 					/* Start physical address run. */
7557 					pa_start = *pde & PG_PS_FRAME;
7558 					pa_end = pa_start + NBPDR;
7559 				}
7560 			}
7561 			tmpva = trunc_2mpage(tmpva) + NBPDR;
7562 		} else {
7563 			pte = pmap_pde_to_pte(pde, tmpva);
7564 			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
7565 				pmap_pte_attr(pte, cache_bits_pte,
7566 				    X86_PG_PTE_CACHE);
7567 				changed = TRUE;
7568 			}
7569 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7570 			    (*pte & PG_FRAME) < dmaplimit) {
7571 				if (pa_start == pa_end) {
7572 					/* Start physical address run. */
7573 					pa_start = *pte & PG_FRAME;
7574 					pa_end = pa_start + PAGE_SIZE;
7575 				} else if (pa_end == (*pte & PG_FRAME))
7576 					pa_end += PAGE_SIZE;
7577 				else {
7578 					/* Run ended, update direct map. */
7579 					error = pmap_change_attr_locked(
7580 					    PHYS_TO_DMAP(pa_start),
7581 					    pa_end - pa_start, mode, noflush);
7582 					if (error != 0)
7583 						break;
7584 					/* Start physical address run. */
7585 					pa_start = *pte & PG_FRAME;
7586 					pa_end = pa_start + PAGE_SIZE;
7587 				}
7588 			}
7589 			tmpva += PAGE_SIZE;
7590 		}
7591 	}
7592 	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
7593 		pa_end1 = MIN(pa_end, dmaplimit);
7594 		if (pa_start != pa_end1)
7595 			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
7596 			    pa_end1 - pa_start, mode, noflush);
7597 	}
7598 
7599 	/*
7600 	 * Flush CPU caches if required to make sure any data isn't cached that
7601 	 * shouldn't be, etc.
7602 	 */
7603 	if (changed) {
7604 		pmap_invalidate_range(kernel_pmap, base, tmpva);
7605 		if (!noflush)
7606 			pmap_invalidate_cache_range(base, tmpva);
7607 	}
7608 	return (error);
7609 }
7610 
7611 /*
7612  * Demotes any mapping within the direct map region that covers more than the
7613  * specified range of physical addresses.  This range's size must be a power
7614  * of two and its starting address must be a multiple of its size.  Since the
7615  * demotion does not change any attributes of the mapping, a TLB invalidation
7616  * is not mandatory.  The caller may, however, request a TLB invalidation.
7617  */
7618 void
7619 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
7620 {
7621 	pdp_entry_t *pdpe;
7622 	pd_entry_t *pde;
7623 	vm_offset_t va;
7624 	boolean_t changed;
7625 
7626 	if (len == 0)
7627 		return;
7628 	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
7629 	KASSERT((base & (len - 1)) == 0,
7630 	    ("pmap_demote_DMAP: base is not a multiple of len"));
7631 	if (len < NBPDP && base < dmaplimit) {
7632 		va = PHYS_TO_DMAP(base);
7633 		changed = FALSE;
7634 		PMAP_LOCK(kernel_pmap);
7635 		pdpe = pmap_pdpe(kernel_pmap, va);
7636 		if ((*pdpe & X86_PG_V) == 0)
7637 			panic("pmap_demote_DMAP: invalid PDPE");
7638 		if ((*pdpe & PG_PS) != 0) {
7639 			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
7640 				panic("pmap_demote_DMAP: PDPE failed");
7641 			changed = TRUE;
7642 		}
7643 		if (len < NBPDR) {
7644 			pde = pmap_pdpe_to_pde(pdpe, va);
7645 			if ((*pde & X86_PG_V) == 0)
7646 				panic("pmap_demote_DMAP: invalid PDE");
7647 			if ((*pde & PG_PS) != 0) {
7648 				if (!pmap_demote_pde(kernel_pmap, pde, va))
7649 					panic("pmap_demote_DMAP: PDE failed");
7650 				changed = TRUE;
7651 			}
7652 		}
7653 		if (changed && invalidate)
7654 			pmap_invalidate_page(kernel_pmap, va);
7655 		PMAP_UNLOCK(kernel_pmap);
7656 	}
7657 }
7658 
7659 /*
7660  * perform the pmap work for mincore
7661  */
7662 int
7663 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
7664 {
7665 	pd_entry_t *pdep;
7666 	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
7667 	vm_paddr_t pa;
7668 	int val;
7669 
7670 	PG_A = pmap_accessed_bit(pmap);
7671 	PG_M = pmap_modified_bit(pmap);
7672 	PG_V = pmap_valid_bit(pmap);
7673 	PG_RW = pmap_rw_bit(pmap);
7674 
7675 	PMAP_LOCK(pmap);
7676 retry:
7677 	pdep = pmap_pde(pmap, addr);
7678 	if (pdep != NULL && (*pdep & PG_V)) {
7679 		if (*pdep & PG_PS) {
7680 			pte = *pdep;
7681 			/* Compute the physical address of the 4KB page. */
7682 			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
7683 			    PG_FRAME;
7684 			val = MINCORE_SUPER;
7685 		} else {
7686 			pte = *pmap_pde_to_pte(pdep, addr);
7687 			pa = pte & PG_FRAME;
7688 			val = 0;
7689 		}
7690 	} else {
7691 		pte = 0;
7692 		pa = 0;
7693 		val = 0;
7694 	}
7695 	if ((pte & PG_V) != 0) {
7696 		val |= MINCORE_INCORE;
7697 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
7698 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
7699 		if ((pte & PG_A) != 0)
7700 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
7701 	}
7702 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
7703 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
7704 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
7705 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
7706 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
7707 			goto retry;
7708 	} else
7709 		PA_UNLOCK_COND(*locked_pa);
7710 	PMAP_UNLOCK(pmap);
7711 	return (val);
7712 }
7713 
7714 static uint64_t
7715 pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
7716 {
7717 	uint32_t gen, new_gen, pcid_next;
7718 
7719 	CRITICAL_ASSERT(curthread);
7720 	gen = PCPU_GET(pcid_gen);
7721 	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN)
7722 		return (pti ? 0 : CR3_PCID_SAVE);
7723 	if (pmap->pm_pcids[cpuid].pm_gen == gen)
7724 		return (CR3_PCID_SAVE);
7725 	pcid_next = PCPU_GET(pcid_next);
7726 	KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
7727 	    (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
7728 	    ("cpu %d pcid_next %#x", cpuid, pcid_next));
7729 	if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
7730 	    (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
7731 		new_gen = gen + 1;
7732 		if (new_gen == 0)
7733 			new_gen = 1;
7734 		PCPU_SET(pcid_gen, new_gen);
7735 		pcid_next = PMAP_PCID_KERN + 1;
7736 	} else {
7737 		new_gen = gen;
7738 	}
7739 	pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
7740 	pmap->pm_pcids[cpuid].pm_gen = new_gen;
7741 	PCPU_SET(pcid_next, pcid_next + 1);
7742 	return (0);
7743 }
7744 
7745 static uint64_t
7746 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
7747 {
7748 	uint64_t cached;
7749 
7750 	cached = pmap_pcid_alloc(pmap, cpuid);
7751 	KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
7752 	    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
7753 	    pmap->pm_pcids[cpuid].pm_pcid));
7754 	KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
7755 	    pmap == kernel_pmap,
7756 	    ("non-kernel pmap pmap %p cpu %d pcid %#x",
7757 	    pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
7758 	return (cached);
7759 }
7760 
7761 static void
7762 pmap_activate_sw_pti_post(pmap_t pmap)
7763 {
7764 
7765 	if (pmap->pm_ucr3 != PMAP_NO_CR3)
7766 		PCPU_GET(tssp)->tss_rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) +
7767 		    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful;
7768 }
7769 
7770 static void inline
7771 pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
7772 {
7773 	struct invpcid_descr d;
7774 	uint64_t cached, cr3, kcr3, ucr3;
7775 
7776 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
7777 	cr3 = rcr3();
7778 	if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
7779 		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
7780 	PCPU_SET(curpmap, pmap);
7781 	kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
7782 	ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
7783 	    PMAP_PCID_USER_PT;
7784 
7785 	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
7786 		/*
7787 		 * Explicitly invalidate translations cached from the
7788 		 * user page table.  They are not automatically
7789 		 * flushed by reload of cr3 with the kernel page table
7790 		 * pointer above.
7791 		 *
7792 		 * Note that the if() condition is resolved statically
7793 		 * by using the function argument instead of
7794 		 * runtime-evaluated invpcid_works value.
7795 		 */
7796 		if (invpcid_works1) {
7797 			d.pcid = PMAP_PCID_USER_PT |
7798 			    pmap->pm_pcids[cpuid].pm_pcid;
7799 			d.pad = 0;
7800 			d.addr = 0;
7801 			invpcid(&d, INVPCID_CTX);
7802 		} else {
7803 			pmap_pti_pcid_invalidate(ucr3, kcr3);
7804 		}
7805 	}
7806 
7807 	PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
7808 	PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
7809 	if (cached)
7810 		PCPU_INC(pm_save_cnt);
7811 }
7812 
7813 static void
7814 pmap_activate_sw_pcid_invpcid_pti(pmap_t pmap, u_int cpuid)
7815 {
7816 
7817 	pmap_activate_sw_pcid_pti(pmap, cpuid, true);
7818 	pmap_activate_sw_pti_post(pmap);
7819 }
7820 
7821 static void
7822 pmap_activate_sw_pcid_noinvpcid_pti(pmap_t pmap, u_int cpuid)
7823 {
7824 	register_t rflags;
7825 
7826 	/*
7827 	 * If the INVPCID instruction is not available,
7828 	 * invltlb_pcid_handler() is used to handle an invalidate_all
7829 	 * IPI, which checks for curpmap == smp_tlb_pmap.  The below
7830 	 * sequence of operations has a window where %CR3 is loaded
7831 	 * with the new pmap's PML4 address, but the curpmap value has
7832 	 * not yet been updated.  This causes the invltlb IPI handler,
7833 	 * which is called between the updates, to execute as a NOP,
7834 	 * which leaves stale TLB entries.
7835 	 *
7836 	 * Note that the most typical use of pmap_activate_sw(), from
7837 	 * the context switch, is immune to this race, because
7838 	 * interrupts are disabled (while the thread lock is owned),
7839 	 * and the IPI happens after curpmap is updated.  Protect
7840 	 * other callers in a similar way, by disabling interrupts
7841 	 * around the %cr3 register reload and curpmap assignment.
7842 	 */
7843 	rflags = intr_disable();
7844 	pmap_activate_sw_pcid_pti(pmap, cpuid, false);
7845 	intr_restore(rflags);
7846 	pmap_activate_sw_pti_post(pmap);
7847 }
7848 
7849 static void
7850 pmap_activate_sw_pcid_nopti(pmap_t pmap, u_int cpuid)
7851 {
7852 	uint64_t cached, cr3;
7853 
7854 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
7855 	cr3 = rcr3();
7856 	if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
7857 		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
7858 		    cached);
7859 	PCPU_SET(curpmap, pmap);
7860 	if (cached)
7861 		PCPU_INC(pm_save_cnt);
7862 }
7863 
7864 static void
7865 pmap_activate_sw_pcid_noinvpcid_nopti(pmap_t pmap, u_int cpuid)
7866 {
7867 	register_t rflags;
7868 
7869 	rflags = intr_disable();
7870 	pmap_activate_sw_pcid_nopti(pmap, cpuid);
7871 	intr_restore(rflags);
7872 }
7873 
7874 static void
7875 pmap_activate_sw_nopcid_nopti(pmap_t pmap, u_int cpuid __unused)
7876 {
7877 
7878 	load_cr3(pmap->pm_cr3);
7879 	PCPU_SET(curpmap, pmap);
7880 }
7881 
7882 static void
7883 pmap_activate_sw_nopcid_pti(pmap_t pmap, u_int cpuid __unused)
7884 {
7885 
7886 	pmap_activate_sw_nopcid_nopti(pmap, cpuid);
7887 	PCPU_SET(kcr3, pmap->pm_cr3);
7888 	PCPU_SET(ucr3, pmap->pm_ucr3);
7889 	pmap_activate_sw_pti_post(pmap);
7890 }
7891 
7892 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (pmap_t, u_int), static)
7893 {
7894 
7895 	if (pmap_pcid_enabled && pti && invpcid_works)
7896 		return (pmap_activate_sw_pcid_invpcid_pti);
7897 	else if (pmap_pcid_enabled && pti && !invpcid_works)
7898 		return (pmap_activate_sw_pcid_noinvpcid_pti);
7899 	else if (pmap_pcid_enabled && !pti && invpcid_works)
7900 		return (pmap_activate_sw_pcid_nopti);
7901 	else if (pmap_pcid_enabled && !pti && !invpcid_works)
7902 		return (pmap_activate_sw_pcid_noinvpcid_nopti);
7903 	else if (!pmap_pcid_enabled && pti)
7904 		return (pmap_activate_sw_nopcid_pti);
7905 	else /* if (!pmap_pcid_enabled && !pti) */
7906 		return (pmap_activate_sw_nopcid_nopti);
7907 }
7908 
7909 void
7910 pmap_activate_sw(struct thread *td)
7911 {
7912 	pmap_t oldpmap, pmap;
7913 	u_int cpuid;
7914 
7915 	oldpmap = PCPU_GET(curpmap);
7916 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
7917 	if (oldpmap == pmap)
7918 		return;
7919 	cpuid = PCPU_GET(cpuid);
7920 #ifdef SMP
7921 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7922 #else
7923 	CPU_SET(cpuid, &pmap->pm_active);
7924 #endif
7925 	pmap_activate_sw_mode(pmap, cpuid);
7926 #ifdef SMP
7927 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
7928 #else
7929 	CPU_CLR(cpuid, &oldpmap->pm_active);
7930 #endif
7931 }
7932 
7933 void
7934 pmap_activate(struct thread *td)
7935 {
7936 
7937 	critical_enter();
7938 	pmap_activate_sw(td);
7939 	critical_exit();
7940 }
7941 
7942 void
7943 pmap_activate_boot(pmap_t pmap)
7944 {
7945 	uint64_t kcr3;
7946 	u_int cpuid;
7947 
7948 	/*
7949 	 * kernel_pmap must be never deactivated, and we ensure that
7950 	 * by never activating it at all.
7951 	 */
7952 	MPASS(pmap != kernel_pmap);
7953 
7954 	cpuid = PCPU_GET(cpuid);
7955 #ifdef SMP
7956 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7957 #else
7958 	CPU_SET(cpuid, &pmap->pm_active);
7959 #endif
7960 	PCPU_SET(curpmap, pmap);
7961 	if (pti) {
7962 		kcr3 = pmap->pm_cr3;
7963 		if (pmap_pcid_enabled)
7964 			kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE;
7965 	} else {
7966 		kcr3 = PMAP_NO_CR3;
7967 	}
7968 	PCPU_SET(kcr3, kcr3);
7969 	PCPU_SET(ucr3, PMAP_NO_CR3);
7970 }
7971 
7972 void
7973 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
7974 {
7975 }
7976 
7977 /*
7978  *	Increase the starting virtual address of the given mapping if a
7979  *	different alignment might result in more superpage mappings.
7980  */
7981 void
7982 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7983     vm_offset_t *addr, vm_size_t size)
7984 {
7985 	vm_offset_t superpage_offset;
7986 
7987 	if (size < NBPDR)
7988 		return;
7989 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7990 		offset += ptoa(object->pg_color);
7991 	superpage_offset = offset & PDRMASK;
7992 	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
7993 	    (*addr & PDRMASK) == superpage_offset)
7994 		return;
7995 	if ((*addr & PDRMASK) < superpage_offset)
7996 		*addr = (*addr & ~PDRMASK) + superpage_offset;
7997 	else
7998 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
7999 }
8000 
8001 #ifdef INVARIANTS
8002 static unsigned long num_dirty_emulations;
8003 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
8004 	     &num_dirty_emulations, 0, NULL);
8005 
8006 static unsigned long num_accessed_emulations;
8007 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
8008 	     &num_accessed_emulations, 0, NULL);
8009 
8010 static unsigned long num_superpage_accessed_emulations;
8011 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
8012 	     &num_superpage_accessed_emulations, 0, NULL);
8013 
8014 static unsigned long ad_emulation_superpage_promotions;
8015 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
8016 	     &ad_emulation_superpage_promotions, 0, NULL);
8017 #endif	/* INVARIANTS */
8018 
8019 int
8020 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
8021 {
8022 	int rv;
8023 	struct rwlock *lock;
8024 #if VM_NRESERVLEVEL > 0
8025 	vm_page_t m, mpte;
8026 #endif
8027 	pd_entry_t *pde;
8028 	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
8029 
8030 	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
8031 	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
8032 
8033 	if (!pmap_emulate_ad_bits(pmap))
8034 		return (-1);
8035 
8036 	PG_A = pmap_accessed_bit(pmap);
8037 	PG_M = pmap_modified_bit(pmap);
8038 	PG_V = pmap_valid_bit(pmap);
8039 	PG_RW = pmap_rw_bit(pmap);
8040 
8041 	rv = -1;
8042 	lock = NULL;
8043 	PMAP_LOCK(pmap);
8044 
8045 	pde = pmap_pde(pmap, va);
8046 	if (pde == NULL || (*pde & PG_V) == 0)
8047 		goto done;
8048 
8049 	if ((*pde & PG_PS) != 0) {
8050 		if (ftype == VM_PROT_READ) {
8051 #ifdef INVARIANTS
8052 			atomic_add_long(&num_superpage_accessed_emulations, 1);
8053 #endif
8054 			*pde |= PG_A;
8055 			rv = 0;
8056 		}
8057 		goto done;
8058 	}
8059 
8060 	pte = pmap_pde_to_pte(pde, va);
8061 	if ((*pte & PG_V) == 0)
8062 		goto done;
8063 
8064 	if (ftype == VM_PROT_WRITE) {
8065 		if ((*pte & PG_RW) == 0)
8066 			goto done;
8067 		/*
8068 		 * Set the modified and accessed bits simultaneously.
8069 		 *
8070 		 * Intel EPT PTEs that do software emulation of A/D bits map
8071 		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
8072 		 * An EPT misconfiguration is triggered if the PTE is writable
8073 		 * but not readable (WR=10). This is avoided by setting PG_A
8074 		 * and PG_M simultaneously.
8075 		 */
8076 		*pte |= PG_M | PG_A;
8077 	} else {
8078 		*pte |= PG_A;
8079 	}
8080 
8081 #if VM_NRESERVLEVEL > 0
8082 	/* try to promote the mapping */
8083 	if (va < VM_MAXUSER_ADDRESS)
8084 		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
8085 	else
8086 		mpte = NULL;
8087 
8088 	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
8089 
8090 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
8091 	    pmap_ps_enabled(pmap) &&
8092 	    (m->flags & PG_FICTITIOUS) == 0 &&
8093 	    vm_reserv_level_iffullpop(m) == 0) {
8094 		pmap_promote_pde(pmap, pde, va, &lock);
8095 #ifdef INVARIANTS
8096 		atomic_add_long(&ad_emulation_superpage_promotions, 1);
8097 #endif
8098 	}
8099 #endif
8100 
8101 #ifdef INVARIANTS
8102 	if (ftype == VM_PROT_WRITE)
8103 		atomic_add_long(&num_dirty_emulations, 1);
8104 	else
8105 		atomic_add_long(&num_accessed_emulations, 1);
8106 #endif
8107 	rv = 0;		/* success */
8108 done:
8109 	if (lock != NULL)
8110 		rw_wunlock(lock);
8111 	PMAP_UNLOCK(pmap);
8112 	return (rv);
8113 }
8114 
8115 void
8116 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
8117 {
8118 	pml4_entry_t *pml4;
8119 	pdp_entry_t *pdp;
8120 	pd_entry_t *pde;
8121 	pt_entry_t *pte, PG_V;
8122 	int idx;
8123 
8124 	idx = 0;
8125 	PG_V = pmap_valid_bit(pmap);
8126 	PMAP_LOCK(pmap);
8127 
8128 	pml4 = pmap_pml4e(pmap, va);
8129 	ptr[idx++] = *pml4;
8130 	if ((*pml4 & PG_V) == 0)
8131 		goto done;
8132 
8133 	pdp = pmap_pml4e_to_pdpe(pml4, va);
8134 	ptr[idx++] = *pdp;
8135 	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
8136 		goto done;
8137 
8138 	pde = pmap_pdpe_to_pde(pdp, va);
8139 	ptr[idx++] = *pde;
8140 	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
8141 		goto done;
8142 
8143 	pte = pmap_pde_to_pte(pde, va);
8144 	ptr[idx++] = *pte;
8145 
8146 done:
8147 	PMAP_UNLOCK(pmap);
8148 	*num = idx;
8149 }
8150 
8151 /**
8152  * Get the kernel virtual address of a set of physical pages. If there are
8153  * physical addresses not covered by the DMAP perform a transient mapping
8154  * that will be removed when calling pmap_unmap_io_transient.
8155  *
8156  * \param page        The pages the caller wishes to obtain the virtual
8157  *                    address on the kernel memory map.
8158  * \param vaddr       On return contains the kernel virtual memory address
8159  *                    of the pages passed in the page parameter.
8160  * \param count       Number of pages passed in.
8161  * \param can_fault   TRUE if the thread using the mapped pages can take
8162  *                    page faults, FALSE otherwise.
8163  *
8164  * \returns TRUE if the caller must call pmap_unmap_io_transient when
8165  *          finished or FALSE otherwise.
8166  *
8167  */
8168 boolean_t
8169 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
8170     boolean_t can_fault)
8171 {
8172 	vm_paddr_t paddr;
8173 	boolean_t needs_mapping;
8174 	pt_entry_t *pte;
8175 	int cache_bits, error __unused, i;
8176 
8177 	/*
8178 	 * Allocate any KVA space that we need, this is done in a separate
8179 	 * loop to prevent calling vmem_alloc while pinned.
8180 	 */
8181 	needs_mapping = FALSE;
8182 	for (i = 0; i < count; i++) {
8183 		paddr = VM_PAGE_TO_PHYS(page[i]);
8184 		if (__predict_false(paddr >= dmaplimit)) {
8185 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
8186 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
8187 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
8188 			needs_mapping = TRUE;
8189 		} else {
8190 			vaddr[i] = PHYS_TO_DMAP(paddr);
8191 		}
8192 	}
8193 
8194 	/* Exit early if everything is covered by the DMAP */
8195 	if (!needs_mapping)
8196 		return (FALSE);
8197 
8198 	/*
8199 	 * NB:  The sequence of updating a page table followed by accesses
8200 	 * to the corresponding pages used in the !DMAP case is subject to
8201 	 * the situation described in the "AMD64 Architecture Programmer's
8202 	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
8203 	 * Coherency Considerations".  Therefore, issuing the INVLPG right
8204 	 * after modifying the PTE bits is crucial.
8205 	 */
8206 	if (!can_fault)
8207 		sched_pin();
8208 	for (i = 0; i < count; i++) {
8209 		paddr = VM_PAGE_TO_PHYS(page[i]);
8210 		if (paddr >= dmaplimit) {
8211 			if (can_fault) {
8212 				/*
8213 				 * Slow path, since we can get page faults
8214 				 * while mappings are active don't pin the
8215 				 * thread to the CPU and instead add a global
8216 				 * mapping visible to all CPUs.
8217 				 */
8218 				pmap_qenter(vaddr[i], &page[i], 1);
8219 			} else {
8220 				pte = vtopte(vaddr[i]);
8221 				cache_bits = pmap_cache_bits(kernel_pmap,
8222 				    page[i]->md.pat_mode, 0);
8223 				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
8224 				    cache_bits);
8225 				invlpg(vaddr[i]);
8226 			}
8227 		}
8228 	}
8229 
8230 	return (needs_mapping);
8231 }
8232 
8233 void
8234 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
8235     boolean_t can_fault)
8236 {
8237 	vm_paddr_t paddr;
8238 	int i;
8239 
8240 	if (!can_fault)
8241 		sched_unpin();
8242 	for (i = 0; i < count; i++) {
8243 		paddr = VM_PAGE_TO_PHYS(page[i]);
8244 		if (paddr >= dmaplimit) {
8245 			if (can_fault)
8246 				pmap_qremove(vaddr[i], 1);
8247 			vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
8248 		}
8249 	}
8250 }
8251 
8252 vm_offset_t
8253 pmap_quick_enter_page(vm_page_t m)
8254 {
8255 	vm_paddr_t paddr;
8256 
8257 	paddr = VM_PAGE_TO_PHYS(m);
8258 	if (paddr < dmaplimit)
8259 		return (PHYS_TO_DMAP(paddr));
8260 	mtx_lock_spin(&qframe_mtx);
8261 	KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
8262 	pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
8263 	    X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
8264 	return (qframe);
8265 }
8266 
8267 void
8268 pmap_quick_remove_page(vm_offset_t addr)
8269 {
8270 
8271 	if (addr != qframe)
8272 		return;
8273 	pte_store(vtopte(qframe), 0);
8274 	invlpg(qframe);
8275 	mtx_unlock_spin(&qframe_mtx);
8276 }
8277 
8278 /*
8279  * Pdp pages from the large map are managed differently from either
8280  * kernel or user page table pages.  They are permanently allocated at
8281  * initialization time, and their wire count is permanently set to
8282  * zero.  The pml4 entries pointing to those pages are copied into
8283  * each allocated pmap.
8284  *
8285  * In contrast, pd and pt pages are managed like user page table
8286  * pages.  They are dynamically allocated, and their wire count
8287  * represents the number of valid entries within the page.
8288  */
8289 static vm_page_t
8290 pmap_large_map_getptp_unlocked(void)
8291 {
8292 	vm_page_t m;
8293 
8294 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
8295 	    VM_ALLOC_ZERO);
8296 	if (m != NULL && (m->flags & PG_ZERO) == 0)
8297 		pmap_zero_page(m);
8298 	return (m);
8299 }
8300 
8301 static vm_page_t
8302 pmap_large_map_getptp(void)
8303 {
8304 	vm_page_t m;
8305 
8306 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
8307 	m = pmap_large_map_getptp_unlocked();
8308 	if (m == NULL) {
8309 		PMAP_UNLOCK(kernel_pmap);
8310 		vm_wait(NULL);
8311 		PMAP_LOCK(kernel_pmap);
8312 		/* Callers retry. */
8313 	}
8314 	return (m);
8315 }
8316 
8317 static pdp_entry_t *
8318 pmap_large_map_pdpe(vm_offset_t va)
8319 {
8320 	vm_pindex_t pml4_idx;
8321 	vm_paddr_t mphys;
8322 
8323 	pml4_idx = pmap_pml4e_index(va);
8324 	KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
8325 	    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
8326 	    "%#jx lm_ents %d",
8327 	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
8328 	KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
8329 	    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
8330 	    "LMSPML4I %#jx lm_ents %d",
8331 	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
8332 	mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
8333 	return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
8334 }
8335 
8336 static pd_entry_t *
8337 pmap_large_map_pde(vm_offset_t va)
8338 {
8339 	pdp_entry_t *pdpe;
8340 	vm_page_t m;
8341 	vm_paddr_t mphys;
8342 
8343 retry:
8344 	pdpe = pmap_large_map_pdpe(va);
8345 	if (*pdpe == 0) {
8346 		m = pmap_large_map_getptp();
8347 		if (m == NULL)
8348 			goto retry;
8349 		mphys = VM_PAGE_TO_PHYS(m);
8350 		*pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
8351 	} else {
8352 		MPASS((*pdpe & X86_PG_PS) == 0);
8353 		mphys = *pdpe & PG_FRAME;
8354 	}
8355 	return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va));
8356 }
8357 
8358 static pt_entry_t *
8359 pmap_large_map_pte(vm_offset_t va)
8360 {
8361 	pd_entry_t *pde;
8362 	vm_page_t m;
8363 	vm_paddr_t mphys;
8364 
8365 retry:
8366 	pde = pmap_large_map_pde(va);
8367 	if (*pde == 0) {
8368 		m = pmap_large_map_getptp();
8369 		if (m == NULL)
8370 			goto retry;
8371 		mphys = VM_PAGE_TO_PHYS(m);
8372 		*pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
8373 		PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++;
8374 	} else {
8375 		MPASS((*pde & X86_PG_PS) == 0);
8376 		mphys = *pde & PG_FRAME;
8377 	}
8378 	return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va));
8379 }
8380 
8381 static int
8382 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
8383     vmem_addr_t *vmem_res)
8384 {
8385 
8386 	/*
8387 	 * Large mappings are all but static.  Consequently, there
8388 	 * is no point in waiting for an earlier allocation to be
8389 	 * freed.
8390 	 */
8391 	return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
8392 	    VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res));
8393 }
8394 
8395 int
8396 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
8397     vm_memattr_t mattr)
8398 {
8399 	pdp_entry_t *pdpe;
8400 	pd_entry_t *pde;
8401 	pt_entry_t *pte;
8402 	vm_offset_t va, inc;
8403 	vmem_addr_t vmem_res;
8404 	vm_paddr_t pa;
8405 	int error;
8406 
8407 	if (len == 0 || spa + len < spa)
8408 		return (EINVAL);
8409 
8410 	/* See if DMAP can serve. */
8411 	if (spa + len <= dmaplimit) {
8412 		va = PHYS_TO_DMAP(spa);
8413 		*addr = (void *)va;
8414 		return (pmap_change_attr(va, len, mattr));
8415 	}
8416 
8417 	/*
8418 	 * No, allocate KVA.  Fit the address with best possible
8419 	 * alignment for superpages.  Fall back to worse align if
8420 	 * failed.
8421 	 */
8422 	error = ENOMEM;
8423 	if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
8424 	    NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
8425 		error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
8426 		    &vmem_res);
8427 	if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
8428 	    NBPDR) + NBPDR)
8429 		error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
8430 		    &vmem_res);
8431 	if (error != 0)
8432 		error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
8433 	if (error != 0)
8434 		return (error);
8435 
8436 	/*
8437 	 * Fill pagetable.  PG_M is not pre-set, we scan modified bits
8438 	 * in the pagetable to minimize flushing.  No need to
8439 	 * invalidate TLB, since we only update invalid entries.
8440 	 */
8441 	PMAP_LOCK(kernel_pmap);
8442 	for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
8443 	    len -= inc) {
8444 		if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
8445 		    (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
8446 			pdpe = pmap_large_map_pdpe(va);
8447 			MPASS(*pdpe == 0);
8448 			*pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
8449 			    X86_PG_V | X86_PG_A | pg_nx |
8450 			    pmap_cache_bits(kernel_pmap, mattr, TRUE);
8451 			inc = NBPDP;
8452 		} else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
8453 		    (va & PDRMASK) == 0) {
8454 			pde = pmap_large_map_pde(va);
8455 			MPASS(*pde == 0);
8456 			*pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
8457 			    X86_PG_V | X86_PG_A | pg_nx |
8458 			    pmap_cache_bits(kernel_pmap, mattr, TRUE);
8459 			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
8460 			    wire_count++;
8461 			inc = NBPDR;
8462 		} else {
8463 			pte = pmap_large_map_pte(va);
8464 			MPASS(*pte == 0);
8465 			*pte = pa | pg_g | X86_PG_RW | X86_PG_V |
8466 			    X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
8467 			    mattr, FALSE);
8468 			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
8469 			    wire_count++;
8470 			inc = PAGE_SIZE;
8471 		}
8472 	}
8473 	PMAP_UNLOCK(kernel_pmap);
8474 	MPASS(len == 0);
8475 
8476 	*addr = (void *)vmem_res;
8477 	return (0);
8478 }
8479 
8480 void
8481 pmap_large_unmap(void *svaa, vm_size_t len)
8482 {
8483 	vm_offset_t sva, va;
8484 	vm_size_t inc;
8485 	pdp_entry_t *pdpe, pdp;
8486 	pd_entry_t *pde, pd;
8487 	pt_entry_t *pte;
8488 	vm_page_t m;
8489 	struct spglist spgf;
8490 
8491 	sva = (vm_offset_t)svaa;
8492 	if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
8493 	    sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
8494 		return;
8495 
8496 	SLIST_INIT(&spgf);
8497 	KASSERT(LARGEMAP_MIN_ADDRESS <= sva && sva + len <=
8498 	    LARGEMAP_MAX_ADDRESS + NBPML4 * (u_long)lm_ents,
8499 	    ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
8500 	PMAP_LOCK(kernel_pmap);
8501 	for (va = sva; va < sva + len; va += inc) {
8502 		pdpe = pmap_large_map_pdpe(va);
8503 		pdp = *pdpe;
8504 		KASSERT((pdp & X86_PG_V) != 0,
8505 		    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
8506 		    (u_long)pdpe, pdp));
8507 		if ((pdp & X86_PG_PS) != 0) {
8508 			KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
8509 			    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
8510 			    (u_long)pdpe, pdp));
8511 			KASSERT((va & PDPMASK) == 0,
8512 			    ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
8513 			    (u_long)pdpe, pdp));
8514 			KASSERT(va + NBPDP <= sva + len,
8515 			    ("unmap covers partial 1GB page, sva %#lx va %#lx "
8516 			    "pdpe %#lx pdp %#lx len %#lx", sva, va,
8517 			    (u_long)pdpe, pdp, len));
8518 			*pdpe = 0;
8519 			inc = NBPDP;
8520 			continue;
8521 		}
8522 		pde = pmap_pdpe_to_pde(pdpe, va);
8523 		pd = *pde;
8524 		KASSERT((pd & X86_PG_V) != 0,
8525 		    ("invalid pd va %#lx pde %#lx pd %#lx", va,
8526 		    (u_long)pde, pd));
8527 		if ((pd & X86_PG_PS) != 0) {
8528 			KASSERT((va & PDRMASK) == 0,
8529 			    ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
8530 			    (u_long)pde, pd));
8531 			KASSERT(va + NBPDR <= sva + len,
8532 			    ("unmap covers partial 2MB page, sva %#lx va %#lx "
8533 			    "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde,
8534 			    pd, len));
8535 			pde_store(pde, 0);
8536 			inc = NBPDR;
8537 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
8538 			m->wire_count--;
8539 			if (m->wire_count == 0) {
8540 				*pdpe = 0;
8541 				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
8542 			}
8543 			continue;
8544 		}
8545 		pte = pmap_pde_to_pte(pde, va);
8546 		KASSERT((*pte & X86_PG_V) != 0,
8547 		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
8548 		    (u_long)pte, *pte));
8549 		pte_clear(pte);
8550 		inc = PAGE_SIZE;
8551 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
8552 		m->wire_count--;
8553 		if (m->wire_count == 0) {
8554 			*pde = 0;
8555 			SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
8556 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
8557 			m->wire_count--;
8558 			if (m->wire_count == 0) {
8559 				*pdpe = 0;
8560 				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
8561 			}
8562 		}
8563 	}
8564 	pmap_invalidate_range(kernel_pmap, sva, sva + len);
8565 	PMAP_UNLOCK(kernel_pmap);
8566 	vm_page_free_pages_toq(&spgf, false);
8567 	vmem_free(large_vmem, sva, len);
8568 }
8569 
8570 static void
8571 pmap_large_map_wb_fence_mfence(void)
8572 {
8573 
8574 	mfence();
8575 }
8576 
8577 static void
8578 pmap_large_map_wb_fence_sfence(void)
8579 {
8580 
8581 	sfence();
8582 }
8583 
8584 static void
8585 pmap_large_map_wb_fence_nop(void)
8586 {
8587 }
8588 
8589 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void), static)
8590 {
8591 
8592 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
8593 		return (pmap_large_map_wb_fence_mfence);
8594 	else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
8595 	    CPUID_STDEXT_CLFLUSHOPT)) == 0)
8596 		return (pmap_large_map_wb_fence_sfence);
8597 	else
8598 		/* clflush is strongly enough ordered */
8599 		return (pmap_large_map_wb_fence_nop);
8600 }
8601 
8602 static void
8603 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
8604 {
8605 
8606 	for (; len > 0; len -= cpu_clflush_line_size,
8607 	    va += cpu_clflush_line_size)
8608 		clwb(va);
8609 }
8610 
8611 static void
8612 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
8613 {
8614 
8615 	for (; len > 0; len -= cpu_clflush_line_size,
8616 	    va += cpu_clflush_line_size)
8617 		clflushopt(va);
8618 }
8619 
8620 static void
8621 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
8622 {
8623 
8624 	for (; len > 0; len -= cpu_clflush_line_size,
8625 	    va += cpu_clflush_line_size)
8626 		clflush(va);
8627 }
8628 
8629 static void
8630 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused)
8631 {
8632 }
8633 
8634 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t),
8635     static)
8636 {
8637 
8638 	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
8639 		return (pmap_large_map_flush_range_clwb);
8640 	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
8641 		return (pmap_large_map_flush_range_clflushopt);
8642 	else if ((cpu_feature & CPUID_CLFSH) != 0)
8643 		return (pmap_large_map_flush_range_clflush);
8644 	else
8645 		return (pmap_large_map_flush_range_nop);
8646 }
8647 
8648 static void
8649 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
8650 {
8651 	volatile u_long *pe;
8652 	u_long p;
8653 	vm_offset_t va;
8654 	vm_size_t inc;
8655 	bool seen_other;
8656 
8657 	for (va = sva; va < eva; va += inc) {
8658 		inc = 0;
8659 		if ((amd_feature & AMDID_PAGE1GB) != 0) {
8660 			pe = (volatile u_long *)pmap_large_map_pdpe(va);
8661 			p = *pe;
8662 			if ((p & X86_PG_PS) != 0)
8663 				inc = NBPDP;
8664 		}
8665 		if (inc == 0) {
8666 			pe = (volatile u_long *)pmap_large_map_pde(va);
8667 			p = *pe;
8668 			if ((p & X86_PG_PS) != 0)
8669 				inc = NBPDR;
8670 		}
8671 		if (inc == 0) {
8672 			pe = (volatile u_long *)pmap_large_map_pte(va);
8673 			p = *pe;
8674 			inc = PAGE_SIZE;
8675 		}
8676 		seen_other = false;
8677 		for (;;) {
8678 			if ((p & X86_PG_AVAIL1) != 0) {
8679 				/*
8680 				 * Spin-wait for the end of a parallel
8681 				 * write-back.
8682 				 */
8683 				cpu_spinwait();
8684 				p = *pe;
8685 
8686 				/*
8687 				 * If we saw other write-back
8688 				 * occuring, we cannot rely on PG_M to
8689 				 * indicate state of the cache.  The
8690 				 * PG_M bit is cleared before the
8691 				 * flush to avoid ignoring new writes,
8692 				 * and writes which are relevant for
8693 				 * us might happen after.
8694 				 */
8695 				seen_other = true;
8696 				continue;
8697 			}
8698 
8699 			if ((p & X86_PG_M) != 0 || seen_other) {
8700 				if (!atomic_fcmpset_long(pe, &p,
8701 				    (p & ~X86_PG_M) | X86_PG_AVAIL1))
8702 					/*
8703 					 * If we saw PG_M without
8704 					 * PG_AVAIL1, and then on the
8705 					 * next attempt we do not
8706 					 * observe either PG_M or
8707 					 * PG_AVAIL1, the other
8708 					 * write-back started after us
8709 					 * and finished before us.  We
8710 					 * can rely on it doing our
8711 					 * work.
8712 					 */
8713 					continue;
8714 				pmap_large_map_flush_range(va, inc);
8715 				atomic_clear_long(pe, X86_PG_AVAIL1);
8716 			}
8717 			break;
8718 		}
8719 		maybe_yield();
8720 	}
8721 }
8722 
8723 /*
8724  * Write-back cache lines for the given address range.
8725  *
8726  * Must be called only on the range or sub-range returned from
8727  * pmap_large_map().  Must not be called on the coalesced ranges.
8728  *
8729  * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH
8730  * instructions support.
8731  */
8732 void
8733 pmap_large_map_wb(void *svap, vm_size_t len)
8734 {
8735 	vm_offset_t eva, sva;
8736 
8737 	sva = (vm_offset_t)svap;
8738 	eva = sva + len;
8739 	pmap_large_map_wb_fence();
8740 	if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
8741 		pmap_large_map_flush_range(sva, len);
8742 	} else {
8743 		KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
8744 		    eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
8745 		    ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
8746 		pmap_large_map_wb_large(sva, eva);
8747 	}
8748 	pmap_large_map_wb_fence();
8749 }
8750 
8751 static vm_page_t
8752 pmap_pti_alloc_page(void)
8753 {
8754 	vm_page_t m;
8755 
8756 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8757 	m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
8758 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
8759 	return (m);
8760 }
8761 
8762 static bool
8763 pmap_pti_free_page(vm_page_t m)
8764 {
8765 
8766 	KASSERT(m->wire_count > 0, ("page %p not wired", m));
8767 	if (!vm_page_unwire_noq(m))
8768 		return (false);
8769 	vm_page_free_zero(m);
8770 	return (true);
8771 }
8772 
8773 static void
8774 pmap_pti_init(void)
8775 {
8776 	vm_page_t pml4_pg;
8777 	pdp_entry_t *pdpe;
8778 	vm_offset_t va;
8779 	int i;
8780 
8781 	if (!pti)
8782 		return;
8783 	pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
8784 	VM_OBJECT_WLOCK(pti_obj);
8785 	pml4_pg = pmap_pti_alloc_page();
8786 	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
8787 	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
8788 	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
8789 		pdpe = pmap_pti_pdpe(va);
8790 		pmap_pti_wire_pte(pdpe);
8791 	}
8792 	pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
8793 	    (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
8794 	pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
8795 	    sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
8796 	pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
8797 	    sizeof(struct gate_descriptor) * NIDT, false);
8798 	pmap_pti_add_kva_locked((vm_offset_t)common_tss,
8799 	    (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
8800 	CPU_FOREACH(i) {
8801 		/* Doublefault stack IST 1 */
8802 		va = common_tss[i].tss_ist1;
8803 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8804 		/* NMI stack IST 2 */
8805 		va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
8806 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8807 		/* MC# stack IST 3 */
8808 		va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
8809 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8810 		/* DB# stack IST 4 */
8811 		va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
8812 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8813 	}
8814 	pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
8815 	    (vm_offset_t)etext, true);
8816 	pti_finalized = true;
8817 	VM_OBJECT_WUNLOCK(pti_obj);
8818 }
8819 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
8820 
8821 static pdp_entry_t *
8822 pmap_pti_pdpe(vm_offset_t va)
8823 {
8824 	pml4_entry_t *pml4e;
8825 	pdp_entry_t *pdpe;
8826 	vm_page_t m;
8827 	vm_pindex_t pml4_idx;
8828 	vm_paddr_t mphys;
8829 
8830 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8831 
8832 	pml4_idx = pmap_pml4e_index(va);
8833 	pml4e = &pti_pml4[pml4_idx];
8834 	m = NULL;
8835 	if (*pml4e == 0) {
8836 		if (pti_finalized)
8837 			panic("pml4 alloc after finalization\n");
8838 		m = pmap_pti_alloc_page();
8839 		if (*pml4e != 0) {
8840 			pmap_pti_free_page(m);
8841 			mphys = *pml4e & ~PAGE_MASK;
8842 		} else {
8843 			mphys = VM_PAGE_TO_PHYS(m);
8844 			*pml4e = mphys | X86_PG_RW | X86_PG_V;
8845 		}
8846 	} else {
8847 		mphys = *pml4e & ~PAGE_MASK;
8848 	}
8849 	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
8850 	return (pdpe);
8851 }
8852 
8853 static void
8854 pmap_pti_wire_pte(void *pte)
8855 {
8856 	vm_page_t m;
8857 
8858 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8859 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
8860 	m->wire_count++;
8861 }
8862 
8863 static void
8864 pmap_pti_unwire_pde(void *pde, bool only_ref)
8865 {
8866 	vm_page_t m;
8867 
8868 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8869 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
8870 	MPASS(m->wire_count > 0);
8871 	MPASS(only_ref || m->wire_count > 1);
8872 	pmap_pti_free_page(m);
8873 }
8874 
8875 static void
8876 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
8877 {
8878 	vm_page_t m;
8879 	pd_entry_t *pde;
8880 
8881 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8882 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
8883 	MPASS(m->wire_count > 0);
8884 	if (pmap_pti_free_page(m)) {
8885 		pde = pmap_pti_pde(va);
8886 		MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
8887 		*pde = 0;
8888 		pmap_pti_unwire_pde(pde, false);
8889 	}
8890 }
8891 
8892 static pd_entry_t *
8893 pmap_pti_pde(vm_offset_t va)
8894 {
8895 	pdp_entry_t *pdpe;
8896 	pd_entry_t *pde;
8897 	vm_page_t m;
8898 	vm_pindex_t pd_idx;
8899 	vm_paddr_t mphys;
8900 
8901 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8902 
8903 	pdpe = pmap_pti_pdpe(va);
8904 	if (*pdpe == 0) {
8905 		m = pmap_pti_alloc_page();
8906 		if (*pdpe != 0) {
8907 			pmap_pti_free_page(m);
8908 			MPASS((*pdpe & X86_PG_PS) == 0);
8909 			mphys = *pdpe & ~PAGE_MASK;
8910 		} else {
8911 			mphys =  VM_PAGE_TO_PHYS(m);
8912 			*pdpe = mphys | X86_PG_RW | X86_PG_V;
8913 		}
8914 	} else {
8915 		MPASS((*pdpe & X86_PG_PS) == 0);
8916 		mphys = *pdpe & ~PAGE_MASK;
8917 	}
8918 
8919 	pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
8920 	pd_idx = pmap_pde_index(va);
8921 	pde += pd_idx;
8922 	return (pde);
8923 }
8924 
8925 static pt_entry_t *
8926 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
8927 {
8928 	pd_entry_t *pde;
8929 	pt_entry_t *pte;
8930 	vm_page_t m;
8931 	vm_paddr_t mphys;
8932 
8933 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8934 
8935 	pde = pmap_pti_pde(va);
8936 	if (unwire_pde != NULL) {
8937 		*unwire_pde = true;
8938 		pmap_pti_wire_pte(pde);
8939 	}
8940 	if (*pde == 0) {
8941 		m = pmap_pti_alloc_page();
8942 		if (*pde != 0) {
8943 			pmap_pti_free_page(m);
8944 			MPASS((*pde & X86_PG_PS) == 0);
8945 			mphys = *pde & ~(PAGE_MASK | pg_nx);
8946 		} else {
8947 			mphys = VM_PAGE_TO_PHYS(m);
8948 			*pde = mphys | X86_PG_RW | X86_PG_V;
8949 			if (unwire_pde != NULL)
8950 				*unwire_pde = false;
8951 		}
8952 	} else {
8953 		MPASS((*pde & X86_PG_PS) == 0);
8954 		mphys = *pde & ~(PAGE_MASK | pg_nx);
8955 	}
8956 
8957 	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
8958 	pte += pmap_pte_index(va);
8959 
8960 	return (pte);
8961 }
8962 
8963 static void
8964 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
8965 {
8966 	vm_paddr_t pa;
8967 	pd_entry_t *pde;
8968 	pt_entry_t *pte, ptev;
8969 	bool unwire_pde;
8970 
8971 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8972 
8973 	sva = trunc_page(sva);
8974 	MPASS(sva > VM_MAXUSER_ADDRESS);
8975 	eva = round_page(eva);
8976 	MPASS(sva < eva);
8977 	for (; sva < eva; sva += PAGE_SIZE) {
8978 		pte = pmap_pti_pte(sva, &unwire_pde);
8979 		pa = pmap_kextract(sva);
8980 		ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
8981 		    (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
8982 		    VM_MEMATTR_DEFAULT, FALSE);
8983 		if (*pte == 0) {
8984 			pte_store(pte, ptev);
8985 			pmap_pti_wire_pte(pte);
8986 		} else {
8987 			KASSERT(!pti_finalized,
8988 			    ("pti overlap after fin %#lx %#lx %#lx",
8989 			    sva, *pte, ptev));
8990 			KASSERT(*pte == ptev,
8991 			    ("pti non-identical pte after fin %#lx %#lx %#lx",
8992 			    sva, *pte, ptev));
8993 		}
8994 		if (unwire_pde) {
8995 			pde = pmap_pti_pde(sva);
8996 			pmap_pti_unwire_pde(pde, true);
8997 		}
8998 	}
8999 }
9000 
9001 void
9002 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
9003 {
9004 
9005 	if (!pti)
9006 		return;
9007 	VM_OBJECT_WLOCK(pti_obj);
9008 	pmap_pti_add_kva_locked(sva, eva, exec);
9009 	VM_OBJECT_WUNLOCK(pti_obj);
9010 }
9011 
9012 void
9013 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
9014 {
9015 	pt_entry_t *pte;
9016 	vm_offset_t va;
9017 
9018 	if (!pti)
9019 		return;
9020 	sva = rounddown2(sva, PAGE_SIZE);
9021 	MPASS(sva > VM_MAXUSER_ADDRESS);
9022 	eva = roundup2(eva, PAGE_SIZE);
9023 	MPASS(sva < eva);
9024 	VM_OBJECT_WLOCK(pti_obj);
9025 	for (va = sva; va < eva; va += PAGE_SIZE) {
9026 		pte = pmap_pti_pte(va, NULL);
9027 		KASSERT((*pte & X86_PG_V) != 0,
9028 		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
9029 		    (u_long)pte, *pte));
9030 		pte_clear(pte);
9031 		pmap_pti_unwire_pte(pte, va);
9032 	}
9033 	pmap_invalidate_range(kernel_pmap, sva, eva);
9034 	VM_OBJECT_WUNLOCK(pti_obj);
9035 }
9036 
9037 static void *
9038 pkru_dup_range(void *ctx __unused, void *data)
9039 {
9040 	struct pmap_pkru_range *node, *new_node;
9041 
9042 	new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
9043 	if (new_node == NULL)
9044 		return (NULL);
9045 	node = data;
9046 	memcpy(new_node, node, sizeof(*node));
9047 	return (new_node);
9048 }
9049 
9050 static void
9051 pkru_free_range(void *ctx __unused, void *node)
9052 {
9053 
9054 	uma_zfree(pmap_pkru_ranges_zone, node);
9055 }
9056 
9057 static int
9058 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
9059     int flags)
9060 {
9061 	struct pmap_pkru_range *ppr;
9062 	int error;
9063 
9064 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9065 	MPASS(pmap->pm_type == PT_X86);
9066 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
9067 	if ((flags & AMD64_PKRU_EXCL) != 0 &&
9068 	    !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
9069 		return (EBUSY);
9070 	ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
9071 	if (ppr == NULL)
9072 		return (ENOMEM);
9073 	ppr->pkru_keyidx = keyidx;
9074 	ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
9075 	error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
9076 	if (error != 0)
9077 		uma_zfree(pmap_pkru_ranges_zone, ppr);
9078 	return (error);
9079 }
9080 
9081 static int
9082 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9083 {
9084 
9085 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9086 	MPASS(pmap->pm_type == PT_X86);
9087 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
9088 	return (rangeset_remove(&pmap->pm_pkru, sva, eva));
9089 }
9090 
9091 static void
9092 pmap_pkru_deassign_all(pmap_t pmap)
9093 {
9094 
9095 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9096 	if (pmap->pm_type == PT_X86 &&
9097 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
9098 		rangeset_remove_all(&pmap->pm_pkru);
9099 }
9100 
9101 static bool
9102 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9103 {
9104 	struct pmap_pkru_range *ppr, *prev_ppr;
9105 	vm_offset_t va;
9106 
9107 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9108 	if (pmap->pm_type != PT_X86 ||
9109 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
9110 	    sva >= VM_MAXUSER_ADDRESS)
9111 		return (true);
9112 	MPASS(eva <= VM_MAXUSER_ADDRESS);
9113 	for (va = sva, prev_ppr = NULL; va < eva;) {
9114 		ppr = rangeset_lookup(&pmap->pm_pkru, va);
9115 		if ((ppr == NULL) ^ (prev_ppr == NULL))
9116 			return (false);
9117 		if (ppr == NULL) {
9118 			va += PAGE_SIZE;
9119 			continue;
9120 		}
9121 		if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
9122 			return (false);
9123 		va = ppr->pkru_rs_el.re_end;
9124 	}
9125 	return (true);
9126 }
9127 
9128 static pt_entry_t
9129 pmap_pkru_get(pmap_t pmap, vm_offset_t va)
9130 {
9131 	struct pmap_pkru_range *ppr;
9132 
9133 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9134 	if (pmap->pm_type != PT_X86 ||
9135 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
9136 	    va >= VM_MAXUSER_ADDRESS)
9137 		return (0);
9138 	ppr = rangeset_lookup(&pmap->pm_pkru, va);
9139 	if (ppr != NULL)
9140 		return (X86_PG_PKU(ppr->pkru_keyidx));
9141 	return (0);
9142 }
9143 
9144 static bool
9145 pred_pkru_on_remove(void *ctx __unused, void *r)
9146 {
9147 	struct pmap_pkru_range *ppr;
9148 
9149 	ppr = r;
9150 	return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
9151 }
9152 
9153 static void
9154 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9155 {
9156 
9157 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9158 	if (pmap->pm_type == PT_X86 &&
9159 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
9160 		rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
9161 		    pred_pkru_on_remove);
9162 	}
9163 }
9164 
9165 static int
9166 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
9167 {
9168 
9169 	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9170 	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9171 	MPASS(dst_pmap->pm_type == PT_X86);
9172 	MPASS(src_pmap->pm_type == PT_X86);
9173 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
9174 	if (src_pmap->pm_pkru.rs_data_ctx == NULL)
9175 		return (0);
9176 	return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
9177 }
9178 
9179 static void
9180 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
9181     u_int keyidx)
9182 {
9183 	pml4_entry_t *pml4e;
9184 	pdp_entry_t *pdpe;
9185 	pd_entry_t newpde, ptpaddr, *pde;
9186 	pt_entry_t newpte, *ptep, pte;
9187 	vm_offset_t va, va_next;
9188 	bool changed;
9189 
9190 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9191 	MPASS(pmap->pm_type == PT_X86);
9192 	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
9193 
9194 	for (changed = false, va = sva; va < eva; va = va_next) {
9195 		pml4e = pmap_pml4e(pmap, va);
9196 		if ((*pml4e & X86_PG_V) == 0) {
9197 			va_next = (va + NBPML4) & ~PML4MASK;
9198 			if (va_next < va)
9199 				va_next = eva;
9200 			continue;
9201 		}
9202 
9203 		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
9204 		if ((*pdpe & X86_PG_V) == 0) {
9205 			va_next = (va + NBPDP) & ~PDPMASK;
9206 			if (va_next < va)
9207 				va_next = eva;
9208 			continue;
9209 		}
9210 
9211 		va_next = (va + NBPDR) & ~PDRMASK;
9212 		if (va_next < va)
9213 			va_next = eva;
9214 
9215 		pde = pmap_pdpe_to_pde(pdpe, va);
9216 		ptpaddr = *pde;
9217 		if (ptpaddr == 0)
9218 			continue;
9219 
9220 		MPASS((ptpaddr & X86_PG_V) != 0);
9221 		if ((ptpaddr & PG_PS) != 0) {
9222 			if (va + NBPDR == va_next && eva >= va_next) {
9223 				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
9224 				    X86_PG_PKU(keyidx);
9225 				if (newpde != ptpaddr) {
9226 					*pde = newpde;
9227 					changed = true;
9228 				}
9229 				continue;
9230 			} else if (!pmap_demote_pde(pmap, pde, va)) {
9231 				continue;
9232 			}
9233 		}
9234 
9235 		if (va_next > eva)
9236 			va_next = eva;
9237 
9238 		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
9239 		    ptep++, va += PAGE_SIZE) {
9240 			pte = *ptep;
9241 			if ((pte & X86_PG_V) == 0)
9242 				continue;
9243 			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
9244 			if (newpte != pte) {
9245 				*ptep = newpte;
9246 				changed = true;
9247 			}
9248 		}
9249 	}
9250 	if (changed)
9251 		pmap_invalidate_range(pmap, sva, eva);
9252 }
9253 
9254 static int
9255 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
9256     u_int keyidx, int flags)
9257 {
9258 
9259 	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
9260 	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
9261 		return (EINVAL);
9262 	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
9263 		return (EFAULT);
9264 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
9265 		return (ENOTSUP);
9266 	return (0);
9267 }
9268 
9269 int
9270 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
9271     int flags)
9272 {
9273 	int error;
9274 
9275 	sva = trunc_page(sva);
9276 	eva = round_page(eva);
9277 	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
9278 	if (error != 0)
9279 		return (error);
9280 	for (;;) {
9281 		PMAP_LOCK(pmap);
9282 		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
9283 		if (error == 0)
9284 			pmap_pkru_update_range(pmap, sva, eva, keyidx);
9285 		PMAP_UNLOCK(pmap);
9286 		if (error != ENOMEM)
9287 			break;
9288 		vm_wait(NULL);
9289 	}
9290 	return (error);
9291 }
9292 
9293 int
9294 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9295 {
9296 	int error;
9297 
9298 	sva = trunc_page(sva);
9299 	eva = round_page(eva);
9300 	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
9301 	if (error != 0)
9302 		return (error);
9303 	for (;;) {
9304 		PMAP_LOCK(pmap);
9305 		error = pmap_pkru_deassign(pmap, sva, eva);
9306 		if (error == 0)
9307 			pmap_pkru_update_range(pmap, sva, eva, 0);
9308 		PMAP_UNLOCK(pmap);
9309 		if (error != ENOMEM)
9310 			break;
9311 		vm_wait(NULL);
9312 	}
9313 	return (error);
9314 }
9315 
9316 #include "opt_ddb.h"
9317 #ifdef DDB
9318 #include <sys/kdb.h>
9319 #include <ddb/ddb.h>
9320 
9321 DB_SHOW_COMMAND(pte, pmap_print_pte)
9322 {
9323 	pmap_t pmap;
9324 	pml4_entry_t *pml4;
9325 	pdp_entry_t *pdp;
9326 	pd_entry_t *pde;
9327 	pt_entry_t *pte, PG_V;
9328 	vm_offset_t va;
9329 
9330 	if (!have_addr) {
9331 		db_printf("show pte addr\n");
9332 		return;
9333 	}
9334 	va = (vm_offset_t)addr;
9335 
9336 	if (kdb_thread != NULL)
9337 		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
9338 	else
9339 		pmap = PCPU_GET(curpmap);
9340 
9341 	PG_V = pmap_valid_bit(pmap);
9342 	pml4 = pmap_pml4e(pmap, va);
9343 	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
9344 	if ((*pml4 & PG_V) == 0) {
9345 		db_printf("\n");
9346 		return;
9347 	}
9348 	pdp = pmap_pml4e_to_pdpe(pml4, va);
9349 	db_printf(" pdpe %#016lx", *pdp);
9350 	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
9351 		db_printf("\n");
9352 		return;
9353 	}
9354 	pde = pmap_pdpe_to_pde(pdp, va);
9355 	db_printf(" pde %#016lx", *pde);
9356 	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
9357 		db_printf("\n");
9358 		return;
9359 	}
9360 	pte = pmap_pde_to_pte(pde, va);
9361 	db_printf(" pte %#016lx\n", *pte);
9362 }
9363 
9364 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
9365 {
9366 	vm_paddr_t a;
9367 
9368 	if (have_addr) {
9369 		a = (vm_paddr_t)addr;
9370 		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
9371 	} else {
9372 		db_printf("show phys2dmap addr\n");
9373 	}
9374 }
9375 #endif
9376