xref: /freebsd/sys/arm64/arm64/pmap.c (revision 780fb4a2)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *	This product includes software developed by the University of
35  *	California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
53  */
54 /*-
55  * Copyright (c) 2003 Networks Associates Technology, Inc.
56  * All rights reserved.
57  *
58  * This software was developed for the FreeBSD Project by Jake Burkholder,
59  * Safeport Network Services, and Network Associates Laboratories, the
60  * Security Research Division of Network Associates, Inc. under
61  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62  * CHATS research program.
63  *
64  * Redistribution and use in source and binary forms, with or without
65  * modification, are permitted provided that the following conditions
66  * are met:
67  * 1. Redistributions of source code must retain the above copyright
68  *    notice, this list of conditions and the following disclaimer.
69  * 2. Redistributions in binary form must reproduce the above copyright
70  *    notice, this list of conditions and the following disclaimer in the
71  *    documentation and/or other materials provided with the distribution.
72  *
73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83  * SUCH DAMAGE.
84  */
85 
86 #include <sys/cdefs.h>
87 __FBSDID("$FreeBSD$");
88 
89 /*
90  *	Manages physical address maps.
91  *
92  *	Since the information managed by this module is
93  *	also stored by the logical address mapping module,
94  *	this module may throw away valid virtual-to-physical
95  *	mappings at almost any time.  However, invalidations
96  *	of virtual-to-physical mappings must be done as
97  *	requested.
98  *
99  *	In order to cope with hardware architectures which
100  *	make virtual-to-physical map invalidates expensive,
101  *	this module may delay invalidate or reduced protection
102  *	operations until such time as they are actually
103  *	necessary.  This module is given full information as
104  *	to which processors are currently using which maps,
105  *	and to when physical maps must be made correct.
106  */
107 
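/*
 * This pmap is written for the 4KB translation granule, where the hardware
 * walks four levels of tables, referred to here as L0 through L3: an L1
 * block mapping covers 1GB, an L2 block covers 2MB (the superpage size
 * used by the promotion/demotion code), and an L3 page covers 4KB.
 */
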
108 #include "opt_vm.h"
109 
110 #include <sys/param.h>
111 #include <sys/bitstring.h>
112 #include <sys/bus.h>
113 #include <sys/systm.h>
114 #include <sys/kernel.h>
115 #include <sys/ktr.h>
116 #include <sys/lock.h>
117 #include <sys/malloc.h>
118 #include <sys/mman.h>
119 #include <sys/msgbuf.h>
120 #include <sys/mutex.h>
121 #include <sys/proc.h>
122 #include <sys/rwlock.h>
123 #include <sys/sx.h>
124 #include <sys/vmem.h>
125 #include <sys/vmmeter.h>
126 #include <sys/sched.h>
127 #include <sys/sysctl.h>
128 #include <sys/_unrhdr.h>
129 #include <sys/smp.h>
130 
131 #include <vm/vm.h>
132 #include <vm/vm_param.h>
133 #include <vm/vm_kern.h>
134 #include <vm/vm_page.h>
135 #include <vm/vm_map.h>
136 #include <vm/vm_object.h>
137 #include <vm/vm_extern.h>
138 #include <vm/vm_pageout.h>
139 #include <vm/vm_pager.h>
140 #include <vm/vm_phys.h>
141 #include <vm/vm_radix.h>
142 #include <vm/vm_reserv.h>
143 #include <vm/uma.h>
144 
145 #include <machine/machdep.h>
146 #include <machine/md_var.h>
147 #include <machine/pcb.h>
148 
149 #include <arm/include/physmem.h>
150 
151 #define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
152 #define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
153 #define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
154 #define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
155 
156 #define	NUL0E		L0_ENTRIES
157 #define	NUL1E		(NUL0E * NL1PG)
158 #define	NUL2E		(NUL1E * NL2PG)
159 
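/*
 * NLnPG is the number of entries that fit in one page-table page, and the
 * NULnE values are the total number of L0/L1/L2 entries reachable from a
 * pmap's L0 table.  The latter carve up the page-table page pindex space:
 * pindexes [0, NUL2E) name L3 tables, [NUL2E, NUL2E + NUL1E) name L2
 * tables, and pindexes of NUL2E + NUL1E and above name L1 tables (see
 * _pmap_alloc_l3() and _pmap_unwire_l3()).
 */
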
160 #if !defined(DIAGNOSTIC)
161 #ifdef __GNUC_GNU_INLINE__
162 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
163 #else
164 #define PMAP_INLINE	extern inline
165 #endif
166 #else
167 #define PMAP_INLINE
168 #endif
169 
170 /*
171  * Memory attribute indexes for the mair_el1 register, which is set up in locore.S.
172  */
173 #define	DEVICE_MEMORY	0
174 #define	UNCACHED_MEMORY	1
175 #define	CACHED_MEMORY	2
176 
177 
178 #ifdef PV_STATS
179 #define PV_STAT(x)	do { x ; } while (0)
180 #else
181 #define PV_STAT(x)	do { } while (0)
182 #endif
183 
184 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
185 #define	pa_to_pvh(pa)		(&pv_table[pmap_l2_pindex(pa)])
186 
187 #define	NPV_LIST_LOCKS	MAXCPU
188 
189 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
190 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
191 
192 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
193 	struct rwlock **_lockp = (lockp);		\
194 	struct rwlock *_new_lock;			\
195 							\
196 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
197 	if (_new_lock != *_lockp) {			\
198 		if (*_lockp != NULL)			\
199 			rw_wunlock(*_lockp);		\
200 		*_lockp = _new_lock;			\
201 		rw_wlock(*_lockp);			\
202 	}						\
203 } while (0)
204 
205 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
206 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
207 
208 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
209 	struct rwlock **_lockp = (lockp);		\
210 							\
211 	if (*_lockp != NULL) {				\
212 		rw_wunlock(*_lockp);			\
213 		*_lockp = NULL;				\
214 	}						\
215 } while (0)
216 
217 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
218 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
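
/*
 * The pv list locking scheme: a page's pv list is protected by the rwlock
 * selected by hashing its physical address into pv_list_locks[].  Functions
 * that may touch the pv lists of several pages take a "struct rwlock **lockp"
 * and use the CHANGE_PV_LIST_LOCK_* macros above to drop the lock they hold
 * and acquire the one for the page currently being operated on.
 */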
219 
220 struct pmap kernel_pmap_store;
221 
222 /* Used for mapping ACPI memory before VM is initialized */
223 #define	PMAP_PREINIT_MAPPING_COUNT	32
224 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
225 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
226 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
227 
228 /*
229  * Reserve a few L2 blocks starting from the 'preinit_map_va' pointer.
230  * Always map an entire L2 block for simplicity.
231  * VA of L2 block = preinit_map_va + i * L2_SIZE
232  */
233 static struct pmap_preinit_mapping {
234 	vm_paddr_t	pa;
235 	vm_offset_t	va;
236 	vm_size_t	size;
237 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
238 
239 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
240 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
241 vm_offset_t kernel_vm_end = 0;
242 
243 /*
244  * Data for the pv entry allocation mechanism.
247  */
248 static struct md_page *pv_table;
249 static struct md_page pv_dummy;
250 
251 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
252 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
253 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
254 
255 /* This code assumes all L1 DMAP entries will be used */
256 CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
257 CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
258 
259 #define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
260 extern pt_entry_t pagetable_dmap[];
261 
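/*
 * physmap[] holds the available physical memory as (start, end) address
 * pairs, two entries per segment; it is filled by arm_physmem_avail() in
 * pmap_bootstrap().
 */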
262 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
263 static vm_paddr_t physmap[PHYSMAP_SIZE];
264 static u_int physmap_idx;
265 
266 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
267 
268 static int superpages_enabled = 1;
269 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
270     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
271     "Are large page mappings enabled?");
272 
273 /*
274  * Data for the pv entry allocation mechanism
275  */
276 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
277 static struct mtx pv_chunks_mutex;
278 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
279 
280 static void	free_pv_chunk(struct pv_chunk *pc);
281 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
282 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
283 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
284 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
285 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
286 		    vm_offset_t va);
287 
288 static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode);
289 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
290 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
291 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
292     vm_offset_t va, struct rwlock **lockp);
293 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
294 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
295     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
296 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
297     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
298 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
299     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
300 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
301     vm_page_t m, struct rwlock **lockp);
302 
303 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
304 		struct rwlock **lockp);
305 
306 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
307     struct spglist *free);
308 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
309 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
310 
311 /*
312  * These load the old table data and store the new value.
313  * They need to be atomic as the System MMU may write to the table at
314  * the same time as the CPU.
315  */
316 #define	pmap_load_store(table, entry) atomic_swap_64(table, entry)
317 #define	pmap_set(table, mask) atomic_set_64(table, mask)
318 #define	pmap_load_clear(table) atomic_swap_64(table, 0)
319 #define	pmap_load(table) (*table)
320 
321 /********************/
322 /* Inline functions */
323 /********************/
324 
325 static __inline void
326 pagecopy(void *s, void *d)
327 {
328 
329 	memcpy(d, s, PAGE_SIZE);
330 }
331 
332 static __inline pd_entry_t *
333 pmap_l0(pmap_t pmap, vm_offset_t va)
334 {
335 
336 	return (&pmap->pm_l0[pmap_l0_index(va)]);
337 }
338 
339 static __inline pd_entry_t *
340 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
341 {
342 	pd_entry_t *l1;
343 
344 	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
345 	return (&l1[pmap_l1_index(va)]);
346 }
347 
348 static __inline pd_entry_t *
349 pmap_l1(pmap_t pmap, vm_offset_t va)
350 {
351 	pd_entry_t *l0;
352 
353 	l0 = pmap_l0(pmap, va);
354 	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
355 		return (NULL);
356 
357 	return (pmap_l0_to_l1(l0, va));
358 }
359 
360 static __inline pd_entry_t *
361 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
362 {
363 	pd_entry_t *l2;
364 
365 	l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
366 	return (&l2[pmap_l2_index(va)]);
367 }
368 
369 static __inline pd_entry_t *
370 pmap_l2(pmap_t pmap, vm_offset_t va)
371 {
372 	pd_entry_t *l1;
373 
374 	l1 = pmap_l1(pmap, va);
375 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
376 		return (NULL);
377 
378 	return (pmap_l1_to_l2(l1, va));
379 }
380 
381 static __inline pt_entry_t *
382 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
383 {
384 	pt_entry_t *l3;
385 
386 	l3 = (pt_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
387 	return (&l3[pmap_l3_index(va)]);
388 }
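
/*
 * The pmap_lN() and pmap_lN_to_lM() helpers above walk the table hierarchy
 * through the direct map.  For example, when every level holds a valid
 * table entry, something like the following yields a pointer to the L3 PTE
 * for a virtual address (this is essentially what pmap_pte() below does,
 * with validity checks added):
 *
 *	pd_entry_t *l2 = pmap_l2(pmap, va);
 *	pt_entry_t *l3 = pmap_l2_to_l3(l2, va);
 */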
389 
390 /*
391  * Returns the lowest valid pde for a given virtual address.
392  * The next level may or may not point to a valid page or block.
393  */
394 static __inline pd_entry_t *
395 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
396 {
397 	pd_entry_t *l0, *l1, *l2, desc;
398 
399 	l0 = pmap_l0(pmap, va);
400 	desc = pmap_load(l0) & ATTR_DESCR_MASK;
401 	if (desc != L0_TABLE) {
402 		*level = -1;
403 		return (NULL);
404 	}
405 
406 	l1 = pmap_l0_to_l1(l0, va);
407 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
408 	if (desc != L1_TABLE) {
409 		*level = 0;
410 		return (l0);
411 	}
412 
413 	l2 = pmap_l1_to_l2(l1, va);
414 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
415 	if (desc != L2_TABLE) {
416 		*level = 1;
417 		return (l1);
418 	}
419 
420 	*level = 2;
421 	return (l2);
422 }
423 
424 /*
425  * Returns the lowest valid pte block or table entry for a given virtual
426  * address. If there are no valid entries return NULL and set the level to
427  * the first invalid level.
428  */
429 static __inline pt_entry_t *
430 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
431 {
432 	pd_entry_t *l1, *l2, desc;
433 	pt_entry_t *l3;
434 
435 	l1 = pmap_l1(pmap, va);
436 	if (l1 == NULL) {
437 		*level = 0;
438 		return (NULL);
439 	}
440 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
441 	if (desc == L1_BLOCK) {
442 		*level = 1;
443 		return (l1);
444 	}
445 
446 	if (desc != L1_TABLE) {
447 		*level = 1;
448 		return (NULL);
449 	}
450 
451 	l2 = pmap_l1_to_l2(l1, va);
452 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
453 	if (desc == L2_BLOCK) {
454 		*level = 2;
455 		return (l2);
456 	}
457 
458 	if (desc != L2_TABLE) {
459 		*level = 2;
460 		return (NULL);
461 	}
462 
463 	*level = 3;
464 	l3 = pmap_l2_to_l3(l2, va);
465 	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
466 		return (NULL);
467 
468 	return (l3);
469 }
470 
471 static inline bool
472 pmap_superpages_enabled(void)
473 {
474 
475 	return (superpages_enabled != 0);
476 }
477 
478 bool
479 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
480     pd_entry_t **l2, pt_entry_t **l3)
481 {
482 	pd_entry_t *l0p, *l1p, *l2p;
483 
484 	if (pmap->pm_l0 == NULL)
485 		return (false);
486 
487 	l0p = pmap_l0(pmap, va);
488 	*l0 = l0p;
489 
490 	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
491 		return (false);
492 
493 	l1p = pmap_l0_to_l1(l0p, va);
494 	*l1 = l1p;
495 
496 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
497 		*l2 = NULL;
498 		*l3 = NULL;
499 		return (true);
500 	}
501 
502 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
503 		return (false);
504 
505 	l2p = pmap_l1_to_l2(l1p, va);
506 	*l2 = l2p;
507 
508 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
509 		*l3 = NULL;
510 		return (true);
511 	}
512 
513 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
514 		return (false);
515 
516 	*l3 = pmap_l2_to_l3(l2p, va);
517 
518 	return (true);
519 }
520 
521 static __inline int
522 pmap_l3_valid(pt_entry_t l3)
523 {
524 
525 	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
526 }
527 
528 
529 CTASSERT(L1_BLOCK == L2_BLOCK);
530 
531 /*
532  * Checks if the page is dirty.  We currently lack proper dirty tracking on
533  * arm64, so for now treat a page mapped read/write and accessed as dirty.
534  */
535 static inline int
536 pmap_page_dirty(pt_entry_t pte)
537 {
538 
539 	return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
540 	    (ATTR_AF | ATTR_AP(ATTR_AP_RW)));
541 }
542 
543 static __inline void
544 pmap_resident_count_inc(pmap_t pmap, int count)
545 {
546 
547 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
548 	pmap->pm_stats.resident_count += count;
549 }
550 
551 static __inline void
552 pmap_resident_count_dec(pmap_t pmap, int count)
553 {
554 
555 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
556 	KASSERT(pmap->pm_stats.resident_count >= count,
557 	    ("pmap %p resident count underflow %ld %d", pmap,
558 	    pmap->pm_stats.resident_count, count));
559 	pmap->pm_stats.resident_count -= count;
560 }
561 
562 static pt_entry_t *
563 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
564     u_int *l2_slot)
565 {
566 	pt_entry_t *l2;
567 	pd_entry_t *l1;
568 
569 	l1 = (pd_entry_t *)l1pt;
570 	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
571 
572 	/* Check that locore used an L1 table mapping */
573 	KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE,
574 	   ("Invalid bootstrap L1 table"));
575 	/* Find the address of the L2 table */
576 	l2 = (pt_entry_t *)init_pt_va;
577 	*l2_slot = pmap_l2_index(va);
578 
579 	return (l2);
580 }
581 
582 static vm_paddr_t
583 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
584 {
585 	u_int l1_slot, l2_slot;
586 	pt_entry_t *l2;
587 
588 	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
589 
590 	return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET));
591 }
592 
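/*
 * Build the direct map (DMAP) for the physical segments in physmap[].
 * Each segment is mapped in up to three pieces: L2 blocks for an unaligned
 * head, L1 (1GB) blocks for the L1-aligned middle, and L2 blocks again for
 * the tail.  The L2 page-table pages needed for this are carved out of
 * 'freemempos', and the updated freemempos is returned.
 */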
593 static vm_offset_t
594 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
595     vm_offset_t freemempos)
596 {
597 	pt_entry_t *l2;
598 	vm_offset_t va;
599 	vm_paddr_t l2_pa, pa;
600 	u_int l1_slot, l2_slot, prev_l1_slot;
601 	int i;
602 
603 	dmap_phys_base = min_pa & ~L1_OFFSET;
604 	dmap_phys_max = 0;
605 	dmap_max_addr = 0;
606 	l2 = NULL;
607 	prev_l1_slot = -1;
608 
609
610 	memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES);
611 
612 	for (i = 0; i < (physmap_idx * 2); i += 2) {
613 		pa = physmap[i] & ~L2_OFFSET;
614 		va = pa - dmap_phys_base + DMAP_MIN_ADDRESS;
615 
616 		/* Create L2 mappings at the start of the region */
617 		if ((pa & L1_OFFSET) != 0) {
618 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
619 			if (l1_slot != prev_l1_slot) {
620 				prev_l1_slot = l1_slot;
621 				l2 = (pt_entry_t *)freemempos;
622 				l2_pa = pmap_early_vtophys(kern_l1,
623 				    (vm_offset_t)l2);
624 				freemempos += PAGE_SIZE;
625 
626 				pmap_load_store(&pagetable_dmap[l1_slot],
627 				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
628 
629 				memset(l2, 0, PAGE_SIZE);
630 			}
631 			KASSERT(l2 != NULL,
632 			    ("pmap_bootstrap_dmap: NULL l2 map"));
633 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
634 			    pa += L2_SIZE, va += L2_SIZE) {
635 				/*
636 				 * We are on an L1 boundary; stop so
637 				 * that a level 1 block can be created.
638 				 */
639 				if ((pa & L1_OFFSET) == 0)
640 					break;
641 
642 				l2_slot = pmap_l2_index(va);
643 				KASSERT(l2_slot != 0, ("..."));
644 				pmap_load_store(&l2[l2_slot],
645 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN |
646 				    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
647 			}
648 			KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS),
649 			    ("..."));
650 		}
651 
652 		for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] &&
653 		    (physmap[i + 1] - pa) >= L1_SIZE;
654 		    pa += L1_SIZE, va += L1_SIZE) {
655 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
656 			pmap_load_store(&pagetable_dmap[l1_slot],
657 			    (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN |
658 			    ATTR_IDX(CACHED_MEMORY) | L1_BLOCK);
659 		}
660 
661 		/* Create L2 mappings at the end of the region */
662 		if (pa < physmap[i + 1]) {
663 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
664 			if (l1_slot != prev_l1_slot) {
665 				prev_l1_slot = l1_slot;
666 				l2 = (pt_entry_t *)freemempos;
667 				l2_pa = pmap_early_vtophys(kern_l1,
668 				    (vm_offset_t)l2);
669 				freemempos += PAGE_SIZE;
670 
671 				pmap_load_store(&pagetable_dmap[l1_slot],
672 				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
673 
674 				memset(l2, 0, PAGE_SIZE);
675 			}
676 			KASSERT(l2 != NULL,
677 			    ("pmap_bootstrap_dmap: NULL l2 map"));
678 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
679 			    pa += L2_SIZE, va += L2_SIZE) {
680 				l2_slot = pmap_l2_index(va);
681 				pmap_load_store(&l2[l2_slot],
682 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN |
683 				    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
684 			}
685 		}
686 
687 		if (pa > dmap_phys_max) {
688 			dmap_phys_max = pa;
689 			dmap_max_addr = va;
690 		}
691 	}
692 
693 	cpu_tlb_flushID();
694 
695 	return (freemempos);
696 }
697 
698 static vm_offset_t
699 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
700 {
701 	vm_offset_t l2pt;
702 	vm_paddr_t pa;
703 	pd_entry_t *l1;
704 	u_int l1_slot;
705 
706 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
707 
708 	l1 = (pd_entry_t *)l1pt;
709 	l1_slot = pmap_l1_index(va);
710 	l2pt = l2_start;
711 
712 	for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
713 		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
714 
715 		pa = pmap_early_vtophys(l1pt, l2pt);
716 		pmap_load_store(&l1[l1_slot],
717 		    (pa & ~Ln_TABLE_MASK) | L1_TABLE);
718 		l2pt += PAGE_SIZE;
719 	}
720 
721 	/* Clean the L2 page table */
722 	memset((void *)l2_start, 0, l2pt - l2_start);
723 
724 	return l2pt;
725 }
726 
727 static vm_offset_t
728 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
729 {
730 	vm_offset_t l3pt;
731 	vm_paddr_t pa;
732 	pd_entry_t *l2;
733 	u_int l2_slot;
734 
735 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
736 
737 	l2 = pmap_l2(kernel_pmap, va);
738 	l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
739 	l2_slot = pmap_l2_index(va);
740 	l3pt = l3_start;
741 
742 	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
743 		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
744 
745 		pa = pmap_early_vtophys(l1pt, l3pt);
746 		pmap_load_store(&l2[l2_slot],
747 		    (pa & ~Ln_TABLE_MASK) | L2_TABLE);
748 		l3pt += PAGE_SIZE;
749 	}
750 
751 	/* Clean the L3 page table */
752 	memset((void *)l3_start, 0, l3pt - l3_start);
753 
754 	return l3pt;
755 }
756 
757 /*
758  *	Bootstrap the system enough to run with virtual memory.
759  */
760 void
761 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
762     vm_size_t kernlen)
763 {
764 	u_int l1_slot, l2_slot;
765 	uint64_t kern_delta;
766 	pt_entry_t *l2;
767 	vm_offset_t va, freemempos;
768 	vm_offset_t dpcpu, msgbufpv;
769 	vm_paddr_t start_pa, pa, min_pa;
770 	int i;
771 
772 	kern_delta = KERNBASE - kernstart;
773 
774 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
775 	printf("%lx\n", l1pt);
776 	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
777 
778 	/* Set this early so we can use the pagetable walking functions */
779 	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
780 	PMAP_LOCK_INIT(kernel_pmap);
781 
782 	/* Assume the address we were loaded to is a valid physical address */
783 	min_pa = KERNBASE - kern_delta;
784 
785 	physmap_idx = arm_physmem_avail(physmap, nitems(physmap));
786 	physmap_idx /= 2;
787 
788 	/*
789 	 * Find the minimum physical address. physmap is sorted,
790 	 * but may contain empty ranges.
791 	 */
792 	for (i = 0; i < (physmap_idx * 2); i += 2) {
793 		if (physmap[i] == physmap[i + 1])
794 			continue;
795 		if (physmap[i] <= min_pa)
796 			min_pa = physmap[i];
797 	}
798 
799 	freemempos = KERNBASE + kernlen;
800 	freemempos = roundup2(freemempos, PAGE_SIZE);
801 
802 	/* Create a direct map region early so we can use it for pa -> va */
803 	freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);
804 
805 	va = KERNBASE;
806 	start_pa = pa = KERNBASE - kern_delta;
807 
808 	/*
809 	 * Read the page table to find out what is already mapped.
810 	 * This assumes we have mapped a block of memory from KERNBASE
811 	 * using a single L1 entry.
812 	 */
813 	l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
814 
815 	/* Sanity check the index, KERNBASE should be the first VA */
816 	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
817 
818 	/* Find how many pages we have mapped */
819 	for (; l2_slot < Ln_ENTRIES; l2_slot++) {
820 		if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
821 			break;
822 
823 		/* Check locore used L2 blocks */
824 		KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
825 		    ("Invalid bootstrap L2 table"));
826 		KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
827 		    ("Incorrect PA in L2 table"));
828 
829 		va += L2_SIZE;
830 		pa += L2_SIZE;
831 	}
832 
833 	va = roundup2(va, L1_SIZE);
834 
835 	/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
836 	freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
837 	/* And the l3 tables for the early devmap */
838 	freemempos = pmap_bootstrap_l3(l1pt,
839 	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
840 
841 	cpu_tlb_flushID();
842 
843 #define alloc_pages(var, np)						\
844 	(var) = freemempos;						\
845 	freemempos += (np * PAGE_SIZE);					\
846 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
847 
848 	/* Allocate dynamic per-cpu area. */
849 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
850 	dpcpu_init((void *)dpcpu, 0);
851 
852 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
853 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
854 	msgbufp = (void *)msgbufpv;
855 
856 	/* Reserve some VA space for early BIOS/ACPI mapping */
857 	preinit_map_va = roundup2(freemempos, L2_SIZE);
858 
859 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
860 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
861 	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
862 	kernel_vm_end = virtual_avail;
863 
864 	pa = pmap_early_vtophys(l1pt, freemempos);
865 
866 	arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
867 
868 	cpu_tlb_flushID();
869 }
870 
871 /*
872  *	Initialize a vm_page's machine-dependent fields.
873  */
874 void
875 pmap_page_init(vm_page_t m)
876 {
877 
878 	TAILQ_INIT(&m->md.pv_list);
879 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
880 }
881 
882 /*
883  *	Initialize the pmap module.
884  *	Called by vm_init, to initialize any structures that the pmap
885  *	system needs to map virtual memory.
886  */
887 void
888 pmap_init(void)
889 {
890 	vm_size_t s;
891 	int i, pv_npg;
892 
893 	/*
894 	 * Are large page mappings enabled?
895 	 */
896 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
897 
898 	/*
899 	 * Initialize the pv chunk list mutex.
900 	 */
901 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
902 
903 	/*
904 	 * Initialize the pool of pv list locks.
905 	 */
906 	for (i = 0; i < NPV_LIST_LOCKS; i++)
907 		rw_init(&pv_list_locks[i], "pmap pv list");
908 
909 	/*
910 	 * Calculate the size of the pv head table for superpages.
911 	 */
912 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
913 
914 	/*
915 	 * Allocate memory for the pv head table for superpages.
916 	 */
917 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
918 	s = round_page(s);
919 	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
920 	    M_WAITOK | M_ZERO);
921 	for (i = 0; i < pv_npg; i++)
922 		TAILQ_INIT(&pv_table[i].pv_list);
923 	TAILQ_INIT(&pv_dummy.pv_list);
924 
925 	vm_initialized = 1;
926 }
927 
928 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
929     "2MB page mapping counters");
930 
931 static u_long pmap_l2_demotions;
932 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
933     &pmap_l2_demotions, 0, "2MB page demotions");
934 
935 static u_long pmap_l2_p_failures;
936 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
937     &pmap_l2_p_failures, 0, "2MB page promotion failures");
938 
939 static u_long pmap_l2_promotions;
940 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
941     &pmap_l2_promotions, 0, "2MB page promotions");
942 
943 /*
944  * Invalidate a single TLB entry.
945  */
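/*
 * The "tlbi vaae1is" form invalidates the entry for the given virtual
 * address in every address space (any ASID) and is broadcast to the
 * inner-shareable domain, so no IPIs are required; its operand register
 * carries the VA shifted right by PAGE_SHIFT.
 */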
946 static __inline void
947 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
948 {
949 
950 	sched_pin();
951 	__asm __volatile(
952 	    "dsb  ishst		\n"
953 	    "tlbi vaae1is, %0	\n"
954 	    "dsb  ish		\n"
955 	    "isb		\n"
956 	    : : "r"(va >> PAGE_SHIFT));
957 	sched_unpin();
958 }
959 
960 static __inline void
961 pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
962 {
963 	vm_offset_t addr;
964 
965 	dsb(ishst);
966 	for (addr = sva; addr < eva; addr += PAGE_SIZE) {
967 		__asm __volatile(
968 		    "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT));
969 	}
970 	__asm __volatile(
971 	    "dsb  ish	\n"
972 	    "isb	\n");
973 }
974 
975 static __inline void
976 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
977 {
978 
979 	sched_pin();
980 	pmap_invalidate_range_nopin(pmap, sva, eva);
981 	sched_unpin();
982 }
983 
984 static __inline void
985 pmap_invalidate_all(pmap_t pmap)
986 {
987 
988 	sched_pin();
989 	__asm __volatile(
990 	    "dsb  ishst		\n"
991 	    "tlbi vmalle1is	\n"
992 	    "dsb  ish		\n"
993 	    "isb		\n");
994 	sched_unpin();
995 }
996 
997 /*
998  *	Routine:	pmap_extract
999  *	Function:
1000  *		Extract the physical page address associated
1001  *		with the given map/virtual_address pair.
1002  */
1003 vm_paddr_t
1004 pmap_extract(pmap_t pmap, vm_offset_t va)
1005 {
1006 	pt_entry_t *pte, tpte;
1007 	vm_paddr_t pa;
1008 	int lvl;
1009 
1010 	pa = 0;
1011 	PMAP_LOCK(pmap);
1012 	/*
1013 	 * Find the block or page map for this virtual address. pmap_pte
1014 	 * will return either a valid block/page entry, or NULL.
1015 	 */
1016 	pte = pmap_pte(pmap, va, &lvl);
1017 	if (pte != NULL) {
1018 		tpte = pmap_load(pte);
1019 		pa = tpte & ~ATTR_MASK;
1020 		switch(lvl) {
1021 		case 1:
1022 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1023 			    ("pmap_extract: Invalid L1 pte found: %lx",
1024 			    tpte & ATTR_DESCR_MASK));
1025 			pa |= (va & L1_OFFSET);
1026 			break;
1027 		case 2:
1028 			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1029 			    ("pmap_extract: Invalid L2 pte found: %lx",
1030 			    tpte & ATTR_DESCR_MASK));
1031 			pa |= (va & L2_OFFSET);
1032 			break;
1033 		case 3:
1034 			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1035 			    ("pmap_extract: Invalid L3 pte found: %lx",
1036 			    tpte & ATTR_DESCR_MASK));
1037 			pa |= (va & L3_OFFSET);
1038 			break;
1039 		}
1040 	}
1041 	PMAP_UNLOCK(pmap);
1042 	return (pa);
1043 }
1044 
1045 /*
1046  *	Routine:	pmap_extract_and_hold
1047  *	Function:
1048  *		Atomically extract and hold the physical page
1049  *		with the given pmap and virtual address pair
1050  *		if that mapping permits the given protection.
1051  */
1052 vm_page_t
1053 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1054 {
1055 	pt_entry_t *pte, tpte;
1056 	vm_offset_t off;
1057 	vm_paddr_t pa;
1058 	vm_page_t m;
1059 	int lvl;
1060 
1061 	pa = 0;
1062 	m = NULL;
1063 	PMAP_LOCK(pmap);
1064 retry:
1065 	pte = pmap_pte(pmap, va, &lvl);
1066 	if (pte != NULL) {
1067 		tpte = pmap_load(pte);
1068 
1069 		KASSERT(lvl > 0 && lvl <= 3,
1070 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1071 		CTASSERT(L1_BLOCK == L2_BLOCK);
1072 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1073 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1074 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1075 		     tpte & ATTR_DESCR_MASK));
1076 		if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
1077 		    ((prot & VM_PROT_WRITE) == 0)) {
1078 			switch(lvl) {
1079 			case 1:
1080 				off = va & L1_OFFSET;
1081 				break;
1082 			case 2:
1083 				off = va & L2_OFFSET;
1084 				break;
1085 			case 3:
1086 			default:
1087 				off = 0;
1088 			}
1089 			if (vm_page_pa_tryrelock(pmap,
1090 			    (tpte & ~ATTR_MASK) | off, &pa))
1091 				goto retry;
1092 			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
1093 			vm_page_hold(m);
1094 		}
1095 	}
1096 	PA_UNLOCK_COND(pa);
1097 	PMAP_UNLOCK(pmap);
1098 	return (m);
1099 }
1100 
1101 vm_paddr_t
1102 pmap_kextract(vm_offset_t va)
1103 {
1104 	pt_entry_t *pte, tpte;
1105 	vm_paddr_t pa;
1106 	int lvl;
1107 
1108 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1109 		pa = DMAP_TO_PHYS(va);
1110 	} else {
1111 		pa = 0;
1112 		pte = pmap_pte(kernel_pmap, va, &lvl);
1113 		if (pte != NULL) {
1114 			tpte = pmap_load(pte);
1115 			pa = tpte & ~ATTR_MASK;
1116 			switch(lvl) {
1117 			case 1:
1118 				KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1119 				    ("pmap_kextract: Invalid L1 pte found: %lx",
1120 				    tpte & ATTR_DESCR_MASK));
1121 				pa |= (va & L1_OFFSET);
1122 				break;
1123 			case 2:
1124 				KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1125 				    ("pmap_kextract: Invalid L2 pte found: %lx",
1126 				    tpte & ATTR_DESCR_MASK));
1127 				pa |= (va & L2_OFFSET);
1128 				break;
1129 			case 3:
1130 				KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1131 				    ("pmap_kextract: Invalid L3 pte found: %lx",
1132 				    tpte & ATTR_DESCR_MASK));
1133 				pa |= (va & L3_OFFSET);
1134 				break;
1135 			}
1136 		}
1137 	}
1138 	return (pa);
1139 }
1140 
1141 /***************************************************
1142  * Low level mapping routines.....
1143  ***************************************************/
1144 
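/*
 * Map the physically contiguous, page-aligned range [pa, pa + size) at the
 * kernel virtual address 'sva' using 4KB L3 mappings with the memory
 * attribute index given by 'mode'.
 */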
1145 void
1146 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1147 {
1148 	pd_entry_t *pde;
1149 	pt_entry_t *pte, attr;
1150 	vm_offset_t va;
1151 	int lvl;
1152 
1153 	KASSERT((pa & L3_OFFSET) == 0,
1154 	   ("pmap_kenter: Invalid physical address"));
1155 	KASSERT((sva & L3_OFFSET) == 0,
1156 	   ("pmap_kenter: Invalid virtual address"));
1157 	KASSERT((size & PAGE_MASK) == 0,
1158 	    ("pmap_kenter: Mapping is not page-sized"));
1159 
1160 	attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE;
1161 	if (mode == DEVICE_MEMORY)
1162 		attr |= ATTR_XN;
1163 
1164 	va = sva;
1165 	while (size != 0) {
1166 		pde = pmap_pde(kernel_pmap, va, &lvl);
1167 		KASSERT(pde != NULL,
1168 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1169 		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1170 
1171 		pte = pmap_l2_to_l3(pde, va);
1172 		pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
1173 
1174 		va += PAGE_SIZE;
1175 		pa += PAGE_SIZE;
1176 		size -= PAGE_SIZE;
1177 	}
1178 	pmap_invalidate_range(kernel_pmap, sva, va);
1179 }
1180 
1181 void
1182 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1183 {
1184 
1185 	pmap_kenter(sva, size, pa, DEVICE_MEMORY);
1186 }
1187 
1188 /*
1189  * Remove a page from the kernel pagetables.
1190  */
1191 PMAP_INLINE void
1192 pmap_kremove(vm_offset_t va)
1193 {
1194 	pt_entry_t *pte;
1195 	int lvl;
1196 
1197 	pte = pmap_pte(kernel_pmap, va, &lvl);
1198 	KASSERT(pte != NULL, ("pmap_kremove: Invalid address"));
1199 	KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl));
1200 
1201 	pmap_load_clear(pte);
1202 	pmap_invalidate_page(kernel_pmap, va);
1203 }
1204 
1205 void
1206 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1207 {
1208 	pt_entry_t *pte;
1209 	vm_offset_t va;
1210 	int lvl;
1211 
1212 	KASSERT((sva & L3_OFFSET) == 0,
1213 	   ("pmap_kremove_device: Invalid virtual address"));
1214 	KASSERT((size & PAGE_MASK) == 0,
1215 	    ("pmap_kremove_device: Mapping is not page-sized"));
1216 
1217 	va = sva;
1218 	while (size != 0) {
1219 		pte = pmap_pte(kernel_pmap, va, &lvl);
1220 		KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va));
1221 		KASSERT(lvl == 3,
1222 		    ("Invalid device pagetable level: %d != 3", lvl));
1223 		pmap_load_clear(pte);
1224 
1225 		va += PAGE_SIZE;
1226 		size -= PAGE_SIZE;
1227 	}
1228 	pmap_invalidate_range(kernel_pmap, sva, va);
1229 }
1230 
1231 /*
1232  *	Used to map a range of physical addresses into kernel
1233  *	virtual address space.
1234  *
1235  *	The value passed in '*virt' is a suggested virtual address for
1236  *	the mapping. Architectures which can support a direct-mapped
1237  *	physical to virtual region can return the appropriate address
1238  *	within that region, leaving '*virt' unchanged. Other
1239  *	architectures should map the pages starting at '*virt' and
1240  *	update '*virt' with the first usable address after the mapped
1241  *	region.
1242  */
1243 vm_offset_t
1244 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1245 {
1246 	return PHYS_TO_DMAP(start);
1247 }
1248 
1249 
1250 /*
1251  * Add a list of wired pages to the kva.  This routine is only used for
1252  * temporary kernel mappings that do not need to have page modification
1253  * or references recorded.  Note that old mappings are simply written
1254  * over.  The page *must* be wired.
1255  * Note: SMP coherent.  Uses a ranged, broadcast TLB invalidation rather
1256  * than an IPI-based shootdown.
1258  */
1259 void
1260 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1261 {
1262 	pd_entry_t *pde;
1263 	pt_entry_t *pte, pa;
1264 	vm_offset_t va;
1265 	vm_page_t m;
1266 	int i, lvl;
1267 
1268 	va = sva;
1269 	for (i = 0; i < count; i++) {
1270 		pde = pmap_pde(kernel_pmap, va, &lvl);
1271 		KASSERT(pde != NULL,
1272 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
1273 		KASSERT(lvl == 2,
1274 		    ("pmap_qenter: Invalid level %d", lvl));
1275 
1276 		m = ma[i];
1277 		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) |
1278 		    ATTR_IDX(m->md.pv_memattr) | L3_PAGE;
1279 		if (m->md.pv_memattr == DEVICE_MEMORY)
1280 			pa |= ATTR_XN;
1281 		pte = pmap_l2_to_l3(pde, va);
1282 		pmap_load_store(pte, pa);
1283 
1284 		va += L3_SIZE;
1285 	}
1286 	pmap_invalidate_range(kernel_pmap, sva, va);
1287 }
1288 
1289 /*
1290  * This routine tears out page mappings from the
1291  * kernel -- it is meant only for temporary mappings.
1292  */
1293 void
1294 pmap_qremove(vm_offset_t sva, int count)
1295 {
1296 	pt_entry_t *pte;
1297 	vm_offset_t va;
1298 	int lvl;
1299 
1300 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1301 
1302 	va = sva;
1303 	while (count-- > 0) {
1304 		pte = pmap_pte(kernel_pmap, va, &lvl);
1305 		KASSERT(lvl == 3,
1306 		    ("pmap_qremove: Invalid pte level %d", lvl));
1307 		if (pte != NULL) {
1308 			pmap_load_clear(pte);
1309 		}
1310 
1311 		va += PAGE_SIZE;
1312 	}
1313 	pmap_invalidate_range(kernel_pmap, sva, va);
1314 }
1315 
1316 /***************************************************
1317  * Page table page management routines.....
1318  ***************************************************/
1319 /*
1320  * Schedule the specified unused page table page to be freed.  Specifically,
1321  * add the page to the specified list of pages that will be released to the
1322  * physical memory manager after the TLB has been updated.
1323  */
1324 static __inline void
1325 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1326     boolean_t set_PG_ZERO)
1327 {
1328 
1329 	if (set_PG_ZERO)
1330 		m->flags |= PG_ZERO;
1331 	else
1332 		m->flags &= ~PG_ZERO;
1333 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1334 }
1335 
1336 /*
1337  * Decrements a page table page's wire count, which is used to record the
1338  * number of valid page table entries within the page.  If the wire count
1339  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1340  * page table page was unmapped and FALSE otherwise.
1341  */
1342 static inline boolean_t
1343 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1344 {
1345 
1346 	--m->wire_count;
1347 	if (m->wire_count == 0) {
1348 		_pmap_unwire_l3(pmap, va, m, free);
1349 		return (TRUE);
1350 	} else
1351 		return (FALSE);
1352 }
1353 
1354 static void
1355 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1356 {
1357 
1358 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1359 	/*
1360 	 * unmap the page table page
1361 	 */
1362 	if (m->pindex >= (NUL2E + NUL1E)) {
1363 		/* l1 page */
1364 		pd_entry_t *l0;
1365 
1366 		l0 = pmap_l0(pmap, va);
1367 		pmap_load_clear(l0);
1368 	} else if (m->pindex >= NUL2E) {
1369 		/* l2 page */
1370 		pd_entry_t *l1;
1371 
1372 		l1 = pmap_l1(pmap, va);
1373 		pmap_load_clear(l1);
1374 	} else {
1375 		/* l3 page */
1376 		pd_entry_t *l2;
1377 
1378 		l2 = pmap_l2(pmap, va);
1379 		pmap_load_clear(l2);
1380 	}
1381 	pmap_resident_count_dec(pmap, 1);
1382 	if (m->pindex < NUL2E) {
1383 		/* We just released an l3, unhold the matching l2 */
1384 		pd_entry_t *l1, tl1;
1385 		vm_page_t l2pg;
1386 
1387 		l1 = pmap_l1(pmap, va);
1388 		tl1 = pmap_load(l1);
1389 		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1390 		pmap_unwire_l3(pmap, va, l2pg, free);
1391 	} else if (m->pindex < (NUL2E + NUL1E)) {
1392 		/* We just released an l2, unhold the matching l1 */
1393 		pd_entry_t *l0, tl0;
1394 		vm_page_t l1pg;
1395 
1396 		l0 = pmap_l0(pmap, va);
1397 		tl0 = pmap_load(l0);
1398 		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1399 		pmap_unwire_l3(pmap, va, l1pg, free);
1400 	}
1401 	pmap_invalidate_page(pmap, va);
1402 
1403 	vm_wire_sub(1);
1404 
1405 	/*
1406 	 * Put page on a list so that it is released after
1407 	 * *ALL* TLB shootdown is done
1408 	 */
1409 	pmap_add_delayed_free_list(m, free, TRUE);
1410 }
1411 
1412 /*
1413  * After removing a page table entry, this routine is used to
1414  * conditionally free the page, and manage the hold/wire counts.
1415  */
1416 static int
1417 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1418     struct spglist *free)
1419 {
1420 	vm_page_t mpte;
1421 
1422 	if (va >= VM_MAXUSER_ADDRESS)
1423 		return (0);
1424 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1425 	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1426 	return (pmap_unwire_l3(pmap, va, mpte, free));
1427 }
1428 
1429 void
1430 pmap_pinit0(pmap_t pmap)
1431 {
1432 
1433 	PMAP_LOCK_INIT(pmap);
1434 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1435 	pmap->pm_l0 = kernel_pmap->pm_l0;
1436 	pmap->pm_root.rt_root = 0;
1437 }
1438 
1439 int
1440 pmap_pinit(pmap_t pmap)
1441 {
1442 	vm_paddr_t l0phys;
1443 	vm_page_t l0pt;
1444 
1445 	/*
1446 	 * allocate the l0 page
1447 	 */
1448 	while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
1449 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1450 		vm_wait(NULL);
1451 
1452 	l0phys = VM_PAGE_TO_PHYS(l0pt);
1453 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys);
1454 
1455 	if ((l0pt->flags & PG_ZERO) == 0)
1456 		pagezero(pmap->pm_l0);
1457 
1458 	pmap->pm_root.rt_root = 0;
1459 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1460 
1461 	return (1);
1462 }
1463 
1464 /*
1465  * This routine is called if the desired page table page does not exist.
1466  *
1467  * If page table page allocation fails, this routine may sleep before
1468  * returning NULL.  It sleeps only if a lock pointer was given.
1469  *
1470  * Note: If a page allocation fails at page table level two or three,
1471  * one or two pages may be held during the wait, only to be released
1472  * afterwards.  This conservative approach is easily argued to avoid
1473  * race conditions.
1474  */
1475 static vm_page_t
1476 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1477 {
1478 	vm_page_t m, l1pg, l2pg;
1479 
1480 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1481 
1482 	/*
1483 	 * Allocate a page table page.
1484 	 */
1485 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1486 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1487 		if (lockp != NULL) {
1488 			RELEASE_PV_LIST_LOCK(lockp);
1489 			PMAP_UNLOCK(pmap);
1490 			vm_wait(NULL);
1491 			PMAP_LOCK(pmap);
1492 		}
1493 
1494 		/*
1495 		 * Indicate the need to retry.  While waiting, the page table
1496 		 * page may have been allocated.
1497 		 */
1498 		return (NULL);
1499 	}
1500 	if ((m->flags & PG_ZERO) == 0)
1501 		pmap_zero_page(m);
1502 
1503 	/*
1504 	 * Map the pagetable page into the process address space, if
1505 	 * it isn't already there.
1506 	 */
1507 
1508 	if (ptepindex >= (NUL2E + NUL1E)) {
1509 		pd_entry_t *l0;
1510 		vm_pindex_t l0index;
1511 
1512 		l0index = ptepindex - (NUL2E + NUL1E);
1513 		l0 = &pmap->pm_l0[l0index];
1514 		pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
1515 	} else if (ptepindex >= NUL2E) {
1516 		vm_pindex_t l0index, l1index;
1517 		pd_entry_t *l0, *l1;
1518 		pd_entry_t tl0;
1519 
1520 		l1index = ptepindex - NUL2E;
1521 		l0index = l1index >> L0_ENTRIES_SHIFT;
1522 
1523 		l0 = &pmap->pm_l0[l0index];
1524 		tl0 = pmap_load(l0);
1525 		if (tl0 == 0) {
1526 			/* recurse for allocating page dir */
1527 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
1528 			    lockp) == NULL) {
1529 				vm_page_unwire_noq(m);
1530 				vm_page_free_zero(m);
1531 				return (NULL);
1532 			}
1533 		} else {
1534 			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1535 			l1pg->wire_count++;
1536 		}
1537 
1538 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
1539 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
1540 		pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
1541 	} else {
1542 		vm_pindex_t l0index, l1index;
1543 		pd_entry_t *l0, *l1, *l2;
1544 		pd_entry_t tl0, tl1;
1545 
1546 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
1547 		l0index = l1index >> L0_ENTRIES_SHIFT;
1548 
1549 		l0 = &pmap->pm_l0[l0index];
1550 		tl0 = pmap_load(l0);
1551 		if (tl0 == 0) {
1552 			/* recurse for allocating page dir */
1553 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1554 			    lockp) == NULL) {
1555 				vm_page_unwire_noq(m);
1556 				vm_page_free_zero(m);
1557 				return (NULL);
1558 			}
1559 			tl0 = pmap_load(l0);
1560 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
1561 			l1 = &l1[l1index & Ln_ADDR_MASK];
1562 		} else {
1563 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
1564 			l1 = &l1[l1index & Ln_ADDR_MASK];
1565 			tl1 = pmap_load(l1);
1566 			if (tl1 == 0) {
1567 				/* recurse for allocating page dir */
1568 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1569 				    lockp) == NULL) {
1570 					vm_page_unwire_noq(m);
1571 					vm_page_free_zero(m);
1572 					return (NULL);
1573 				}
1574 			} else {
1575 				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1576 				l2pg->wire_count++;
1577 			}
1578 		}
1579 
1580 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
1581 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1582 		pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
1583 	}
1584 
1585 	pmap_resident_count_inc(pmap, 1);
1586 
1587 	return (m);
1588 }
1589 
1590 static vm_page_t
1591 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1592 {
1593 	vm_pindex_t ptepindex;
1594 	pd_entry_t *pde, tpde;
1595 #ifdef INVARIANTS
1596 	pt_entry_t *pte;
1597 #endif
1598 	vm_page_t m;
1599 	int lvl;
1600 
1601 	/*
1602 	 * Calculate pagetable page index
1603 	 */
1604 	ptepindex = pmap_l2_pindex(va);
1605 retry:
1606 	/*
1607 	 * Get the page directory entry
1608 	 */
1609 	pde = pmap_pde(pmap, va, &lvl);
1610 
1611 	/*
1612 	 * If the page table page is mapped, we just increment the hold count,
1613 	 * and activate it. If we get a level 2 pde it will point to a level 3
1614 	 * table.
1615 	 */
1616 	switch (lvl) {
1617 	case -1:
1618 		break;
1619 	case 0:
1620 #ifdef INVARIANTS
1621 		pte = pmap_l0_to_l1(pde, va);
1622 		KASSERT(pmap_load(pte) == 0,
1623 		    ("pmap_alloc_l3: TODO: l0 superpages"));
1624 #endif
1625 		break;
1626 	case 1:
1627 #ifdef INVARIANTS
1628 		pte = pmap_l1_to_l2(pde, va);
1629 		KASSERT(pmap_load(pte) == 0,
1630 		    ("pmap_alloc_l3: TODO: l1 superpages"));
1631 #endif
1632 		break;
1633 	case 2:
1634 		tpde = pmap_load(pde);
1635 		if (tpde != 0) {
1636 			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
1637 			m->wire_count++;
1638 			return (m);
1639 		}
1640 		break;
1641 	default:
1642 		panic("pmap_alloc_l3: Invalid level %d", lvl);
1643 	}
1644 
1645 	/*
1646 	 * We get here if the pte page isn't mapped, or if it has been deallocated.
1647 	 */
1648 	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1649 	if (m == NULL && lockp != NULL)
1650 		goto retry;
1651 
1652 	return (m);
1653 }
1654 
1655 
1656 /***************************************************
1657  * Pmap allocation/deallocation routines.
1658  ***************************************************/
1659 
1660 /*
1661  * Release any resources held by the given physical map.
1662  * Called when a pmap initialized by pmap_pinit is being released.
1663  * Should only be called if the map contains no valid mappings.
1664  */
1665 void
1666 pmap_release(pmap_t pmap)
1667 {
1668 	vm_page_t m;
1669 
1670 	KASSERT(pmap->pm_stats.resident_count == 0,
1671 	    ("pmap_release: pmap resident count %ld != 0",
1672 	    pmap->pm_stats.resident_count));
1673 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1674 	    ("pmap_release: pmap has reserved page table page(s)"));
1675 
1676 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));
1677 
1678 	vm_page_unwire_noq(m);
1679 	vm_page_free_zero(m);
1680 }
1681 
1682 static int
1683 kvm_size(SYSCTL_HANDLER_ARGS)
1684 {
1685 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1686 
1687 	return sysctl_handle_long(oidp, &ksize, 0, req);
1688 }
1689 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1690     0, 0, kvm_size, "LU", "Size of KVM");
1691 
1692 static int
1693 kvm_free(SYSCTL_HANDLER_ARGS)
1694 {
1695 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1696 
1697 	return sysctl_handle_long(oidp, &kfree, 0, req);
1698 }
1699 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1700     0, 0, kvm_free, "LU", "Amount of KVM free");
1701 
1702 /*
1703  * grow the number of kernel page table entries, if needed
1704  */
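/*
 * The kernel map is grown in L2_SIZE (2MB) steps: a missing L2 page-table
 * page is allocated and hooked into the L1 entry first, then an empty L3
 * page-table page is installed in each missing L2 entry until
 * kernel_vm_end covers 'addr'.
 */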
1705 void
1706 pmap_growkernel(vm_offset_t addr)
1707 {
1708 	vm_paddr_t paddr;
1709 	vm_page_t nkpg;
1710 	pd_entry_t *l0, *l1, *l2;
1711 
1712 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1713 
1714 	addr = roundup2(addr, L2_SIZE);
1715 	if (addr - 1 >= kernel_map->max_offset)
1716 		addr = kernel_map->max_offset;
1717 	while (kernel_vm_end < addr) {
1718 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
1719 		KASSERT(pmap_load(l0) != 0,
1720 		    ("pmap_growkernel: No level 0 kernel entry"));
1721 
1722 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
1723 		if (pmap_load(l1) == 0) {
1724 			/* We need a new PDP entry */
1725 			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
1726 			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
1727 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1728 			if (nkpg == NULL)
1729 				panic("pmap_growkernel: no memory to grow kernel");
1730 			if ((nkpg->flags & PG_ZERO) == 0)
1731 				pmap_zero_page(nkpg);
1732 			paddr = VM_PAGE_TO_PHYS(nkpg);
1733 			pmap_load_store(l1, paddr | L1_TABLE);
1734 			continue; /* try again */
1735 		}
1736 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1737 		if ((pmap_load(l2) & ATTR_AF) != 0) {
1738 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1739 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1740 				kernel_vm_end = kernel_map->max_offset;
1741 				break;
1742 			}
1743 			continue;
1744 		}
1745 
1746 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
1747 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1748 		    VM_ALLOC_ZERO);
1749 		if (nkpg == NULL)
1750 			panic("pmap_growkernel: no memory to grow kernel");
1751 		if ((nkpg->flags & PG_ZERO) == 0)
1752 			pmap_zero_page(nkpg);
1753 		paddr = VM_PAGE_TO_PHYS(nkpg);
1754 		pmap_load_store(l2, paddr | L2_TABLE);
1755 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
1756 
1757 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1758 		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1759 			kernel_vm_end = kernel_map->max_offset;
1760 			break;
1761 		}
1762 	}
1763 }
1764 
1765 
1766 /***************************************************
1767  * page management routines.
1768  ***************************************************/
1769 
1770 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1771 CTASSERT(_NPCM == 3);
1772 CTASSERT(_NPCPV == 168);
1773 
1774 static __inline struct pv_chunk *
1775 pv_to_chunk(pv_entry_t pv)
1776 {
1777 
1778 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1779 }
1780 
1781 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1782 
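/*
 * Each pv_chunk holds _NPCPV (168) pv entries, tracked by the _NPCM (3)
 * 64-bit bitmap words in pc_map[]; a set bit marks a free entry.  Only the
 * low 40 bits of the third word are valid (64 + 64 + 40 = 168), which is
 * why PC_FREE2 is a partial mask.
 */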
1783 #define	PC_FREE0	0xfffffffffffffffful
1784 #define	PC_FREE1	0xfffffffffffffffful
1785 #define	PC_FREE2	0x000000fffffffffful
1786 
1787 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1788 
1789 #if 0
1790 #ifdef PV_STATS
1791 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1792 
1793 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1794 	"Current number of pv entry chunks");
1795 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1796 	"Current number of pv entry chunks allocated");
1797 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1798 	"Current number of pv entry chunks frees");
1799 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1800 	"Number of times tried to get a chunk page but failed.");
1801 
1802 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1803 static int pv_entry_spare;
1804 
1805 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1806 	"Current number of pv entry frees");
1807 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1808 	"Current number of pv entry allocs");
1809 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1810 	"Current number of pv entries");
1811 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1812 	"Current number of spare pv entries");
1813 #endif
1814 #endif /* 0 */
1815 
1816 /*
1817  * We are in a serious low memory condition.  Resort to
1818  * drastic measures to free some pages so we can allocate
1819  * another pv entry chunk.
1820  *
1821  * Returns NULL if PV entries were reclaimed from the specified pmap.
1822  *
1823  * We do not, however, unmap 2mpages because subsequent accesses will
1824  * allocate per-page pv entries until repromotion occurs, thereby
1825  * exacerbating the shortage of free pv entries.
1826  */
1827 static vm_page_t
1828 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1829 {
1830 	struct pch new_tail;
1831 	struct pv_chunk *pc;
1832 	struct md_page *pvh;
1833 	pd_entry_t *pde;
1834 	pmap_t pmap;
1835 	pt_entry_t *pte, tpte;
1836 	pv_entry_t pv;
1837 	vm_offset_t va;
1838 	vm_page_t m, m_pc;
1839 	struct spglist free;
1840 	uint64_t inuse;
1841 	int bit, field, freed, lvl;
1842 
1843 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1844 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
1845 	pmap = NULL;
1846 	m_pc = NULL;
1847 	SLIST_INIT(&free);
1848 	TAILQ_INIT(&new_tail);
1849 	mtx_lock(&pv_chunks_mutex);
1850 	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
1851 		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1852 		mtx_unlock(&pv_chunks_mutex);
1853 		if (pmap != pc->pc_pmap) {
1854 			if (pmap != NULL && pmap != locked_pmap)
1855 				PMAP_UNLOCK(pmap);
1856 			pmap = pc->pc_pmap;
1857 			/* Avoid deadlock and lock recursion. */
1858 			if (pmap > locked_pmap) {
1859 				RELEASE_PV_LIST_LOCK(lockp);
1860 				PMAP_LOCK(pmap);
1861 			} else if (pmap != locked_pmap &&
1862 			    !PMAP_TRYLOCK(pmap)) {
1863 				pmap = NULL;
1864 				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1865 				mtx_lock(&pv_chunks_mutex);
1866 				continue;
1867 			}
1868 		}
1869 
1870 		/*
1871 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
1872 		 */
1873 		freed = 0;
1874 		for (field = 0; field < _NPCM; field++) {
1875 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
1876 			    inuse != 0; inuse &= ~(1UL << bit)) {
1877 				bit = ffsl(inuse) - 1;
1878 				pv = &pc->pc_pventry[field * 64 + bit];
1879 				va = pv->pv_va;
1880 				pde = pmap_pde(pmap, va, &lvl);
1881 				if (lvl != 2)
1882 					continue;
1883 				pte = pmap_l2_to_l3(pde, va);
1884 				tpte = pmap_load(pte);
1885 				if ((tpte & ATTR_SW_WIRED) != 0)
1886 					continue;
1887 				tpte = pmap_load_clear(pte);
1888 				pmap_invalidate_page(pmap, va);
1889 				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
1890 				if (pmap_page_dirty(tpte))
1891 					vm_page_dirty(m);
1892 				if ((tpte & ATTR_AF) != 0)
1893 					vm_page_aflag_set(m, PGA_REFERENCED);
1894 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1895 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
1896 				m->md.pv_gen++;
1897 				if (TAILQ_EMPTY(&m->md.pv_list) &&
1898 				    (m->flags & PG_FICTITIOUS) == 0) {
1899 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1900 					if (TAILQ_EMPTY(&pvh->pv_list)) {
1901 						vm_page_aflag_clear(m,
1902 						    PGA_WRITEABLE);
1903 					}
1904 				}
1905 				pc->pc_map[field] |= 1UL << bit;
1906 				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
1907 				freed++;
1908 			}
1909 		}
1910 		if (freed == 0) {
1911 			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1912 			mtx_lock(&pv_chunks_mutex);
1913 			continue;
1914 		}
1915 		/* Every freed mapping is for a 4 KB page. */
1916 		pmap_resident_count_dec(pmap, freed);
1917 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
1918 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
1919 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
1920 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1921 		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
1922 		    pc->pc_map[2] == PC_FREE2) {
1923 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1924 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1925 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1926 			/* Entire chunk is free; return it. */
1927 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1928 			dump_drop_page(m_pc->phys_addr);
1929 			mtx_lock(&pv_chunks_mutex);
1930 			break;
1931 		}
1932 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1933 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1934 		mtx_lock(&pv_chunks_mutex);
1935 		/* One freed pv entry in locked_pmap is sufficient. */
1936 		if (pmap == locked_pmap)
1937 			break;
1938 	}
1939 	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
1940 	mtx_unlock(&pv_chunks_mutex);
1941 	if (pmap != NULL && pmap != locked_pmap)
1942 		PMAP_UNLOCK(pmap);
1943 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
1944 		m_pc = SLIST_FIRST(&free);
1945 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
1946 		/* Recycle a freed page table page. */
1947 		m_pc->wire_count = 1;
1948 		vm_wire_add(1);
1949 	}
1950 	vm_page_free_pages_toq(&free, false);
1951 	return (m_pc);
1952 }
1953 
1954 /*
1955  * free the pv_entry back to the free list
1956  */
1957 static void
1958 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1959 {
1960 	struct pv_chunk *pc;
1961 	int idx, field, bit;
1962 
1963 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1964 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1965 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1966 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1967 	pc = pv_to_chunk(pv);
1968 	idx = pv - &pc->pc_pventry[0];
1969 	field = idx / 64;
1970 	bit = idx % 64;
1971 	pc->pc_map[field] |= 1ul << bit;
1972 	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1973 	    pc->pc_map[2] != PC_FREE2) {
1974 		/* 98% of the time, pc is already at the head of the list. */
1975 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1976 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1977 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1978 		}
1979 		return;
1980 	}
1981 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1982 	free_pv_chunk(pc);
1983 }
1984 
1985 static void
1986 free_pv_chunk(struct pv_chunk *pc)
1987 {
1988 	vm_page_t m;
1989 
1990 	mtx_lock(&pv_chunks_mutex);
1991 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1992 	mtx_unlock(&pv_chunks_mutex);
1993 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1994 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1995 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1996 	/* entire chunk is free, return it */
1997 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1998 	dump_drop_page(m->phys_addr);
1999 	vm_page_unwire_noq(m);
2000 	vm_page_free(m);
2001 }
2002 
2003 /*
2004  * Returns a new PV entry, allocating a new PV chunk from the system when
2005  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2006  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2007  * returned.
2008  *
2009  * The given PV list lock may be released.
2010  */
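/*
 * Allocation path, in outline: first scan the pmap's own chunk list for a
 * set bit in pc_map (64 entries per bitmap word, so the entry index is
 * field * 64 + bit); failing that, allocate a new wired, unmanaged page,
 * access it through the direct map, and hand out entry 0, falling back to
 * reclaim_pv_chunk() only when vm_page_alloc() fails and a lock pointer
 * was supplied.
 */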
2011 static pv_entry_t
2012 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2013 {
2014 	int bit, field;
2015 	pv_entry_t pv;
2016 	struct pv_chunk *pc;
2017 	vm_page_t m;
2018 
2019 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2020 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2021 retry:
2022 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2023 	if (pc != NULL) {
2024 		for (field = 0; field < _NPCM; field++) {
2025 			if (pc->pc_map[field]) {
2026 				bit = ffsl(pc->pc_map[field]) - 1;
2027 				break;
2028 			}
2029 		}
2030 		if (field < _NPCM) {
2031 			pv = &pc->pc_pventry[field * 64 + bit];
2032 			pc->pc_map[field] &= ~(1ul << bit);
2033 			/* If this was the last item, move it to the tail */
2034 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2035 			    pc->pc_map[2] == 0) {
2036 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2037 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2038 				    pc_list);
2039 			}
2040 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2041 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2042 			return (pv);
2043 		}
2044 	}
2045 	/* No free items, allocate another chunk */
2046 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2047 	    VM_ALLOC_WIRED);
2048 	if (m == NULL) {
2049 		if (lockp == NULL) {
2050 			PV_STAT(pc_chunk_tryfail++);
2051 			return (NULL);
2052 		}
2053 		m = reclaim_pv_chunk(pmap, lockp);
2054 		if (m == NULL)
2055 			goto retry;
2056 	}
2057 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2058 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2059 	dump_add_page(m->phys_addr);
2060 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2061 	pc->pc_pmap = pmap;
2062 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2063 	pc->pc_map[1] = PC_FREE1;
2064 	pc->pc_map[2] = PC_FREE2;
2065 	mtx_lock(&pv_chunks_mutex);
2066 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2067 	mtx_unlock(&pv_chunks_mutex);
2068 	pv = &pc->pc_pventry[0];
2069 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2070 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2071 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2072 	return (pv);
2073 }
2074 
2075 /*
2076  * Ensure that the number of spare PV entries in the specified pmap meets or
2077  * exceeds the given count, "needed".
2078  *
2079  * The given PV list lock may be released.
2080  */
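/*
 * The count of spare entries is taken with bit_count() over each chunk's
 * pc_map; new chunks are allocated until "avail" covers "needed", and a
 * successful reclaim forces a recount because the reclaim may have freed
 * entries (or whole chunks) belonging to this pmap.
 */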
2081 static void
2082 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2083 {
2084 	struct pch new_tail;
2085 	struct pv_chunk *pc;
2086 	vm_page_t m;
2087 	int avail, free;
2088 	bool reclaimed;
2089 
2090 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2091 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2092 
2093 	/*
2094 	 * Newly allocated PV chunks must be stored in a private list until
2095 	 * the required number of PV chunks have been allocated.  Otherwise,
2096 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
2097 	 * contrast, these chunks must be added to the pmap upon allocation.
2098 	 */
2099 	TAILQ_INIT(&new_tail);
2100 retry:
2101 	avail = 0;
2102 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2103 		bit_count((bitstr_t *)pc->pc_map, 0,
2104 		    sizeof(pc->pc_map) * NBBY, &free);
2105 		if (free == 0)
2106 			break;
2107 		avail += free;
2108 		if (avail >= needed)
2109 			break;
2110 	}
2111 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
2112 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2113 		    VM_ALLOC_WIRED);
2114 		if (m == NULL) {
2115 			m = reclaim_pv_chunk(pmap, lockp);
2116 			if (m == NULL)
2117 				goto retry;
2118 			reclaimed = true;
2119 		}
2120 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2121 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2122 		dump_add_page(m->phys_addr);
2123 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2124 		pc->pc_pmap = pmap;
2125 		pc->pc_map[0] = PC_FREE0;
2126 		pc->pc_map[1] = PC_FREE1;
2127 		pc->pc_map[2] = PC_FREE2;
2128 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2129 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2130 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2131 
2132 		/*
2133 		 * The reclaim might have freed a chunk from the current pmap.
2134 		 * If that chunk contained available entries, we need to
2135 		 * re-count the number of available entries.
2136 		 */
2137 		if (reclaimed)
2138 			goto retry;
2139 	}
2140 	if (!TAILQ_EMPTY(&new_tail)) {
2141 		mtx_lock(&pv_chunks_mutex);
2142 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2143 		mtx_unlock(&pv_chunks_mutex);
2144 	}
2145 }
2146 
2147 /*
2148  * First find and then remove the pv entry for the specified pmap and virtual
2149  * address from the specified pv list.  Returns the pv entry if found and NULL
2150  * otherwise.  This operation can be performed on pv lists for either 4KB or
2151  * 2MB page mappings.
2152  */
2153 static __inline pv_entry_t
2154 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2155 {
2156 	pv_entry_t pv;
2157 
2158 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2159 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2160 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2161 			pvh->pv_gen++;
2162 			break;
2163 		}
2164 	}
2165 	return (pv);
2166 }
2167 
2168 /*
2169  * After demotion from a 2MB page mapping to 512 4KB page mappings,
2170  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2171  * entries for each of the 4KB page mappings.
2172  */
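/*
 * The replacement pv entries are carved out of spare slots already present
 * in the pmap's pv chunks (see the "missing spare" assertion below); no
 * chunk allocation, and therefore no sleeping or reclamation, happens here.
 */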
2173 static void
2174 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2175     struct rwlock **lockp)
2176 {
2177 	struct md_page *pvh;
2178 	struct pv_chunk *pc;
2179 	pv_entry_t pv;
2180 	vm_offset_t va_last;
2181 	vm_page_t m;
2182 	int bit, field;
2183 
2184 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2185 	KASSERT((pa & L2_OFFSET) == 0,
2186 	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
2187 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2188 
2189 	/*
2190 	 * Transfer the 2mpage's pv entry for this mapping to the first
2191 	 * page's pv list.  Once this transfer begins, the pv list lock
2192 	 * must not be released until the last pv entry is reinstantiated.
2193 	 */
2194 	pvh = pa_to_pvh(pa);
2195 	va = va & ~L2_OFFSET;
2196 	pv = pmap_pvh_remove(pvh, pmap, va);
2197 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2198 	m = PHYS_TO_VM_PAGE(pa);
2199 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2200 	m->md.pv_gen++;
2201 	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
2202 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2203 	va_last = va + L2_SIZE - PAGE_SIZE;
2204 	for (;;) {
2205 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2206 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
2207 		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
2208 		for (field = 0; field < _NPCM; field++) {
2209 			while (pc->pc_map[field]) {
2210 				bit = ffsl(pc->pc_map[field]) - 1;
2211 				pc->pc_map[field] &= ~(1ul << bit);
2212 				pv = &pc->pc_pventry[field * 64 + bit];
2213 				va += PAGE_SIZE;
2214 				pv->pv_va = va;
2215 				m++;
2216 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2217 			    ("pmap_pv_demote_l2: page %p is not managed", m));
2218 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2219 				m->md.pv_gen++;
2220 				if (va == va_last)
2221 					goto out;
2222 			}
2223 		}
2224 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2225 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2226 	}
2227 out:
2228 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
2229 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2230 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2231 	}
2232 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2233 	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
2234 }
2235 
2236 /*
2237  * First find and then destroy the pv entry for the specified pmap and virtual
2238  * address.  This operation can be performed on pv lists for either 4KB or 2MB
2239  * page mappings.
2240  */
2241 static void
2242 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2243 {
2244 	pv_entry_t pv;
2245 
2246 	pv = pmap_pvh_remove(pvh, pmap, va);
2247 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2248 	free_pv_entry(pmap, pv);
2249 }
2250 
2251 /*
2252  * Conditionally create the PV entry for a 4KB page mapping if the required
2253  * memory can be allocated without resorting to reclamation.
2254  */
2255 static boolean_t
2256 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2257     struct rwlock **lockp)
2258 {
2259 	pv_entry_t pv;
2260 
2261 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2262 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2263 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2264 		pv->pv_va = va;
2265 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2266 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2267 		m->md.pv_gen++;
2268 		return (TRUE);
2269 	} else
2270 		return (FALSE);
2271 }
2272 
2273 /*
2274  * pmap_remove_l2: do the things to unmap a level 2 superpage in a process
2275  */
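/*
 * The block entry is cleared and the whole 2MB range invalidated before the
 * per-page pv and dirty/referenced state is torn down; the L3 page table
 * page stashed at promotion time (if any) is recovered from the pmap's
 * radix tree and queued on the delayed free list.
 */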
2276 static int
2277 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2278     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2279 {
2280 	struct md_page *pvh;
2281 	pt_entry_t old_l2;
2282 	vm_offset_t eva, va;
2283 	vm_page_t m, ml3;
2284 
2285 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2286 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2287 	old_l2 = pmap_load_clear(l2);
2288 	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2289 	if (old_l2 & ATTR_SW_WIRED)
2290 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2291 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2292 	if (old_l2 & ATTR_SW_MANAGED) {
2293 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
2294 		pvh = pa_to_pvh(old_l2 & ~ATTR_MASK);
2295 		pmap_pvh_free(pvh, pmap, sva);
2296 		eva = sva + L2_SIZE;
2297 		for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
2298 		    va < eva; va += PAGE_SIZE, m++) {
2299 			if (pmap_page_dirty(old_l2))
2300 				vm_page_dirty(m);
2301 			if (old_l2 & ATTR_AF)
2302 				vm_page_aflag_set(m, PGA_REFERENCED);
2303 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2304 			    TAILQ_EMPTY(&pvh->pv_list))
2305 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2306 		}
2307 	}
2308 	KASSERT(pmap != kernel_pmap,
2309 	    ("Attempting to remove an l2 kernel page"));
2310 	ml3 = pmap_remove_pt_page(pmap, sva);
2311 	if (ml3 != NULL) {
2312 		pmap_resident_count_dec(pmap, 1);
2313 		KASSERT(ml3->wire_count == NL3PG,
2314 		    ("pmap_remove_l2: l3 page wire count error"));
2315 		ml3->wire_count = 1;
2316 		vm_page_unwire_noq(ml3);
2317 		pmap_add_delayed_free_list(ml3, free, FALSE);
2318 	}
2319 	return (pmap_unuse_pt(pmap, sva, l1e, free));
2320 }
2321 
2322 /*
2323  * pmap_remove_l3: do the things to unmap a page in a process
2324  */
2325 static int
2326 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2327     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2328 {
2329 	struct md_page *pvh;
2330 	pt_entry_t old_l3;
2331 	vm_page_t m;
2332 
2333 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2334 	old_l3 = pmap_load_clear(l3);
2335 	pmap_invalidate_page(pmap, va);
2336 	if (old_l3 & ATTR_SW_WIRED)
2337 		pmap->pm_stats.wired_count -= 1;
2338 	pmap_resident_count_dec(pmap, 1);
2339 	if (old_l3 & ATTR_SW_MANAGED) {
2340 		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
2341 		if (pmap_page_dirty(old_l3))
2342 			vm_page_dirty(m);
2343 		if (old_l3 & ATTR_AF)
2344 			vm_page_aflag_set(m, PGA_REFERENCED);
2345 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2346 		pmap_pvh_free(&m->md, pmap, va);
2347 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2348 		    (m->flags & PG_FICTITIOUS) == 0) {
2349 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2350 			if (TAILQ_EMPTY(&pvh->pv_list))
2351 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2352 		}
2353 	}
2354 	return (pmap_unuse_pt(pmap, va, l2e, free));
2355 }
2356 
2357 /*
2358  *	Remove the given range of addresses from the specified map.
2359  *
2360  *	It is assumed that the start and end are properly
2361  *	rounded to the page size.
2362  */
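/*
 * The walk below skips unmapped L0/L1/L2 ranges by advancing va_next,
 * removes an L2 block in one step when the range covers it entirely
 * (demoting it otherwise), and batches TLB invalidations for runs of
 * removed L3 entries with pmap_invalidate_range().
 */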
2363 void
2364 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2365 {
2366 	struct rwlock *lock;
2367 	vm_offset_t va, va_next;
2368 	pd_entry_t *l0, *l1, *l2;
2369 	pt_entry_t l3_paddr, *l3;
2370 	struct spglist free;
2371 
2372 	/*
2373 	 * Perform an unsynchronized read.  This is, however, safe.
2374 	 */
2375 	if (pmap->pm_stats.resident_count == 0)
2376 		return;
2377 
2378 	SLIST_INIT(&free);
2379 
2380 	PMAP_LOCK(pmap);
2381 
2382 	lock = NULL;
2383 	for (; sva < eva; sva = va_next) {
2384 
2385 		if (pmap->pm_stats.resident_count == 0)
2386 			break;
2387 
2388 		l0 = pmap_l0(pmap, sva);
2389 		if (pmap_load(l0) == 0) {
2390 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2391 			if (va_next < sva)
2392 				va_next = eva;
2393 			continue;
2394 		}
2395 
2396 		l1 = pmap_l0_to_l1(l0, sva);
2397 		if (pmap_load(l1) == 0) {
2398 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2399 			if (va_next < sva)
2400 				va_next = eva;
2401 			continue;
2402 		}
2403 
2404 		/*
2405 		 * Calculate index for next page table.
2406 		 */
2407 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2408 		if (va_next < sva)
2409 			va_next = eva;
2410 
2411 		l2 = pmap_l1_to_l2(l1, sva);
2412 		if (l2 == NULL)
2413 			continue;
2414 
2415 		l3_paddr = pmap_load(l2);
2416 
2417 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
2418 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2419 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
2420 				    &free, &lock);
2421 				continue;
2422 			} else if (pmap_demote_l2_locked(pmap, l2,
2423 			    sva & ~L2_OFFSET, &lock) == NULL)
2424 				continue;
2425 			l3_paddr = pmap_load(l2);
2426 		}
2427 
2428 		/*
2429 		 * Weed out invalid mappings.
2430 		 */
2431 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
2432 			continue;
2433 
2434 		/*
2435 		 * Limit our scan to either the end of the va represented
2436 		 * by the current page table page, or to the end of the
2437 		 * range being removed.
2438 		 */
2439 		if (va_next > eva)
2440 			va_next = eva;
2441 
2442 		va = va_next;
2443 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2444 		    sva += L3_SIZE) {
2445 			if (l3 == NULL)
2446 				panic("l3 == NULL");
2447 			if (pmap_load(l3) == 0) {
2448 				if (va != va_next) {
2449 					pmap_invalidate_range(pmap, va, sva);
2450 					va = va_next;
2451 				}
2452 				continue;
2453 			}
2454 			if (va == va_next)
2455 				va = sva;
2456 			if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
2457 			    &lock)) {
2458 				sva += L3_SIZE;
2459 				break;
2460 			}
2461 		}
2462 		if (va != va_next)
2463 			pmap_invalidate_range(pmap, va, sva);
2464 	}
2465 	if (lock != NULL)
2466 		rw_wunlock(lock);
2467 	PMAP_UNLOCK(pmap);
2468 	vm_page_free_pages_toq(&free, false);
2469 }
2470 
2471 /*
2472  *	Routine:	pmap_remove_all
2473  *	Function:
2474  *		Removes this physical page from
2475  *		all physical maps in which it resides.
2476  *		Reflects back modify bits to the pager.
2477  *
2478  *	Notes:
2479  *		Original versions of this routine were very
2480  *		inefficient because they iteratively called
2481  *		pmap_remove (slow...)
2482  */
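/*
 * Two passes are made below: the first demotes any 2MB mappings of the page
 * so that the second only has to remove 4KB mappings.  The trylock and
 * generation-count dance reacquires the locks in a safe order whenever the
 * pmap lock cannot be taken while the pv list lock is held.
 */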
2483 
2484 void
2485 pmap_remove_all(vm_page_t m)
2486 {
2487 	struct md_page *pvh;
2488 	pv_entry_t pv;
2489 	pmap_t pmap;
2490 	struct rwlock *lock;
2491 	pd_entry_t *pde, tpde;
2492 	pt_entry_t *pte, tpte;
2493 	vm_offset_t va;
2494 	struct spglist free;
2495 	int lvl, pvh_gen, md_gen;
2496 
2497 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2498 	    ("pmap_remove_all: page %p is not managed", m));
2499 	SLIST_INIT(&free);
2500 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2501 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2502 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
2503 retry:
2504 	rw_wlock(lock);
2505 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2506 		pmap = PV_PMAP(pv);
2507 		if (!PMAP_TRYLOCK(pmap)) {
2508 			pvh_gen = pvh->pv_gen;
2509 			rw_wunlock(lock);
2510 			PMAP_LOCK(pmap);
2511 			rw_wlock(lock);
2512 			if (pvh_gen != pvh->pv_gen) {
2513 				rw_wunlock(lock);
2514 				PMAP_UNLOCK(pmap);
2515 				goto retry;
2516 			}
2517 		}
2518 		va = pv->pv_va;
2519 		pte = pmap_pte(pmap, va, &lvl);
2520 		KASSERT(pte != NULL,
2521 		    ("pmap_remove_all: no page table entry found"));
2522 		KASSERT(lvl == 2,
2523 		    ("pmap_remove_all: invalid pte level %d", lvl));
2524 
2525 		pmap_demote_l2_locked(pmap, pte, va, &lock);
2526 		PMAP_UNLOCK(pmap);
2527 	}
2528 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2529 		pmap = PV_PMAP(pv);
2530 		if (!PMAP_TRYLOCK(pmap)) {
2531 			pvh_gen = pvh->pv_gen;
2532 			md_gen = m->md.pv_gen;
2533 			rw_wunlock(lock);
2534 			PMAP_LOCK(pmap);
2535 			rw_wlock(lock);
2536 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
2537 				rw_wunlock(lock);
2538 				PMAP_UNLOCK(pmap);
2539 				goto retry;
2540 			}
2541 		}
2542 		pmap_resident_count_dec(pmap, 1);
2543 
2544 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
2545 		KASSERT(pde != NULL,
2546 		    ("pmap_remove_all: no page directory entry found"));
2547 		KASSERT(lvl == 2,
2548 		    ("pmap_remove_all: invalid pde level %d", lvl));
2549 		tpde = pmap_load(pde);
2550 
2551 		pte = pmap_l2_to_l3(pde, pv->pv_va);
2552 		tpte = pmap_load(pte);
2553 		pmap_load_clear(pte);
2554 		pmap_invalidate_page(pmap, pv->pv_va);
2555 		if (tpte & ATTR_SW_WIRED)
2556 			pmap->pm_stats.wired_count--;
2557 		if ((tpte & ATTR_AF) != 0)
2558 			vm_page_aflag_set(m, PGA_REFERENCED);
2559 
2560 		/*
2561 		 * Update the vm_page_t clean and reference bits.
2562 		 */
2563 		if (pmap_page_dirty(tpte))
2564 			vm_page_dirty(m);
2565 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
2566 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2567 		m->md.pv_gen++;
2568 		free_pv_entry(pmap, pv);
2569 		PMAP_UNLOCK(pmap);
2570 	}
2571 	vm_page_aflag_clear(m, PGA_WRITEABLE);
2572 	rw_wunlock(lock);
2573 	vm_page_free_pages_toq(&free, false);
2574 }
2575 
2576 /*
2577  *	Set the physical protection on the
2578  *	specified range of this map as requested.
2579  */
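/*
 * Only permissions are ever tightened here: removing write access transfers
 * any dirty state to the vm_page and sets the entries read-only, and
 * removing execute access sets ATTR_XN; 2MB blocks in the range are demoted
 * first so the change can be applied per 4KB entry.
 */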
2580 void
2581 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2582 {
2583 	vm_offset_t va, va_next;
2584 	pd_entry_t *l0, *l1, *l2;
2585 	pt_entry_t *l3p, l3, nbits;
2586 
2587 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
2588 	if (prot == VM_PROT_NONE) {
2589 		pmap_remove(pmap, sva, eva);
2590 		return;
2591 	}
2592 
2593 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2594 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
2595 		return;
2596 
2597 	PMAP_LOCK(pmap);
2598 	for (; sva < eva; sva = va_next) {
2599 
2600 		l0 = pmap_l0(pmap, sva);
2601 		if (pmap_load(l0) == 0) {
2602 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2603 			if (va_next < sva)
2604 				va_next = eva;
2605 			continue;
2606 		}
2607 
2608 		l1 = pmap_l0_to_l1(l0, sva);
2609 		if (pmap_load(l1) == 0) {
2610 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2611 			if (va_next < sva)
2612 				va_next = eva;
2613 			continue;
2614 		}
2615 
2616 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2617 		if (va_next < sva)
2618 			va_next = eva;
2619 
2620 		l2 = pmap_l1_to_l2(l1, sva);
2621 		if (pmap_load(l2) == 0)
2622 			continue;
2623 
2624 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
2625 			l3p = pmap_demote_l2(pmap, l2, sva);
2626 			if (l3p == NULL)
2627 				continue;
2628 		}
2629 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
2630 		    ("pmap_protect: Invalid L2 entry after demotion"));
2631 
2632 		if (va_next > eva)
2633 			va_next = eva;
2634 
2635 		va = va_next;
2636 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
2637 		    sva += L3_SIZE) {
2638 			l3 = pmap_load(l3p);
2639 			if (!pmap_l3_valid(l3))
2640 				continue;
2641 
2642 			nbits = 0;
2643 			if ((prot & VM_PROT_WRITE) == 0) {
2644 				if ((l3 & ATTR_SW_MANAGED) &&
2645 				    pmap_page_dirty(l3)) {
2646 					vm_page_dirty(PHYS_TO_VM_PAGE(l3 &
2647 					    ~ATTR_MASK));
2648 				}
2649 				nbits |= ATTR_AP(ATTR_AP_RO);
2650 			}
2651 			if ((prot & VM_PROT_EXECUTE) == 0)
2652 				nbits |= ATTR_XN;
2653 
2654 			pmap_set(l3p, nbits);
2655 			/* XXX: Use pmap_invalidate_range */
2656 			pmap_invalidate_page(pmap, sva);
2657 		}
2658 	}
2659 	PMAP_UNLOCK(pmap);
2660 }
2661 
2662 /*
2663  * Inserts the specified page table page into the specified pmap's collection
2664  * of idle page table pages.  Each of a pmap's page table pages is responsible
2665  * for mapping a distinct range of virtual addresses.  The pmap's collection is
2666  * ordered by this virtual address range.
2667  */
2668 static __inline int
2669 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2670 {
2671 
2672 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2673 	return (vm_radix_insert(&pmap->pm_root, mpte));
2674 }
2675 
2676 /*
2677  * Removes the page table page mapping the specified virtual address from the
2678  * specified pmap's collection of idle page table pages, and returns it.
2679  * Otherwise, returns NULL if there is no page table page corresponding to the
2680  * specified virtual address.
2681  */
2682 static __inline vm_page_t
2683 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
2684 {
2685 
2686 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2687 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
2688 }
2689 
2690 /*
2691  * Performs a break-before-make update of a pmap entry. This is needed when
2692  * either promoting or demoting pages to ensure the TLB doesn't get into an
2693  * inconsistent state.
2694  */
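/*
 * The architecture requires break-before-make when changing the block size
 * or output address of a valid translation table entry; skipping it can
 * result in TLB conflict aborts or a mix of stale and new translations.
 * The sequence below is: disable interrupts, clear the entry, invalidate
 * the covered range, then install the new entry.
 */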
2695 static void
2696 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
2697     vm_offset_t va, vm_size_t size)
2698 {
2699 	register_t intr;
2700 
2701 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2702 
2703 	/*
2704 	 * Ensure we don't get switched out with the page table in an
2705 	 * inconsistent state. We also need to ensure no interrupts fire
2706 	 * as they may make use of an address we are about to invalidate.
2707 	 */
2708 	intr = intr_disable();
2709 	critical_enter();
2710 
2711 	/* Clear the old mapping */
2712 	pmap_load_clear(pte);
2713 	pmap_invalidate_range_nopin(pmap, va, va + size);
2714 
2715 	/* Create the new mapping */
2716 	pmap_load_store(pte, newpte);
2717 
2718 	critical_exit();
2719 	intr_restore(intr);
2720 }
2721 
2722 #if VM_NRESERVLEVEL > 0
2723 /*
2724  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
2725  * replace the many pv entries for the 4KB page mappings by a single pv entry
2726  * for the 2MB page mapping.
2727  */
2728 static void
2729 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2730     struct rwlock **lockp)
2731 {
2732 	struct md_page *pvh;
2733 	pv_entry_t pv;
2734 	vm_offset_t va_last;
2735 	vm_page_t m;
2736 
2737 	KASSERT((pa & L2_OFFSET) == 0,
2738 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
2739 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2740 
2741 	/*
2742 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
2743 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
2744 	 * a transfer avoids the possibility that get_pv_entry() calls
2745 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
2746 	 * mappings that is being promoted.
2747 	 */
2748 	m = PHYS_TO_VM_PAGE(pa);
2749 	va = va & ~L2_OFFSET;
2750 	pv = pmap_pvh_remove(&m->md, pmap, va);
2751 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
2752 	pvh = pa_to_pvh(pa);
2753 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2754 	pvh->pv_gen++;
2755 	/* Free the remaining Ln_ENTRIES - 1 pv entries. */
2756 	va_last = va + L2_SIZE - PAGE_SIZE;
2757 	do {
2758 		m++;
2759 		va += PAGE_SIZE;
2760 		pmap_pvh_free(&m->md, pmap, va);
2761 	} while (va < va_last);
2762 }
2763 
2764 /*
2765  * Tries to promote the 512, contiguous 4KB page mappings that are within a
2766  * single level 2 table entry to a single 2MB page mapping.  For promotion
2767  * to occur, two conditions must be met: (1) the 4KB page mappings must map
2768  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2769  * identical characteristics.
2770  */
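/*
 * The loop below walks the 512 L3 entries backwards, comparing each against
 * the value expected from the first entry (its attributes plus a physical
 * address descending by PAGE_SIZE), so a single mismatch in either the
 * attributes or the physical contiguity aborts the promotion.
 */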
2771 static void
2772 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2773     struct rwlock **lockp)
2774 {
2775 	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
2776 	vm_page_t mpte;
2777 	vm_offset_t sva;
2778 
2779 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2780 
2781 	sva = va & ~L2_OFFSET;
2782 	firstl3 = pmap_l2_to_l3(l2, sva);
2783 	newl2 = pmap_load(firstl3);
2784 
2785 	/* Check that the alignment is valid */
2786 	if (((newl2 & ~ATTR_MASK) & L2_OFFSET) != 0) {
2787 		atomic_add_long(&pmap_l2_p_failures, 1);
2788 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
2789 		    " in pmap %p", va, pmap);
2790 		return;
2791 	}
2792 
2793 	pa = newl2 + L2_SIZE - PAGE_SIZE;
2794 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
2795 		oldl3 = pmap_load(l3);
2796 		if (oldl3 != pa) {
2797 			atomic_add_long(&pmap_l2_p_failures, 1);
2798 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
2799 			    " in pmap %p", va, pmap);
2800 			return;
2801 		}
2802 		pa -= PAGE_SIZE;
2803 	}
2804 
2805 	/*
2806 	 * Save the page table page in its current state until the L2
2807 	 * mapping the superpage is demoted by pmap_demote_l2() or
2808 	 * destroyed by pmap_remove_l3().
2809 	 */
2810 	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
2811 	KASSERT(mpte >= vm_page_array &&
2812 	    mpte < &vm_page_array[vm_page_array_size],
2813 	    ("pmap_promote_l2: page table page is out of range"));
2814 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
2815 	    ("pmap_promote_l2: page table page's pindex is wrong"));
2816 	if (pmap_insert_pt_page(pmap, mpte)) {
2817 		atomic_add_long(&pmap_l2_p_failures, 1);
2818 		CTR2(KTR_PMAP,
2819 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
2820 		    pmap);
2821 		return;
2822 	}
2823 
2824 	if ((newl2 & ATTR_SW_MANAGED) != 0)
2825 		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
2826 
2827 	newl2 &= ~ATTR_DESCR_MASK;
2828 	newl2 |= L2_BLOCK;
2829 
2830 	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
2831 
2832 	atomic_add_long(&pmap_l2_promotions, 1);
2833 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
2834 	    pmap);
2835 }
2836 #endif /* VM_NRESERVLEVEL > 0 */
2837 
2838 /*
2839  *	Insert the given physical page (p) at
2840  *	the specified virtual address (v) in the
2841  *	target physical map with the protection requested.
2842  *
2843  *	If specified, the page will be wired down, meaning
2844  *	that the related pte can not be reclaimed.
2845  *
2846  *	NB:  This is the only routine which MAY NOT lazy-evaluate
2847  *	or lose information.  That is, this routine must actually
2848  *	insert this page into the given map NOW.
2849  */
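/*
 * In outline: the new L3 entry is assembled from the page's physical
 * address and memory attributes, the page table walk is performed (building
 * missing intermediate tables for kernel addresses, or allocating an L3
 * page via pmap_alloc_l3() for user addresses), an existing mapping at the
 * address is dealt with (wiring, protection, or physical address change),
 * the page is entered on its pv list when managed, the icache is synced if
 * needed, and finally the entry is stored and a superpage promotion is
 * attempted.
 */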
2850 int
2851 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2852     u_int flags, int8_t psind __unused)
2853 {
2854 	struct rwlock *lock;
2855 	pd_entry_t *pde;
2856 	pt_entry_t new_l3, orig_l3;
2857 	pt_entry_t *l2, *l3;
2858 	pv_entry_t pv;
2859 	vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa;
2860 	vm_page_t mpte, om, l1_m, l2_m, l3_m;
2861 	boolean_t nosleep;
2862 	int lvl;
2863 
2864 	va = trunc_page(va);
2865 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
2866 		VM_OBJECT_ASSERT_LOCKED(m->object);
2867 	pa = VM_PAGE_TO_PHYS(m);
2868 	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
2869 	    L3_PAGE);
2870 	if ((prot & VM_PROT_WRITE) == 0)
2871 		new_l3 |= ATTR_AP(ATTR_AP_RO);
2872 	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
2873 		new_l3 |= ATTR_XN;
2874 	if ((flags & PMAP_ENTER_WIRED) != 0)
2875 		new_l3 |= ATTR_SW_WIRED;
2876 	if (va < VM_MAXUSER_ADDRESS)
2877 		new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
2878 	if ((m->oflags & VPO_UNMANAGED) == 0)
2879 		new_l3 |= ATTR_SW_MANAGED;
2880 
2881 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
2882 
2883 	mpte = NULL;
2884 
2885 	lock = NULL;
2886 	PMAP_LOCK(pmap);
2887 
2888 	pde = pmap_pde(pmap, va, &lvl);
2889 	if (pde != NULL && lvl == 1) {
2890 		l2 = pmap_l1_to_l2(pde, va);
2891 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
2892 		    (l3 = pmap_demote_l2_locked(pmap, l2, va & ~L2_OFFSET,
2893 		    &lock)) != NULL) {
2894 			l3 = &l3[pmap_l3_index(va)];
2895 			if (va < VM_MAXUSER_ADDRESS) {
2896 				mpte = PHYS_TO_VM_PAGE(
2897 				    pmap_load(l2) & ~ATTR_MASK);
2898 				mpte->wire_count++;
2899 			}
2900 			goto havel3;
2901 		}
2902 	}
2903 
2904 	if (va < VM_MAXUSER_ADDRESS) {
2905 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2906 		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
2907 		if (mpte == NULL && nosleep) {
2908 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
2909 			if (lock != NULL)
2910 				rw_wunlock(lock);
2911 			PMAP_UNLOCK(pmap);
2912 			return (KERN_RESOURCE_SHORTAGE);
2913 		}
2914 		pde = pmap_pde(pmap, va, &lvl);
2915 		KASSERT(pde != NULL,
2916 		    ("pmap_enter: Invalid page entry, va: 0x%lx", va));
2917 		KASSERT(lvl == 2,
2918 		    ("pmap_enter: Invalid level %d", lvl));
2919 	} else {
2920 		/*
2921 		 * If we get a level 2 pde it must point to a level 3 entry
2922 		 * otherwise we will need to create the intermediate tables
2923 		 */
2924 		if (lvl < 2) {
2925 			switch (lvl) {
2926 			default:
2927 			case -1:
2928 				/* Get the l0 pde to update */
2929 				pde = pmap_l0(pmap, va);
2930 				KASSERT(pde != NULL, ("..."));
2931 
2932 				l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2933 				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2934 				    VM_ALLOC_ZERO);
2935 				if (l1_m == NULL)
2936 					panic("pmap_enter: l1 pte_m == NULL");
2937 				if ((l1_m->flags & PG_ZERO) == 0)
2938 					pmap_zero_page(l1_m);
2939 
2940 				l1_pa = VM_PAGE_TO_PHYS(l1_m);
2941 				pmap_load_store(pde, l1_pa | L0_TABLE);
2942 				/* FALLTHROUGH */
2943 			case 0:
2944 				/* Get the l1 pde to update */
2945 				pde = pmap_l0_to_l1(pde, va);
2946 				KASSERT(pde != NULL, ("..."));
2947 
2948 				l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2949 				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2950 				    VM_ALLOC_ZERO);
2951 				if (l2_m == NULL)
2952 					panic("pmap_enter: l2 pte_m == NULL");
2953 				if ((l2_m->flags & PG_ZERO) == 0)
2954 					pmap_zero_page(l2_m);
2955 
2956 				l2_pa = VM_PAGE_TO_PHYS(l2_m);
2957 				pmap_load_store(pde, l2_pa | L1_TABLE);
2958 				/* FALLTHROUGH */
2959 			case 1:
2960 				/* Get the l2 pde to update */
2961 				pde = pmap_l1_to_l2(pde, va);
2962 				KASSERT(pde != NULL, ("..."));
2963 
2964 				l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2965 				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2966 				    VM_ALLOC_ZERO);
2967 				if (l3_m == NULL)
2968 					panic("pmap_enter: l3 pte_m == NULL");
2969 				if ((l3_m->flags & PG_ZERO) == 0)
2970 					pmap_zero_page(l3_m);
2971 
2972 				l3_pa = VM_PAGE_TO_PHYS(l3_m);
2973 				pmap_load_store(pde, l3_pa | L2_TABLE);
2974 				break;
2975 			}
2976 		}
2977 	}
2978 	l3 = pmap_l2_to_l3(pde, va);
2979 
2980 havel3:
2981 	orig_l3 = pmap_load(l3);
2982 	opa = orig_l3 & ~ATTR_MASK;
2983 	pv = NULL;
2984 
2985 	/*
2986 	 * Is the specified virtual address already mapped?
2987 	 */
2988 	if (pmap_l3_valid(orig_l3)) {
2989 		/*
2990 		 * Wiring change, just update stats. We don't worry about
2991 		 * wiring PT pages as they remain resident as long as there
2992 		 * are valid mappings in them. Hence, if a user page is wired,
2993 		 * the PT page will be also.
2994 		 */
2995 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
2996 		    (orig_l3 & ATTR_SW_WIRED) == 0)
2997 			pmap->pm_stats.wired_count++;
2998 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
2999 		    (orig_l3 & ATTR_SW_WIRED) != 0)
3000 			pmap->pm_stats.wired_count--;
3001 
3002 		/*
3003 		 * Remove the extra PT page reference.
3004 		 */
3005 		if (mpte != NULL) {
3006 			mpte->wire_count--;
3007 			KASSERT(mpte->wire_count > 0,
3008 			    ("pmap_enter: missing reference to page table page,"
3009 			     " va: 0x%lx", va));
3010 		}
3011 
3012 		/*
3013 		 * Has the physical page changed?
3014 		 */
3015 		if (opa == pa) {
3016 			/*
3017 			 * No, might be a protection or wiring change.
3018 			 */
3019 			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
3020 				if ((new_l3 & ATTR_AP(ATTR_AP_RW)) ==
3021 				    ATTR_AP(ATTR_AP_RW)) {
3022 					vm_page_aflag_set(m, PGA_WRITEABLE);
3023 				}
3024 			}
3025 			goto validate;
3026 		}
3027 
3028 		/*
3029 		 * The physical page has changed.
3030 		 */
3031 		(void)pmap_load_clear(l3);
3032 		KASSERT((orig_l3 & ~ATTR_MASK) == opa,
3033 		    ("pmap_enter: unexpected pa update for %#lx", va));
3034 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
3035 			om = PHYS_TO_VM_PAGE(opa);
3036 
3037 			/*
3038 			 * The pmap lock is sufficient to synchronize with
3039 			 * concurrent calls to pmap_page_test_mappings() and
3040 			 * pmap_ts_referenced().
3041 			 */
3042 			if (pmap_page_dirty(orig_l3))
3043 				vm_page_dirty(om);
3044 			if ((orig_l3 & ATTR_AF) != 0)
3045 				vm_page_aflag_set(om, PGA_REFERENCED);
3046 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3047 			pv = pmap_pvh_remove(&om->md, pmap, va);
3048 			if ((m->oflags & VPO_UNMANAGED) != 0)
3049 				free_pv_entry(pmap, pv);
3050 			if ((om->aflags & PGA_WRITEABLE) != 0 &&
3051 			    TAILQ_EMPTY(&om->md.pv_list) &&
3052 			    ((om->flags & PG_FICTITIOUS) != 0 ||
3053 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3054 				vm_page_aflag_clear(om, PGA_WRITEABLE);
3055 		}
3056 		pmap_invalidate_page(pmap, va);
3057 		orig_l3 = 0;
3058 	} else {
3059 		/*
3060 		 * Increment the counters.
3061 		 */
3062 		if ((new_l3 & ATTR_SW_WIRED) != 0)
3063 			pmap->pm_stats.wired_count++;
3064 		pmap_resident_count_inc(pmap, 1);
3065 	}
3066 	/*
3067 	 * Enter on the PV list if part of our managed memory.
3068 	 */
3069 	if ((m->oflags & VPO_UNMANAGED) == 0) {
3070 		if (pv == NULL) {
3071 			pv = get_pv_entry(pmap, &lock);
3072 			pv->pv_va = va;
3073 		}
3074 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3075 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3076 		m->md.pv_gen++;
3077 		if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
3078 			vm_page_aflag_set(m, PGA_WRITEABLE);
3079 	}
3080 
3081 validate:
3082 	/*
3083 	 * Sync icache if exec permission and attribute VM_MEMATTR_WRITE_BACK
3084 	 * is set.  Do it now, before the mapping is stored and made
3085 	 * valid for the hardware table walk.  If done later, another
3086 	 * CPU could access this page before the caches are properly
3087 	 * synced.  Don't do it for kernel memory which is mapped with
3088 	 * exec permission even if the memory isn't going to hold
3089 	 * executable code.  The only time an icache sync is needed is
3090 	 * after a kernel module is loaded and the relocation info is
3091 	 * processed.  And it's done in elf_cpu_load_file().
3092 	 */
3093 	if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
3094 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
3095 	    (opa != pa || (orig_l3 & ATTR_XN)))
3096 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
3097 
3098 	/*
3099 	 * Update the L3 entry
3100 	 */
3101 	if (pmap_l3_valid(orig_l3)) {
3102 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
3103 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
3104 			/* same PA, different attributes */
3105 			pmap_load_store(l3, new_l3);
3106 			pmap_invalidate_page(pmap, va);
3107 			if (pmap_page_dirty(orig_l3) &&
3108 			    (orig_l3 & ATTR_SW_MANAGED) != 0)
3109 				vm_page_dirty(m);
3110 		} else {
3111 			/*
3112 			 * orig_l3 == new_l3
3113 			 * This can happen if multiple threads simultaneously
3114 			 * access a not yet mapped page.  This is bad for
3115 			 * performance since it can cause a full
3116 			 * demotion-NOP-promotion cycle.
3117 			 * Other possible reasons are:
3118 			 * - the VM and pmap memory layouts have diverged
3119 			 * - a TLB flush is missing somewhere and the CPU
3120 			 *   doesn't see the actual mapping.
3121 			 */
3122 			CTR4(KTR_PMAP, "%s: already mapped page - "
3123 			    "pmap %p va %#lx pte %#lx",
3124 			    __func__, pmap, va, new_l3);
3125 		}
3126 	} else {
3127 		/* New mapping */
3128 		pmap_load_store(l3, new_l3);
3129 	}
3130 
3131 #if VM_NRESERVLEVEL > 0
3132 	if (pmap != pmap_kernel() &&
3133 	    (mpte == NULL || mpte->wire_count == NL3PG) &&
3134 	    pmap_superpages_enabled() &&
3135 	    (m->flags & PG_FICTITIOUS) == 0 &&
3136 	    vm_reserv_level_iffullpop(m) == 0) {
3137 		pmap_promote_l2(pmap, pde, va, &lock);
3138 	}
3139 #endif
3140 
3141 	if (lock != NULL)
3142 		rw_wunlock(lock);
3143 	PMAP_UNLOCK(pmap);
3144 	return (KERN_SUCCESS);
3145 }
3146 
3147 /*
3148  * Maps a sequence of resident pages belonging to the same object.
3149  * The sequence begins with the given page m_start.  This page is
3150  * mapped at the given virtual address start.  Each subsequent page is
3151  * mapped at a virtual address that is offset from start by the same
3152  * amount as the page is offset from m_start within the object.  The
3153  * last page in the sequence is the page with the largest offset from
3154  * m_start that can be mapped at a virtual address less than the given
3155  * virtual address end.  Not every virtual page between start and end
3156  * is mapped; only those for which a resident page exists with the
3157  * corresponding offset from m_start are mapped.
3158  */
3159 void
3160 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3161     vm_page_t m_start, vm_prot_t prot)
3162 {
3163 	struct rwlock *lock;
3164 	vm_offset_t va;
3165 	vm_page_t m, mpte;
3166 	vm_pindex_t diff, psize;
3167 
3168 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3169 
3170 	psize = atop(end - start);
3171 	mpte = NULL;
3172 	m = m_start;
3173 	lock = NULL;
3174 	PMAP_LOCK(pmap);
3175 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3176 		va = start + ptoa(diff);
3177 		mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
3178 		m = TAILQ_NEXT(m, listq);
3179 	}
3180 	if (lock != NULL)
3181 		rw_wunlock(lock);
3182 	PMAP_UNLOCK(pmap);
3183 }
3184 
3185 /*
3186  * this code makes some *MAJOR* assumptions:
3187  * 1. Current pmap & pmap exists.
3188  * 2. Not wired.
3189  * 3. Read access.
3190  * 4. No page table pages.
3191  * but is *MUCH* faster than pmap_enter...
3192  */
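/*
 * pmap_enter_quick_locked() below only ever installs read-only (and, where
 * appropriate, execute-never) mappings, never sleeps, and simply gives up
 * when a page table page or pv entry cannot be allocated without
 * reclamation.
 */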
3193 
3194 void
3195 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3196 {
3197 	struct rwlock *lock;
3198 
3199 	lock = NULL;
3200 	PMAP_LOCK(pmap);
3201 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3202 	if (lock != NULL)
3203 		rw_wunlock(lock);
3204 	PMAP_UNLOCK(pmap);
3205 }
3206 
3207 static vm_page_t
3208 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3209     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3210 {
3211 	struct spglist free;
3212 	pd_entry_t *pde;
3213 	pt_entry_t *l2, *l3, l3_val;
3214 	vm_paddr_t pa;
3215 	int lvl;
3216 
3217 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3218 	    (m->oflags & VPO_UNMANAGED) != 0,
3219 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3220 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3221 
3222 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3223 	/*
3224 	 * In the case that a page table page is not
3225 	 * resident, we are creating it here.
3226 	 */
3227 	if (va < VM_MAXUSER_ADDRESS) {
3228 		vm_pindex_t l2pindex;
3229 
3230 		/*
3231 		 * Calculate pagetable page index
3232 		 */
3233 		l2pindex = pmap_l2_pindex(va);
3234 		if (mpte && (mpte->pindex == l2pindex)) {
3235 			mpte->wire_count++;
3236 		} else {
3237 			/*
3238 			 * Get the l2 entry
3239 			 */
3240 			pde = pmap_pde(pmap, va, &lvl);
3241 
3242 			/*
3243 			 * If the page table page is mapped, we just increment
3244 			 * the hold count, and activate it.  Otherwise, we
3245 			 * attempt to allocate a page table page.  If this
3246 			 * attempt fails, we don't retry.  Instead, we give up.
3247 			 */
3248 			if (lvl == 1) {
3249 				l2 = pmap_l1_to_l2(pde, va);
3250 				if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
3251 				    L2_BLOCK)
3252 					return (NULL);
3253 			}
3254 			if (lvl == 2 && pmap_load(pde) != 0) {
3255 				mpte =
3256 				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
3257 				mpte->wire_count++;
3258 			} else {
3259 				/*
3260 				 * Pass NULL instead of the PV list lock
3261 				 * pointer, because we don't intend to sleep.
3262 				 */
3263 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3264 				if (mpte == NULL)
3265 					return (mpte);
3266 			}
3267 		}
3268 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3269 		l3 = &l3[pmap_l3_index(va)];
3270 	} else {
3271 		mpte = NULL;
3272 		pde = pmap_pde(kernel_pmap, va, &lvl);
3273 		KASSERT(pde != NULL,
3274 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
3275 		     va));
3276 		KASSERT(lvl == 2,
3277 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
3278 		l3 = pmap_l2_to_l3(pde, va);
3279 	}
3280 
3281 	if (pmap_load(l3) != 0) {
3282 		if (mpte != NULL) {
3283 			mpte->wire_count--;
3284 			mpte = NULL;
3285 		}
3286 		return (mpte);
3287 	}
3288 
3289 	/*
3290 	 * Enter on the PV list if part of our managed memory.
3291 	 */
3292 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3293 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3294 		if (mpte != NULL) {
3295 			SLIST_INIT(&free);
3296 			if (pmap_unwire_l3(pmap, va, mpte, &free)) {
3297 				pmap_invalidate_page(pmap, va);
3298 				vm_page_free_pages_toq(&free, false);
3299 			}
3300 			mpte = NULL;
3301 		}
3302 		return (mpte);
3303 	}
3304 
3305 	/*
3306 	 * Increment counters
3307 	 */
3308 	pmap_resident_count_inc(pmap, 1);
3309 
3310 	pa = VM_PAGE_TO_PHYS(m);
3311 	l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
3312 	    ATTR_AP(ATTR_AP_RO) | L3_PAGE;
3313 	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
3314 		l3_val |= ATTR_XN;
3315 	else if (va < VM_MAXUSER_ADDRESS)
3316 		l3_val |= ATTR_PXN;
3317 
3318 	/*
3319 	 * Now validate mapping with RO protection
3320 	 */
3321 	if ((m->oflags & VPO_UNMANAGED) == 0)
3322 		l3_val |= ATTR_SW_MANAGED;
3323 
3324 	/* Sync icache before the mapping is stored to PTE */
3325 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
3326 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
3327 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
3328 
3329 	pmap_load_store(l3, l3_val);
3330 	pmap_invalidate_page(pmap, va);
3331 	return (mpte);
3332 }
3333 
3334 /*
3335  * This code maps large physical mmap regions into the
3336  * processor address space.  Note that some shortcuts
3337  * are taken, but the code works.
3338  */
3339 void
3340 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3341     vm_pindex_t pindex, vm_size_t size)
3342 {
3343 
3344 	VM_OBJECT_ASSERT_WLOCKED(object);
3345 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3346 	    ("pmap_object_init_pt: non-device object"));
3347 }
3348 
3349 /*
3350  *	Clear the wired attribute from the mappings for the specified range of
3351  *	addresses in the given pmap.  Every valid mapping within that range
3352  *	must have the wired attribute set.  In contrast, invalid mappings
3353  *	cannot have the wired attribute set, so they are ignored.
3354  *
3355  *	The wired attribute of the page table entry is not a hardware feature,
3356  *	so there is no need to invalidate any TLB entries.
3357  */
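/*
 * A 2MB block encountered in the range is first demoted so that the wired
 * attribute can be cleared on the individual 4KB entries; a valid entry
 * without ATTR_SW_WIRED set within the range is a caller error and panics.
 */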
3358 void
3359 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3360 {
3361 	vm_offset_t va_next;
3362 	pd_entry_t *l0, *l1, *l2;
3363 	pt_entry_t *l3;
3364 
3365 	PMAP_LOCK(pmap);
3366 	for (; sva < eva; sva = va_next) {
3367 		l0 = pmap_l0(pmap, sva);
3368 		if (pmap_load(l0) == 0) {
3369 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3370 			if (va_next < sva)
3371 				va_next = eva;
3372 			continue;
3373 		}
3374 
3375 		l1 = pmap_l0_to_l1(l0, sva);
3376 		if (pmap_load(l1) == 0) {
3377 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3378 			if (va_next < sva)
3379 				va_next = eva;
3380 			continue;
3381 		}
3382 
3383 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3384 		if (va_next < sva)
3385 			va_next = eva;
3386 
3387 		l2 = pmap_l1_to_l2(l1, sva);
3388 		if (pmap_load(l2) == 0)
3389 			continue;
3390 
3391 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
3392 			l3 = pmap_demote_l2(pmap, l2, sva);
3393 			if (l3 == NULL)
3394 				continue;
3395 		}
3396 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
3397 		    ("pmap_unwire: Invalid l2 entry after demotion"));
3398 
3399 		if (va_next > eva)
3400 			va_next = eva;
3401 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3402 		    sva += L3_SIZE) {
3403 			if (pmap_load(l3) == 0)
3404 				continue;
3405 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
3406 				panic("pmap_unwire: l3 %#jx is missing "
3407 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
3408 
3409 			/*
3410 			 * The wired attribute must be cleared atomically.  Although
3411 			 * the pmap lock synchronizes access to it, another processor
3412 			 * could be updating the access or dirty state concurrently.
3413 			 */
3414 			atomic_clear_long(l3, ATTR_SW_WIRED);
3415 			pmap->pm_stats.wired_count--;
3416 		}
3417 	}
3418 	PMAP_UNLOCK(pmap);
3419 }
3420 
3421 /*
3422  *	Copy the range specified by src_addr/len
3423  *	from the source map to the range dst_addr/len
3424  *	in the destination map.
3425  *
3426  *	This routine is only advisory and need not do anything.
3427  */
3428 
3429 void
3430 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3431     vm_offset_t src_addr)
3432 {
3433 }
3434 
3435 /*
3436  *	pmap_zero_page zeros the specified hardware page by mapping
3437  *	the page into KVM and using bzero to clear its contents.
3438  */
3439 void
3440 pmap_zero_page(vm_page_t m)
3441 {
3442 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3443 
3444 	pagezero((void *)va);
3445 }
3446 
3447 /*
3448  *	pmap_zero_page_area zeros the specified hardware page by mapping
3449  *	the page into KVM and using bzero to clear its contents.
3450  *
3451  *	off and size may not cover an area beyond a single hardware page.
3452  */
3453 void
3454 pmap_zero_page_area(vm_page_t m, int off, int size)
3455 {
3456 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3457 
3458 	if (off == 0 && size == PAGE_SIZE)
3459 		pagezero((void *)va);
3460 	else
3461 		bzero((char *)va + off, size);
3462 }
3463 
3464 /*
3465  *	pmap_copy_page copies the specified (machine independent)
3466  *	page by mapping the page into virtual memory and using
3467  *	bcopy to copy the page, one machine dependent page at a
3468  *	time.
3469  */
3470 void
3471 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3472 {
3473 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3474 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3475 
3476 	pagecopy((void *)src, (void *)dst);
3477 }
3478 
3479 int unmapped_buf_allowed = 1;
3480 
3481 void
3482 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
3483     vm_offset_t b_offset, int xfersize)
3484 {
3485 	void *a_cp, *b_cp;
3486 	vm_page_t m_a, m_b;
3487 	vm_paddr_t p_a, p_b;
3488 	vm_offset_t a_pg_offset, b_pg_offset;
3489 	int cnt;
3490 
3491 	while (xfersize > 0) {
3492 		a_pg_offset = a_offset & PAGE_MASK;
3493 		m_a = ma[a_offset >> PAGE_SHIFT];
3494 		p_a = m_a->phys_addr;
3495 		b_pg_offset = b_offset & PAGE_MASK;
3496 		m_b = mb[b_offset >> PAGE_SHIFT];
3497 		p_b = m_b->phys_addr;
3498 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
3499 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
3500 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
3501 			panic("!DMAP a %lx", p_a);
3502 		} else {
3503 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
3504 		}
3505 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
3506 			panic("!DMAP b %lx", p_b);
3507 		} else {
3508 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
3509 		}
3510 		bcopy(a_cp, b_cp, cnt);
3511 		a_offset += cnt;
3512 		b_offset += cnt;
3513 		xfersize -= cnt;
3514 	}
3515 }
3516 
3517 vm_offset_t
3518 pmap_quick_enter_page(vm_page_t m)
3519 {
3520 
3521 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
3522 }
3523 
3524 void
3525 pmap_quick_remove_page(vm_offset_t addr)
3526 {
3527 }
3528 
3529 /*
3530  * Returns true if the pmap's pv is one of the first
3531  * 16 pvs linked to from this page.  This count may
3532  * be changed upwards or downwards in the future; it
3533  * is only necessary that true be returned for a small
3534  * subset of pmaps for proper page aging.
3535  */
3536 boolean_t
3537 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3538 {
3539 	struct md_page *pvh;
3540 	struct rwlock *lock;
3541 	pv_entry_t pv;
3542 	int loops = 0;
3543 	boolean_t rv;
3544 
3545 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3546 	    ("pmap_page_exists_quick: page %p is not managed", m));
3547 	rv = FALSE;
3548 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3549 	rw_rlock(lock);
3550 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3551 		if (PV_PMAP(pv) == pmap) {
3552 			rv = TRUE;
3553 			break;
3554 		}
3555 		loops++;
3556 		if (loops >= 16)
3557 			break;
3558 	}
3559 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
3560 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3561 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3562 			if (PV_PMAP(pv) == pmap) {
3563 				rv = TRUE;
3564 				break;
3565 			}
3566 			loops++;
3567 			if (loops >= 16)
3568 				break;
3569 		}
3570 	}
3571 	rw_runlock(lock);
3572 	return (rv);
3573 }
3574 
3575 /*
3576  *	pmap_page_wired_mappings:
3577  *
3578  *	Return the number of managed mappings to the given physical page
3579  *	that are wired.
3580  */
3581 int
3582 pmap_page_wired_mappings(vm_page_t m)
3583 {
3584 	struct rwlock *lock;
3585 	struct md_page *pvh;
3586 	pmap_t pmap;
3587 	pt_entry_t *pte;
3588 	pv_entry_t pv;
3589 	int count, lvl, md_gen, pvh_gen;
3590 
3591 	if ((m->oflags & VPO_UNMANAGED) != 0)
3592 		return (0);
3593 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3594 	rw_rlock(lock);
3595 restart:
3596 	count = 0;
3597 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3598 		pmap = PV_PMAP(pv);
3599 		if (!PMAP_TRYLOCK(pmap)) {
3600 			md_gen = m->md.pv_gen;
3601 			rw_runlock(lock);
3602 			PMAP_LOCK(pmap);
3603 			rw_rlock(lock);
3604 			if (md_gen != m->md.pv_gen) {
3605 				PMAP_UNLOCK(pmap);
3606 				goto restart;
3607 			}
3608 		}
3609 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
3610 		if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
3611 			count++;
3612 		PMAP_UNLOCK(pmap);
3613 	}
3614 	if ((m->flags & PG_FICTITIOUS) == 0) {
3615 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3616 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3617 			pmap = PV_PMAP(pv);
3618 			if (!PMAP_TRYLOCK(pmap)) {
3619 				md_gen = m->md.pv_gen;
3620 				pvh_gen = pvh->pv_gen;
3621 				rw_runlock(lock);
3622 				PMAP_LOCK(pmap);
3623 				rw_rlock(lock);
3624 				if (md_gen != m->md.pv_gen ||
3625 				    pvh_gen != pvh->pv_gen) {
3626 					PMAP_UNLOCK(pmap);
3627 					goto restart;
3628 				}
3629 			}
3630 			pte = pmap_pte(pmap, pv->pv_va, &lvl);
3631 			if (pte != NULL &&
3632 			    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
3633 				count++;
3634 			PMAP_UNLOCK(pmap);
3635 		}
3636 	}
3637 	rw_runlock(lock);
3638 	return (count);
3639 }
3640 
3641 /*
3642  * Destroy all managed, non-wired mappings in the given user-space
3643  * pmap.  This pmap cannot be active on any processor besides the
3644  * caller.
3645  *
3646  * This function cannot be applied to the kernel pmap.  Moreover, it
3647  * is not intended for general use.  It is only to be used during
3648  * process termination.  Consequently, it can be implemented in ways
3649  * that make it faster than pmap_remove().  First, it can more quickly
3650  * destroy mappings by iterating over the pmap's collection of PV
3651  * entries, rather than searching the page table.  Second, it doesn't
3652  * have to test and clear the page table entries atomically, because
3653  * no processor is currently accessing the user address space.  In
3654  * particular, a page table entry's dirty bit won't change state once
3655  * this function starts.
3656  */
3657 void
3658 pmap_remove_pages(pmap_t pmap)
3659 {
3660 	pd_entry_t *pde;
3661 	pt_entry_t *pte, tpte;
3662 	struct spglist free;
3663 	vm_page_t m, ml3, mt;
3664 	pv_entry_t pv;
3665 	struct md_page *pvh;
3666 	struct pv_chunk *pc, *npc;
3667 	struct rwlock *lock;
3668 	int64_t bit;
3669 	uint64_t inuse, bitmask;
3670 	int allfree, field, freed, idx, lvl;
3671 	vm_paddr_t pa;
3672 
3673 	lock = NULL;
3674 
3675 	SLIST_INIT(&free);
3676 	PMAP_LOCK(pmap);
3677 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3678 		allfree = 1;
3679 		freed = 0;
3680 		for (field = 0; field < _NPCM; field++) {
3681 			inuse = ~pc->pc_map[field] & pc_freemask[field];
3682 			while (inuse != 0) {
3683 				bit = ffsl(inuse) - 1;
3684 				bitmask = 1UL << bit;
3685 				idx = field * 64 + bit;
3686 				pv = &pc->pc_pventry[idx];
3687 				inuse &= ~bitmask;
3688 
3689 				pde = pmap_pde(pmap, pv->pv_va, &lvl);
3690 				KASSERT(pde != NULL,
3691 				    ("Attempting to remove an unmapped page"));
3692 
				switch (lvl) {
3694 				case 1:
3695 					pte = pmap_l1_to_l2(pde, pv->pv_va);
3696 					tpte = pmap_load(pte);
3697 					KASSERT((tpte & ATTR_DESCR_MASK) ==
3698 					    L2_BLOCK,
3699 					    ("Attempting to remove an invalid "
3700 					    "block: %lx", tpte));
3702 					break;
3703 				case 2:
3704 					pte = pmap_l2_to_l3(pde, pv->pv_va);
3705 					tpte = pmap_load(pte);
3706 					KASSERT((tpte & ATTR_DESCR_MASK) ==
3707 					    L3_PAGE,
3708 					    ("Attempting to remove an invalid "
3709 					     "page: %lx", tpte));
3710 					break;
3711 				default:
3712 					panic(
3713 					    "Invalid page directory level: %d",
3714 					    lvl);
3715 				}
3716 
3717 /*
3718  * We cannot remove wired pages from a process' mapping at this time
3719  */
3720 				if (tpte & ATTR_SW_WIRED) {
3721 					allfree = 0;
3722 					continue;
3723 				}
3724 
3725 				pa = tpte & ~ATTR_MASK;
3726 
3727 				m = PHYS_TO_VM_PAGE(pa);
3728 				KASSERT(m->phys_addr == pa,
3729 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3730 				    m, (uintmax_t)m->phys_addr,
3731 				    (uintmax_t)tpte));
3732 
3733 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
3734 				    m < &vm_page_array[vm_page_array_size],
3735 				    ("pmap_remove_pages: bad pte %#jx",
3736 				    (uintmax_t)tpte));
3737 
3738 				pmap_load_clear(pte);
3739 
3740 				/*
3741 				 * Update the vm_page_t clean/reference bits.
3742 				 */
3743 				if ((tpte & ATTR_AP_RW_BIT) ==
3744 				    ATTR_AP(ATTR_AP_RW)) {
3745 					switch (lvl) {
3746 					case 1:
3747 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
3749 						break;
3750 					case 2:
3751 						vm_page_dirty(m);
3752 						break;
3753 					}
3754 				}
3755 
3756 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
3757 
3758 				/* Mark free */
3759 				pc->pc_map[field] |= bitmask;
3760 				switch (lvl) {
3761 				case 1:
3762 					pmap_resident_count_dec(pmap,
3763 					    L2_SIZE / PAGE_SIZE);
3764 					pvh = pa_to_pvh(tpte & ~ATTR_MASK);
3765 					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
3766 					pvh->pv_gen++;
3767 					if (TAILQ_EMPTY(&pvh->pv_list)) {
3768 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3769 							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
3770 							    TAILQ_EMPTY(&mt->md.pv_list))
3771 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
3772 					}
3773 					ml3 = pmap_remove_pt_page(pmap,
3774 					    pv->pv_va);
3775 					if (ml3 != NULL) {
3776 						pmap_resident_count_dec(pmap,1);
3777 						KASSERT(ml3->wire_count == NL3PG,
3778 						    ("pmap_remove_pages: l3 page wire count error"));
3779 						ml3->wire_count = 1;
3780 						vm_page_unwire_noq(ml3);
3781 						pmap_add_delayed_free_list(ml3,
3782 						    &free, FALSE);
3783 					}
3784 					break;
3785 				case 2:
3786 					pmap_resident_count_dec(pmap, 1);
3787 					TAILQ_REMOVE(&m->md.pv_list, pv,
3788 					    pv_next);
3789 					m->md.pv_gen++;
3790 					if ((m->aflags & PGA_WRITEABLE) != 0 &&
3791 					    TAILQ_EMPTY(&m->md.pv_list) &&
3792 					    (m->flags & PG_FICTITIOUS) == 0) {
3793 						pvh = pa_to_pvh(
3794 						    VM_PAGE_TO_PHYS(m));
3795 						if (TAILQ_EMPTY(&pvh->pv_list))
3796 							vm_page_aflag_clear(m,
3797 							    PGA_WRITEABLE);
3798 					}
3799 					break;
3800 				}
3801 				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
3802 				    &free);
3803 				freed++;
3804 			}
3805 		}
3806 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3807 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3808 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3809 		if (allfree) {
3810 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3811 			free_pv_chunk(pc);
3812 		}
3813 	}
3814 	pmap_invalidate_all(pmap);
3815 	if (lock != NULL)
3816 		rw_wunlock(lock);
3817 	PMAP_UNLOCK(pmap);
3818 	vm_page_free_pages_toq(&free, false);
3819 }
3820 
3821 /*
 * This is used to check if a page has been accessed or modified.  As we
 * don't have a hardware bit that tracks modification, we have to assume a
 * page has been modified if it is mapped read/write.
3825  */
3826 static boolean_t
3827 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3828 {
3829 	struct rwlock *lock;
3830 	pv_entry_t pv;
3831 	struct md_page *pvh;
3832 	pt_entry_t *pte, mask, value;
3833 	pmap_t pmap;
3834 	int lvl, md_gen, pvh_gen;
3835 	boolean_t rv;
3836 
3837 	rv = FALSE;
3838 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3839 	rw_rlock(lock);
3840 restart:
3841 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3842 		pmap = PV_PMAP(pv);
3843 		if (!PMAP_TRYLOCK(pmap)) {
3844 			md_gen = m->md.pv_gen;
3845 			rw_runlock(lock);
3846 			PMAP_LOCK(pmap);
3847 			rw_rlock(lock);
3848 			if (md_gen != m->md.pv_gen) {
3849 				PMAP_UNLOCK(pmap);
3850 				goto restart;
3851 			}
3852 		}
3853 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
3854 		KASSERT(lvl == 3,
3855 		    ("pmap_page_test_mappings: Invalid level %d", lvl));
3856 		mask = 0;
3857 		value = 0;
3858 		if (modified) {
3859 			mask |= ATTR_AP_RW_BIT;
3860 			value |= ATTR_AP(ATTR_AP_RW);
3861 		}
3862 		if (accessed) {
3863 			mask |= ATTR_AF | ATTR_DESCR_MASK;
3864 			value |= ATTR_AF | L3_PAGE;
3865 		}
3866 		rv = (pmap_load(pte) & mask) == value;
3867 		PMAP_UNLOCK(pmap);
3868 		if (rv)
3869 			goto out;
3870 	}
3871 	if ((m->flags & PG_FICTITIOUS) == 0) {
3872 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3873 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3874 			pmap = PV_PMAP(pv);
3875 			if (!PMAP_TRYLOCK(pmap)) {
3876 				md_gen = m->md.pv_gen;
3877 				pvh_gen = pvh->pv_gen;
3878 				rw_runlock(lock);
3879 				PMAP_LOCK(pmap);
3880 				rw_rlock(lock);
3881 				if (md_gen != m->md.pv_gen ||
3882 				    pvh_gen != pvh->pv_gen) {
3883 					PMAP_UNLOCK(pmap);
3884 					goto restart;
3885 				}
3886 			}
3887 			pte = pmap_pte(pmap, pv->pv_va, &lvl);
3888 			KASSERT(lvl == 2,
3889 			    ("pmap_page_test_mappings: Invalid level %d", lvl));
3890 			mask = 0;
3891 			value = 0;
3892 			if (modified) {
3893 				mask |= ATTR_AP_RW_BIT;
3894 				value |= ATTR_AP(ATTR_AP_RW);
3895 			}
3896 			if (accessed) {
3897 				mask |= ATTR_AF | ATTR_DESCR_MASK;
3898 				value |= ATTR_AF | L2_BLOCK;
3899 			}
3900 			rv = (pmap_load(pte) & mask) == value;
3901 			PMAP_UNLOCK(pmap);
3902 			if (rv)
3903 				goto out;
3904 		}
3905 	}
3906 out:
3907 	rw_runlock(lock);
3908 	return (rv);
3909 }
3910 
3911 /*
3912  *	pmap_is_modified:
3913  *
3914  *	Return whether or not the specified physical page was modified
3915  *	in any physical maps.
3916  */
3917 boolean_t
3918 pmap_is_modified(vm_page_t m)
3919 {
3920 
3921 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3922 	    ("pmap_is_modified: page %p is not managed", m));
3923 
3924 	/*
3925 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
3926 	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
	 * is clear, no PTEs can be dirty.
3928 	 */
3929 	VM_OBJECT_ASSERT_WLOCKED(m->object);
3930 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
3931 		return (FALSE);
3932 	return (pmap_page_test_mappings(m, FALSE, TRUE));
3933 }
3934 
3935 /*
3936  *	pmap_is_prefaultable:
3937  *
3938  *	Return whether or not the specified virtual address is eligible
3939  *	for prefault.
3940  */
3941 boolean_t
3942 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3943 {
3944 	pt_entry_t *pte;
3945 	boolean_t rv;
3946 	int lvl;
3947 
3948 	rv = FALSE;
3949 	PMAP_LOCK(pmap);
3950 	pte = pmap_pte(pmap, addr, &lvl);
3951 	if (pte != NULL && pmap_load(pte) != 0) {
3952 		rv = TRUE;
3953 	}
3954 	PMAP_UNLOCK(pmap);
3955 	return (rv);
3956 }
3957 
3958 /*
3959  *	pmap_is_referenced:
3960  *
3961  *	Return whether or not the specified physical page was referenced
3962  *	in any physical maps.
3963  */
3964 boolean_t
3965 pmap_is_referenced(vm_page_t m)
3966 {
3967 
3968 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3969 	    ("pmap_is_referenced: page %p is not managed", m));
3970 	return (pmap_page_test_mappings(m, TRUE, FALSE));
3971 }
3972 
3973 /*
3974  * Clear the write and modified bits in each of the given page's mappings.
3975  */
3976 void
3977 pmap_remove_write(vm_page_t m)
3978 {
3979 	struct md_page *pvh;
3980 	pmap_t pmap;
3981 	struct rwlock *lock;
3982 	pv_entry_t next_pv, pv;
3983 	pt_entry_t oldpte, *pte;
3984 	vm_offset_t va;
3985 	int lvl, md_gen, pvh_gen;
3986 
3987 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3988 	    ("pmap_remove_write: page %p is not managed", m));
3989 
3990 	/*
3991 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
3992 	 * set by another thread while the object is locked.  Thus,
3993 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
3994 	 */
3995 	VM_OBJECT_ASSERT_WLOCKED(m->object);
3996 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
3997 		return;
3998 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3999 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4000 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4001 retry_pv_loop:
4002 	rw_wlock(lock);
4003 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4004 		pmap = PV_PMAP(pv);
4005 		if (!PMAP_TRYLOCK(pmap)) {
4006 			pvh_gen = pvh->pv_gen;
4007 			rw_wunlock(lock);
4008 			PMAP_LOCK(pmap);
4009 			rw_wlock(lock);
4010 			if (pvh_gen != pvh->pv_gen) {
4011 				PMAP_UNLOCK(pmap);
4012 				rw_wunlock(lock);
4013 				goto retry_pv_loop;
4014 			}
4015 		}
4016 		va = pv->pv_va;
4017 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
4018 		if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
4019 			pmap_demote_l2_locked(pmap, pte, va & ~L2_OFFSET,
4020 			    &lock);
4021 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4022 		    ("inconsistent pv lock %p %p for page %p",
4023 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4024 		PMAP_UNLOCK(pmap);
4025 	}
4026 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4027 		pmap = PV_PMAP(pv);
4028 		if (!PMAP_TRYLOCK(pmap)) {
4029 			pvh_gen = pvh->pv_gen;
4030 			md_gen = m->md.pv_gen;
4031 			rw_wunlock(lock);
4032 			PMAP_LOCK(pmap);
4033 			rw_wlock(lock);
4034 			if (pvh_gen != pvh->pv_gen ||
4035 			    md_gen != m->md.pv_gen) {
4036 				PMAP_UNLOCK(pmap);
4037 				rw_wunlock(lock);
4038 				goto retry_pv_loop;
4039 			}
4040 		}
4041 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
4042 retry:
4043 		oldpte = pmap_load(pte);
4044 		if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) {
4045 			if (!atomic_cmpset_long(pte, oldpte,
4046 			    oldpte | ATTR_AP(ATTR_AP_RO)))
4047 				goto retry;
4048 			if ((oldpte & ATTR_AF) != 0)
4049 				vm_page_dirty(m);
4050 			pmap_invalidate_page(pmap, pv->pv_va);
4051 		}
4052 		PMAP_UNLOCK(pmap);
4053 	}
4054 	rw_wunlock(lock);
4055 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4056 }
4057 
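/*
 * The access flag is not yet maintained from the exception handler (see the
 * ARM64TODO panics in pmap_ts_referenced()), so it is never safe to clear a
 * mapping's reference bit here.
 */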
4058 static __inline boolean_t
4059 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
4060 {
4061 
4062 	return (FALSE);
4063 }
4064 
4065 /*
4066  *	pmap_ts_referenced:
4067  *
4068  *	Return a count of reference bits for a page, clearing those bits.
4069  *	It is not necessary for every reference bit to be cleared, but it
4070  *	is necessary that 0 only be returned when there are truly no
4071  *	reference bits set.
4072  *
4073  *	As an optimization, update the page's dirty field if a modified bit is
4074  *	found while counting reference bits.  This opportunistic update can be
4075  *	performed at low cost and can eliminate the need for some future calls
4076  *	to pmap_is_modified().  However, since this function stops after
4077  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4078  *	dirty pages.  Those dirty pages will only be detected by a future call
4079  *	to pmap_is_modified().
4080  */
4081 int
4082 pmap_ts_referenced(vm_page_t m)
4083 {
4084 	struct md_page *pvh;
4085 	pv_entry_t pv, pvf;
4086 	pmap_t pmap;
4087 	struct rwlock *lock;
4088 	pd_entry_t *pde, tpde;
4089 	pt_entry_t *pte, tpte;
4090 	pt_entry_t *l3;
4091 	vm_offset_t va;
4092 	vm_paddr_t pa;
4093 	int cleared, md_gen, not_cleared, lvl, pvh_gen;
4094 	struct spglist free;
4095 	bool demoted;
4096 
4097 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4098 	    ("pmap_ts_referenced: page %p is not managed", m));
4099 	SLIST_INIT(&free);
4100 	cleared = 0;
4101 	pa = VM_PAGE_TO_PHYS(m);
4102 	lock = PHYS_TO_PV_LIST_LOCK(pa);
4103 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4104 	rw_wlock(lock);
4105 retry:
4106 	not_cleared = 0;
4107 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4108 		goto small_mappings;
4109 	pv = pvf;
4110 	do {
4111 		if (pvf == NULL)
4112 			pvf = pv;
4113 		pmap = PV_PMAP(pv);
4114 		if (!PMAP_TRYLOCK(pmap)) {
4115 			pvh_gen = pvh->pv_gen;
4116 			rw_wunlock(lock);
4117 			PMAP_LOCK(pmap);
4118 			rw_wlock(lock);
4119 			if (pvh_gen != pvh->pv_gen) {
4120 				PMAP_UNLOCK(pmap);
4121 				goto retry;
4122 			}
4123 		}
4124 		va = pv->pv_va;
4125 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
4126 		KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
4127 		KASSERT(lvl == 1,
4128 		    ("pmap_ts_referenced: invalid pde level %d", lvl));
4129 		tpde = pmap_load(pde);
4130 		KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
4131 		    ("pmap_ts_referenced: found an invalid l1 table"));
4132 		pte = pmap_l1_to_l2(pde, pv->pv_va);
4133 		tpte = pmap_load(pte);
4134 		if (pmap_page_dirty(tpte)) {
4135 			/*
4136 			 * Although "tpte" is mapping a 2MB page, because
4137 			 * this function is called at a 4KB page granularity,
4138 			 * we only update the 4KB page under test.
4139 			 */
4140 			vm_page_dirty(m);
4141 		}
4142 		if ((tpte & ATTR_AF) != 0) {
4143 			/*
4144 			 * Since this reference bit is shared by 512 4KB
4145 			 * pages, it should not be cleared every time it is
4146 			 * tested.  Apply a simple "hash" function on the
4147 			 * physical page number, the virtual superpage number,
4148 			 * and the pmap address to select one 4KB page out of
4149 			 * the 512 on which testing the reference bit will
4150 			 * result in clearing that reference bit.  This
4151 			 * function is designed to avoid the selection of the
4152 			 * same 4KB page for every 2MB page mapping.
4153 			 *
4154 			 * On demotion, a mapping that hasn't been referenced
4155 			 * is simply destroyed.  To avoid the possibility of a
4156 			 * subsequent page fault on a demoted wired mapping,
4157 			 * always leave its reference bit set.  Moreover,
4158 			 * since the superpage is wired, the current state of
4159 			 * its reference bit won't affect page replacement.
4160 			 */
4161 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4162 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4163 			    (tpte & ATTR_SW_WIRED) == 0) {
4164 				if (safe_to_clear_referenced(pmap, tpte)) {
4165 					/*
4166 					 * TODO: We don't handle the access
4167 					 * flag at all. We need to be able
					 * to set it in the exception handler.
4169 					 */
4170 					panic("ARM64TODO: "
4171 					    "safe_to_clear_referenced\n");
4172 				} else if (pmap_demote_l2_locked(pmap, pte,
4173 				    pv->pv_va, &lock) != NULL) {
4174 					demoted = true;
4175 					va += VM_PAGE_TO_PHYS(m) -
4176 					    (tpte & ~ATTR_MASK);
4177 					l3 = pmap_l2_to_l3(pte, va);
4178 					pmap_remove_l3(pmap, l3, va,
4179 					    pmap_load(pte), NULL, &lock);
4180 				} else
4181 					demoted = true;
4182 
4183 				if (demoted) {
4184 					/*
4185 					 * The superpage mapping was removed
4186 					 * entirely and therefore 'pv' is no
4187 					 * longer valid.
4188 					 */
4189 					if (pvf == pv)
4190 						pvf = NULL;
4191 					pv = NULL;
4192 				}
4193 				cleared++;
4194 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4195 				    ("inconsistent pv lock %p %p for page %p",
4196 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4197 			} else
4198 				not_cleared++;
4199 		}
4200 		PMAP_UNLOCK(pmap);
4201 		/* Rotate the PV list if it has more than one entry. */
4202 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4203 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4204 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4205 			pvh->pv_gen++;
4206 		}
4207 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4208 			goto out;
4209 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4210 small_mappings:
4211 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4212 		goto out;
4213 	pv = pvf;
4214 	do {
4215 		if (pvf == NULL)
4216 			pvf = pv;
4217 		pmap = PV_PMAP(pv);
4218 		if (!PMAP_TRYLOCK(pmap)) {
4219 			pvh_gen = pvh->pv_gen;
4220 			md_gen = m->md.pv_gen;
4221 			rw_wunlock(lock);
4222 			PMAP_LOCK(pmap);
4223 			rw_wlock(lock);
4224 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4225 				PMAP_UNLOCK(pmap);
4226 				goto retry;
4227 			}
4228 		}
4229 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
4230 		KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
4231 		KASSERT(lvl == 2,
4232 		    ("pmap_ts_referenced: invalid pde level %d", lvl));
4233 		tpde = pmap_load(pde);
4234 		KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
4235 		    ("pmap_ts_referenced: found an invalid l2 table"));
4236 		pte = pmap_l2_to_l3(pde, pv->pv_va);
4237 		tpte = pmap_load(pte);
4238 		if (pmap_page_dirty(tpte))
4239 			vm_page_dirty(m);
4240 		if ((tpte & ATTR_AF) != 0) {
4241 			if (safe_to_clear_referenced(pmap, tpte)) {
4242 				/*
4243 				 * TODO: We don't handle the access flag
4244 				 * at all. We need to be able to set it in
4245 				 * the exception handler.
4246 				 */
4247 				panic("ARM64TODO: safe_to_clear_referenced\n");
4248 			} else if ((tpte & ATTR_SW_WIRED) == 0) {
4249 				/*
4250 				 * Wired pages cannot be paged out so
4251 				 * doing accessed bit emulation for
4252 				 * them is wasted effort. We do the
4253 				 * hard work for unwired pages only.
4254 				 */
4255 				pmap_remove_l3(pmap, pte, pv->pv_va, tpde,
4256 				    &free, &lock);
4257 				pmap_invalidate_page(pmap, pv->pv_va);
4258 				cleared++;
4259 				if (pvf == pv)
4260 					pvf = NULL;
4261 				pv = NULL;
4262 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4263 				    ("inconsistent pv lock %p %p for page %p",
4264 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4265 			} else
4266 				not_cleared++;
4267 		}
4268 		PMAP_UNLOCK(pmap);
4269 		/* Rotate the PV list if it has more than one entry. */
4270 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4271 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4272 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4273 			m->md.pv_gen++;
4274 		}
4275 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4276 	    not_cleared < PMAP_TS_REFERENCED_MAX);
4277 out:
4278 	rw_wunlock(lock);
4279 	vm_page_free_pages_toq(&free, false);
4280 	return (cleared + not_cleared);
4281 }
4282 
4283 /*
4284  *	Apply the given advice to the specified range of addresses within the
4285  *	given pmap.  Depending on the advice, clear the referenced and/or
4286  *	modified flags in each mapping and set the mapped page's dirty field.
4287  */
4288 void
4289 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4290 {
4291 }
4292 
4293 /*
4294  *	Clear the modify bits on the specified physical page.
4295  */
4296 void
4297 pmap_clear_modify(vm_page_t m)
4298 {
4299 
4300 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4301 	    ("pmap_clear_modify: page %p is not managed", m));
4302 	VM_OBJECT_ASSERT_WLOCKED(m->object);
4303 	KASSERT(!vm_page_xbusied(m),
4304 	    ("pmap_clear_modify: page %p is exclusive busied", m));
4305 
4306 	/*
	 * If the page is not PGA_WRITEABLE, then no PTEs can be dirty.
4308 	 * If the object containing the page is locked and the page is not
4309 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
4310 	 */
4311 	if ((m->aflags & PGA_WRITEABLE) == 0)
4312 		return;
4313 
4314 	/* ARM64TODO: We lack support for tracking if a page is modified */
4315 }
4316 
4317 void *
4318 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4319 {
4320 	struct pmap_preinit_mapping *ppim;
4321 	vm_offset_t va, offset;
4322 	pd_entry_t *pde;
4323 	pt_entry_t *l2;
4324 	int i, lvl, l2_blocks, free_l2_count, start_idx;
4325 
4326 	if (!vm_initialized) {
4327 		/*
4328 		 * No L3 ptables so map entire L2 blocks where start VA is:
4329 		 * 	preinit_map_va + start_idx * L2_SIZE
4330 		 * There may be duplicate mappings (multiple VA -> same PA) but
4331 		 * ARM64 dcache is always PIPT so that's acceptable.
4332 		 */
		if (size == 0)
			return (NULL);

		/*
		 * Calculate how many full L2 blocks are needed for the
		 * mapping.
		 */
		l2_blocks = (roundup2(pa + size, L2_SIZE) -
		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
4338 
4339 		offset = pa & L2_OFFSET;
4340 
4341 		if (preinit_map_va == 0)
4342 			return (NULL);
4343 
4344 		/* Map 2MiB L2 blocks from reserved VA space */
4345 
4346 		free_l2_count = 0;
4347 		start_idx = -1;
4348 		/* Find enough free contiguous VA space */
4349 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
4350 			ppim = pmap_preinit_mapping + i;
4351 			if (free_l2_count > 0 && ppim->pa != 0) {
4352 				/* Not enough space here */
4353 				free_l2_count = 0;
4354 				start_idx = -1;
4355 				continue;
4356 			}
4357 
4358 			if (ppim->pa == 0) {
4359 				/* Free L2 block */
4360 				if (start_idx == -1)
4361 					start_idx = i;
4362 				free_l2_count++;
4363 				if (free_l2_count == l2_blocks)
4364 					break;
4365 			}
4366 		}
4367 		if (free_l2_count != l2_blocks)
4368 			panic("%s: too many preinit mappings", __func__);
4369 
4370 		va = preinit_map_va + (start_idx * L2_SIZE);
4371 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
4372 			/* Mark entries as allocated */
4373 			ppim = pmap_preinit_mapping + i;
4374 			ppim->pa = pa;
4375 			ppim->va = va + offset;
4376 			ppim->size = size;
4377 		}
4378 
4379 		/* Map L2 blocks */
4380 		pa = rounddown2(pa, L2_SIZE);
4381 		for (i = 0; i < l2_blocks; i++) {
4382 			pde = pmap_pde(kernel_pmap, va, &lvl);
4383 			KASSERT(pde != NULL,
4384 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx", va));
4385 			KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl));
4386 
4387 			/* Insert L2_BLOCK */
4388 			l2 = pmap_l1_to_l2(pde, va);
4389 			pmap_load_store(l2,
4390 			    pa | ATTR_DEFAULT | ATTR_XN |
4391 			    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
4392 			pmap_invalidate_range(kernel_pmap, va, va + L2_SIZE);
4393 
4394 			va += L2_SIZE;
4395 			pa += L2_SIZE;
4396 		}
4397 
4398 		va = preinit_map_va + (start_idx * L2_SIZE);
4399 
4400 	} else {
4401 		/* kva_alloc may be used to map the pages */
4402 		offset = pa & PAGE_MASK;
4403 		size = round_page(offset + size);
4404 
4405 		va = kva_alloc(size);
4406 		if (va == 0)
4407 			panic("%s: Couldn't allocate KVA", __func__);
4408 
4409 		pde = pmap_pde(kernel_pmap, va, &lvl);
4410 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
4411 
4412 		/* L3 table is linked */
4413 		va = trunc_page(va);
4414 		pa = trunc_page(pa);
4415 		pmap_kenter(va, size, pa, CACHED_MEMORY);
4416 	}
4417 
4418 	return ((void *)(va + offset));
4419 }
4420 
4421 void
4422 pmap_unmapbios(vm_offset_t va, vm_size_t size)
4423 {
4424 	struct pmap_preinit_mapping *ppim;
4425 	vm_offset_t offset, tmpsize, va_trunc;
4426 	pd_entry_t *pde;
4427 	pt_entry_t *l2;
4428 	int i, lvl, l2_blocks, block;
4429 
	l2_blocks = (roundup2(va + size, L2_SIZE) -
	    rounddown2(va, L2_SIZE)) >> L2_SHIFT;
4431 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
4432 
4433 	/* Remove preinit mapping */
4434 	block = 0;
4435 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
4436 		ppim = pmap_preinit_mapping + i;
4437 		if (ppim->va == va) {
4438 			KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch"));
4439 			ppim->va = 0;
4440 			ppim->pa = 0;
4441 			ppim->size = 0;
4442 			offset = block * L2_SIZE;
4443 			va_trunc = rounddown2(va, L2_SIZE) + offset;
4444 
4445 			/* Remove L2_BLOCK */
4446 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
4447 			KASSERT(pde != NULL,
4448 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc));
4449 			l2 = pmap_l1_to_l2(pde, va_trunc);
4450 			pmap_load_clear(l2);
4451 			pmap_invalidate_range(kernel_pmap, va_trunc, va_trunc + L2_SIZE);
4452 
4453 			if (block == (l2_blocks - 1))
4454 				return;
4455 			block++;
4456 		}
4457 	}
4458 
4459 	/* Unmap the pages reserved with kva_alloc. */
4460 	if (vm_initialized) {
4461 		offset = va & PAGE_MASK;
4462 		size = round_page(offset + size);
4463 		va = trunc_page(va);
4464 
4465 		pde = pmap_pde(kernel_pmap, va, &lvl);
4466 		KASSERT(pde != NULL,
4467 		    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
4468 		KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));
4469 
4470 		/* Unmap and invalidate the pages */
		for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4472 			pmap_kremove(va + tmpsize);
4473 
4474 		kva_free(va, size);
4475 	}
4476 }
4477 
4478 /*
4479  * Sets the memory attribute for the specified page.
4480  */
4481 void
4482 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4483 {
4484 
4485 	m->md.pv_memattr = ma;
4486 
4487 	/*
4488 	 * If "m" is a normal page, update its direct mapping.  This update
4489 	 * can be relied upon to perform any cache operations that are
4490 	 * required for data coherence.
4491 	 */
4492 	if ((m->flags & PG_FICTITIOUS) == 0 &&
4493 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4494 	    m->md.pv_memattr) != 0)
4495 		panic("memory attribute change on the direct map failed");
4496 }
4497 
4498 /*
4499  * Changes the specified virtual address range's memory type to that given by
4500  * the parameter "mode".  The specified virtual address range must be
4501  * completely contained within either the direct map or the kernel map.  If
4502  * the virtual address range is contained within the kernel map, then the
4503  * memory type for each of the corresponding ranges of the direct map is also
4504  * changed.  (The corresponding ranges of the direct map are those ranges that
4505  * map the same physical pages as the specified virtual address range.)  These
4506  * changes to the direct map are necessary because Intel describes the
4507  * behavior of their processors as "undefined" if two or more mappings to the
4508  * same physical page have different memory types.
4509  *
4510  * Returns zero if the change completed successfully, and either EINVAL or
4511  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4512  * of the virtual address range was not mapped, and ENOMEM is returned if
4513  * there was insufficient memory available to complete the change.  In the
4514  * latter case, the memory type may have been changed on some part of the
4515  * virtual address range or the direct map.
4516  */
4517 static int
4518 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4519 {
4520 	int error;
4521 
4522 	PMAP_LOCK(kernel_pmap);
4523 	error = pmap_change_attr_locked(va, size, mode);
4524 	PMAP_UNLOCK(kernel_pmap);
4525 	return (error);
4526 }
4527 
4528 static int
4529 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4530 {
4531 	vm_offset_t base, offset, tmpva;
4532 	pt_entry_t l3, *pte, *newpte;
4533 	int lvl;
4534 
4535 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4536 	base = trunc_page(va);
4537 	offset = va & PAGE_MASK;
4538 	size = round_page(offset + size);
4539 
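	/* Only addresses within the DMAP region are currently handled. */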
4540 	if (!VIRT_IN_DMAP(base))
4541 		return (EINVAL);
4542 
4543 	for (tmpva = base; tmpva < base + size; ) {
		pte = pmap_pte(kernel_pmap, tmpva, &lvl);
4545 		if (pte == NULL)
4546 			return (EINVAL);
4547 
4548 		if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) {
4549 			/*
4550 			 * We already have the correct attribute,
4551 			 * ignore this entry.
4552 			 */
4553 			switch (lvl) {
4554 			default:
4555 				panic("Invalid DMAP table level: %d\n", lvl);
4556 			case 1:
4557 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4558 				break;
4559 			case 2:
4560 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
4561 				break;
4562 			case 3:
4563 				tmpva += PAGE_SIZE;
4564 				break;
4565 			}
4566 		} else {
4567 			/*
4568 			 * Split the entry to an level 3 table, then
4569 			 * set the new attribute.
4570 			 */
4571 			switch (lvl) {
4572 			default:
4573 				panic("Invalid DMAP table level: %d\n", lvl);
4574 			case 1:
4575 				newpte = pmap_demote_l1(kernel_pmap, pte,
4576 				    tmpva & ~L1_OFFSET);
4577 				if (newpte == NULL)
4578 					return (EINVAL);
4579 				pte = pmap_l1_to_l2(pte, tmpva);
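				/* FALLTHROUGH */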
4580 			case 2:
4581 				newpte = pmap_demote_l2(kernel_pmap, pte,
4582 				    tmpva & ~L2_OFFSET);
4583 				if (newpte == NULL)
4584 					return (EINVAL);
4585 				pte = pmap_l2_to_l3(pte, tmpva);
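				/* FALLTHROUGH */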
4586 			case 3:
4587 				/* Update the entry */
4588 				l3 = pmap_load(pte);
4589 				l3 &= ~ATTR_IDX_MASK;
4590 				l3 |= ATTR_IDX(mode);
4591 				if (mode == DEVICE_MEMORY)
4592 					l3 |= ATTR_XN;
4593 
4594 				pmap_update_entry(kernel_pmap, pte, l3, tmpva,
4595 				    PAGE_SIZE);
4596 
4597 				/*
4598 				 * If moving to a non-cacheable entry flush
4599 				 * the cache.
4600 				 */
4601 				if (mode == VM_MEMATTR_UNCACHEABLE)
4602 					cpu_dcache_wbinv_range(tmpva, L3_SIZE);
4603 
4604 				break;
4605 			}
4606 			tmpva += PAGE_SIZE;
4607 		}
4608 	}
4609 
4610 	return (0);
4611 }
4612 
4613 /*
4614  * Create an L2 table to map all addresses within an L1 mapping.
4615  */
4616 static pt_entry_t *
4617 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
4618 {
4619 	pt_entry_t *l2, newl2, oldl1;
4620 	vm_offset_t tmpl1;
4621 	vm_paddr_t l2phys, phys;
4622 	vm_page_t ml2;
4623 	int i;
4624 
4625 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4626 	oldl1 = pmap_load(l1);
4627 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
4628 	    ("pmap_demote_l1: Demoting a non-block entry"));
4629 	KASSERT((va & L1_OFFSET) == 0,
4630 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
4631 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
4632 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
4633 
4634 	tmpl1 = 0;
4635 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
4636 		tmpl1 = kva_alloc(PAGE_SIZE);
4637 		if (tmpl1 == 0)
4638 			return (NULL);
4639 	}
4640 
	if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
		    " in pmap %p", va, pmap);
		/* Release the temporary KVA reserved above, if any. */
		if (tmpl1 != 0)
			kva_free(tmpl1, PAGE_SIZE);
		return (NULL);
	}
4647 
4648 	l2phys = VM_PAGE_TO_PHYS(ml2);
4649 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
4650 
4651 	/* Address the range points at */
4652 	phys = oldl1 & ~ATTR_MASK;
	/* The attributes from the old l1 table to be copied */
4654 	newl2 = oldl1 & ATTR_MASK;
4655 
4656 	/* Create the new entries */
4657 	for (i = 0; i < Ln_ENTRIES; i++) {
4658 		l2[i] = newl2 | phys;
4659 		phys += L2_SIZE;
4660 	}
4661 	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
4662 	    ("Invalid l2 page (%lx != %lx)", l2[0],
4663 	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
4664 
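	/*
	 * Map the temporary page so we don't lose access to the l1 table.
	 */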
4665 	if (tmpl1 != 0) {
4666 		pmap_kenter(tmpl1, PAGE_SIZE,
4667 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY);
4668 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
4669 	}
4670 
4671 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
4672 
4673 	if (tmpl1 != 0) {
4674 		pmap_kremove(tmpl1);
4675 		kva_free(tmpl1, PAGE_SIZE);
4676 	}
4677 
4678 	return (l2);
4679 }
4680 
4681 /*
4682  * Create an L3 table to map all addresses within an L2 mapping.
4683  */
4684 static pt_entry_t *
4685 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
4686     struct rwlock **lockp)
4687 {
4688 	pt_entry_t *l3, newl3, oldl2;
4689 	vm_offset_t tmpl2;
4690 	vm_paddr_t l3phys, phys;
4691 	vm_page_t ml3;
4692 	int i;
4693 
4694 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4695 	l3 = NULL;
4696 	oldl2 = pmap_load(l2);
4697 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
4698 	    ("pmap_demote_l2: Demoting a non-block entry"));
4699 	KASSERT((va & L2_OFFSET) == 0,
4700 	    ("pmap_demote_l2: Invalid virtual address %#lx", va));
4701 
4702 	tmpl2 = 0;
4703 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
4704 		tmpl2 = kva_alloc(PAGE_SIZE);
4705 		if (tmpl2 == 0)
4706 			return (NULL);
4707 	}
4708 
4709 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
4710 		ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va),
4711 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
4712 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
4713 		if (ml3 == NULL) {
4714 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
4715 			    " in pmap %p", va, pmap);
4716 			goto fail;
4717 		}
4718 		if (va < VM_MAXUSER_ADDRESS)
4719 			pmap_resident_count_inc(pmap, 1);
4720 	}
4721 
4722 	l3phys = VM_PAGE_TO_PHYS(ml3);
4723 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
4724 
4725 	/* Address the range points at */
4726 	phys = oldl2 & ~ATTR_MASK;
	/* The attributes from the old l2 table to be copied */
4728 	newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE;
4729 
4730 	/*
4731 	 * If the page table page is new, initialize it.
4732 	 */
4733 	if (ml3->wire_count == 1) {
4734 		for (i = 0; i < Ln_ENTRIES; i++) {
4735 			l3[i] = newl3 | phys;
4736 			phys += L3_SIZE;
4737 		}
4738 	}
4739 	KASSERT(l3[0] == ((oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE),
4740 	    ("Invalid l3 page (%lx != %lx)", l3[0],
4741 	    (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE));
4742 
4743 	/*
4744 	 * Map the temporary page so we don't lose access to the l2 table.
4745 	 */
4746 	if (tmpl2 != 0) {
4747 		pmap_kenter(tmpl2, PAGE_SIZE,
4748 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY);
4749 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
4750 	}
4751 
4752 	/*
4753 	 * The spare PV entries must be reserved prior to demoting the
4754 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
4755 	 * of the L2 and the PV lists will be inconsistent, which can result
4756 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
4757 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
4758 	 * PV entry for the 2MB page mapping that is being demoted.
4759 	 */
4760 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
4761 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
4762 
4763 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
4764 
4765 	/*
4766 	 * Demote the PV entry.
4767 	 */
4768 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
4769 		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
4770 
4771 	atomic_add_long(&pmap_l2_demotions, 1);
4772 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
4773 	    " in pmap %p %lx", va, pmap, l3[0]);
4774 
4775 fail:
4776 	if (tmpl2 != 0) {
4777 		pmap_kremove(tmpl2);
4778 		kva_free(tmpl2, PAGE_SIZE);
4779 	}
4780 
4781 	return (l3);
4782 
4783 }
4784 
4785 static pt_entry_t *
4786 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
4787 {
4788 	struct rwlock *lock;
4789 	pt_entry_t *l3;
4790 
4791 	lock = NULL;
4792 	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
4793 	if (lock != NULL)
4794 		rw_wunlock(lock);
4795 	return (l3);
4796 }
4797 
4798 /*
4799  * perform the pmap work for mincore
4800  */
4801 int
4802 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
4803 {
4804 	pd_entry_t *l1p, l1;
4805 	pd_entry_t *l2p, l2;
4806 	pt_entry_t *l3p, l3;
4807 	vm_paddr_t pa;
4808 	bool managed;
4809 	int val;
4810 
4811 	PMAP_LOCK(pmap);
4812 retry:
4813 	pa = 0;
4814 	val = 0;
4815 	managed = false;
4816 
4817 	l1p = pmap_l1(pmap, addr);
4818 	if (l1p == NULL) /* No l1 */
4819 		goto done;
4820 
4821 	l1 = pmap_load(l1p);
4822 	if ((l1 & ATTR_DESCR_MASK) == L1_INVAL)
4823 		goto done;
4824 
4825 	if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) {
4826 		pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET);
4827 		managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4828 		val = MINCORE_SUPER | MINCORE_INCORE;
4829 		if (pmap_page_dirty(l1))
4830 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4831 		if ((l1 & ATTR_AF) == ATTR_AF)
4832 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4833 		goto done;
4834 	}
4835 
4836 	l2p = pmap_l1_to_l2(l1p, addr);
4837 	if (l2p == NULL) /* No l2 */
4838 		goto done;
4839 
4840 	l2 = pmap_load(l2p);
4841 	if ((l2 & ATTR_DESCR_MASK) == L2_INVAL)
4842 		goto done;
4843 
4844 	if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
4845 		pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET);
4846 		managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4847 		val = MINCORE_SUPER | MINCORE_INCORE;
4848 		if (pmap_page_dirty(l2))
4849 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4850 		if ((l2 & ATTR_AF) == ATTR_AF)
4851 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4852 		goto done;
4853 	}
4854 
4855 	l3p = pmap_l2_to_l3(l2p, addr);
4856 	if (l3p == NULL) /* No l3 */
4857 		goto done;
4858 
	l3 = pmap_load(l3p);
4860 	if ((l3 & ATTR_DESCR_MASK) == L3_INVAL)
4861 		goto done;
4862 
4863 	if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) {
4864 		pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET);
4865 		managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4866 		val = MINCORE_INCORE;
4867 		if (pmap_page_dirty(l3))
4868 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4869 		if ((l3 & ATTR_AF) == ATTR_AF)
4870 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4871 	}
4872 
4873 done:
4874 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
4875 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
4876 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
4877 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
4878 			goto retry;
4879 	} else
4880 		PA_UNLOCK_COND(*locked_pa);
4881 	PMAP_UNLOCK(pmap);
4882 
4883 	return (val);
4884 }
4885 
4886 void
4887 pmap_activate(struct thread *td)
4888 {
4889 	pmap_t	pmap;
4890 
4891 	critical_enter();
4892 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4893 	td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0);
4894 	__asm __volatile("msr ttbr0_el1, %0" : :
4895 	    "r"(td->td_proc->p_md.md_l0addr));
4896 	pmap_invalidate_all(pmap);
4897 	critical_exit();
4898 }
4899 
4900 struct pcb *
4901 pmap_switch(struct thread *old, struct thread *new)
4902 {
4903 	pcpu_bp_harden bp_harden;
4904 	struct pcb *pcb;
4905 
4906 	/* Store the new curthread */
4907 	PCPU_SET(curthread, new);
4908 
4909 	/* And the new pcb */
4910 	pcb = new->td_pcb;
4911 	PCPU_SET(curpcb, pcb);
4912 
4913 	/*
4914 	 * TODO: We may need to flush the cache here if switching
4915 	 * to a user process.
4916 	 */
4917 
4918 	if (old == NULL ||
4919 	    old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) {
4920 		__asm __volatile(
4921 		    /* Switch to the new pmap */
4922 		    "msr	ttbr0_el1, %0	\n"
4923 		    "isb			\n"
4924 
4925 		    /* Invalidate the TLB */
4926 		    "dsb	ishst		\n"
4927 		    "tlbi	vmalle1is	\n"
4928 		    "dsb	ish		\n"
4929 		    "isb			\n"
4930 		    : : "r"(new->td_proc->p_md.md_l0addr));
4931 
4932 		/*
4933 		 * Stop userspace from training the branch predictor against
4934 		 * other processes. This will call into a CPU specific
4935 		 * function that clears the branch predictor state.
4936 		 */
4937 		bp_harden = PCPU_GET(bp_harden);
4938 		if (bp_harden != NULL)
4939 			bp_harden();
4940 	}
4941 
4942 	return (pcb);
4943 }
4944 
4945 void
4946 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
4947 {
4948 
4949 	if (va >= VM_MIN_KERNEL_ADDRESS) {
4950 		cpu_icache_sync_range(va, sz);
4951 	} else {
4952 		u_int len, offset;
4953 		vm_paddr_t pa;
4954 
4955 		/* Find the length of data in this page to flush */
4956 		offset = va & PAGE_MASK;
4957 		len = imin(PAGE_SIZE - offset, sz);
4958 
4959 		while (sz != 0) {
4960 			/* Extract the physical address & find it in the DMAP */
4961 			pa = pmap_extract(pmap, va);
4962 			if (pa != 0)
4963 				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
4964 
4965 			/* Move to the next page */
4966 			sz -= len;
4967 			va += len;
4968 			/* Set the length for the next iteration */
4969 			len = imin(PAGE_SIZE, sz);
4970 		}
4971 	}
4972 }
4973 
4974 int
4975 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
4976 {
4977 #ifdef SMP
4978 	register_t intr;
4979 	uint64_t par;
4980 
4981 	switch (ESR_ELx_EXCEPTION(esr)) {
4982 	case EXCP_INSN_ABORT_L:
4983 	case EXCP_INSN_ABORT:
4984 	case EXCP_DATA_ABORT_L:
4985 	case EXCP_DATA_ABORT:
4986 		break;
4987 	default:
4988 		return (KERN_FAILURE);
4989 	}
4990 
	/* Data and insn aborts use same encoding for FSC field. */
4992 	switch (esr & ISS_DATA_DFSC_MASK) {
4993 	case ISS_DATA_DFSC_TF_L0:
4994 	case ISS_DATA_DFSC_TF_L1:
4995 	case ISS_DATA_DFSC_TF_L2:
4996 	case ISS_DATA_DFSC_TF_L3:
4997 		PMAP_LOCK(pmap);
4998 		/* Ask the MMU to check the address */
4999 		intr = intr_disable();
5000 		if (pmap == kernel_pmap)
5001 			par = arm64_address_translate_s1e1r(far);
5002 		else
5003 			par = arm64_address_translate_s1e0r(far);
5004 		intr_restore(intr);
5005 		PMAP_UNLOCK(pmap);
5006 
5007 		/*
		 * If the translation was successful, the fault was caused by
		 * a transiently invalid mapping, e.g. an in-progress
		 * break-before-make sequence, so return success to the trap
		 * handler.
5011 		 */
5012 		if (PAR_SUCCESS(par))
5013 			return (KERN_SUCCESS);
5014 		break;
5015 	default:
5016 		break;
5017 	}
5018 #endif
5019 
5020 	return (KERN_FAILURE);
5021 }
5022 
5023 /*
5024  *	Increase the starting virtual address of the given mapping if a
5025  *	different alignment might result in more superpage mappings.
5026  */
5027 void
5028 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5029     vm_offset_t *addr, vm_size_t size)
5030 {
5031 	vm_offset_t superpage_offset;
5032 
5033 	if (size < L2_SIZE)
5034 		return;
5035 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5036 		offset += ptoa(object->pg_color);
5037 	superpage_offset = offset & L2_OFFSET;
5038 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
5039 	    (*addr & L2_OFFSET) == superpage_offset)
5040 		return;
5041 	if ((*addr & L2_OFFSET) < superpage_offset)
5042 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
5043 	else
5044 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
5045 }
5046 
5047 /**
5048  * Get the kernel virtual address of a set of physical pages. If there are
5049  * physical addresses not covered by the DMAP perform a transient mapping
5050  * that will be removed when calling pmap_unmap_io_transient.
5051  *
 * \param page        The pages for which the caller wishes to obtain
 *                    kernel virtual addresses.
5054  * \param vaddr       On return contains the kernel virtual memory address
5055  *                    of the pages passed in the page parameter.
5056  * \param count       Number of pages passed in.
5057  * \param can_fault   TRUE if the thread using the mapped pages can take
5058  *                    page faults, FALSE otherwise.
5059  *
5060  * \returns TRUE if the caller must call pmap_unmap_io_transient when
5061  *          finished or FALSE otherwise.
5062  *
5063  */
5064 boolean_t
5065 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5066     boolean_t can_fault)
5067 {
5068 	vm_paddr_t paddr;
5069 	boolean_t needs_mapping;
5070 	int error, i;
5071 
5072 	/*
5073 	 * Allocate any KVA space that we need, this is done in a separate
5074 	 * loop to prevent calling vmem_alloc while pinned.
5075 	 */
5076 	needs_mapping = FALSE;
5077 	for (i = 0; i < count; i++) {
5078 		paddr = VM_PAGE_TO_PHYS(page[i]);
5079 		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
5080 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
5081 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
5082 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
5083 			needs_mapping = TRUE;
5084 		} else {
5085 			vaddr[i] = PHYS_TO_DMAP(paddr);
5086 		}
5087 	}
5088 
5089 	/* Exit early if everything is covered by the DMAP */
5090 	if (!needs_mapping)
5091 		return (FALSE);
5092 
5093 	if (!can_fault)
5094 		sched_pin();
5095 	for (i = 0; i < count; i++) {
5096 		paddr = VM_PAGE_TO_PHYS(page[i]);
5097 		if (!PHYS_IN_DMAP(paddr)) {
5098 			panic(
5099 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
5100 		}
5101 	}
5102 
5103 	return (needs_mapping);
5104 }
5105 
5106 void
5107 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5108     boolean_t can_fault)
5109 {
5110 	vm_paddr_t paddr;
5111 	int i;
5112 
5113 	if (!can_fault)
5114 		sched_unpin();
5115 	for (i = 0; i < count; i++) {
5116 		paddr = VM_PAGE_TO_PHYS(page[i]);
5117 		if (!PHYS_IN_DMAP(paddr)) {
5118 			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
5119 		}
5120 	}
5121 }
5122