xref: /netbsd/sys/arch/alpha/alpha/pmap.c (revision 1e11a48f)
1 /* $NetBSD: pmap.c,v 1.307 2022/04/09 23:38:31 riastradh Exp $ */
2 
3 /*-
4  * Copyright (c) 1998, 1999, 2000, 2001, 2007, 2008, 2020
5  * 	The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
10  * NASA Ames Research Center, by Andrew Doran and Mindaugas Rasiukevicius,
11  * and by Chris G. Demetriou.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  * POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * Copyright (c) 1991, 1993
37  *	The Regents of the University of California.  All rights reserved.
38  *
39  * This code is derived from software contributed to Berkeley by
40  * the Systems Programming Group of the University of Utah Computer
41  * Science Department.
42  *
43  * Redistribution and use in source and binary forms, with or without
44  * modification, are permitted provided that the following conditions
45  * are met:
46  * 1. Redistributions of source code must retain the above copyright
47  *    notice, this list of conditions and the following disclaimer.
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  * 3. Neither the name of the University nor the names of its contributors
52  *    may be used to endorse or promote products derived from this software
53  *    without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  *
67  *	@(#)pmap.c	8.6 (Berkeley) 5/27/94
68  */
69 
70 /*
71  * DEC Alpha physical map management code.
72  *
73  * History:
74  *
75  *	This pmap started life as a Motorola 68851/68030 pmap,
76  *	written by Mike Hibler at the University of Utah.
77  *
78  *	It was modified for the DEC Alpha by Chris Demetriou
79  *	at Carnegie Mellon University.
80  *
81  *	Support for non-contiguous physical memory was added by
82  *	Jason R. Thorpe of the Numerical Aerospace Simulation
83  *	Facility, NASA Ames Research Center and Chris Demetriou.
84  *
85  *	Page table management and a major cleanup were undertaken
86  *	by Jason R. Thorpe, with lots of help from Ross Harvey of
87  *	Avalon Computer Systems and from Chris Demetriou.
88  *
89  *	Support for the new UVM pmap interface was written by
90  *	Jason R. Thorpe.
91  *
92  *	Support for ASNs was written by Jason R. Thorpe, again
93  *	with help from Chris Demetriou and Ross Harvey.
94  *
95  *	The locking protocol was written by Jason R. Thorpe,
96  *	using Chuck Cranor's i386 pmap for UVM as a model.
97  *
98  *	TLB shootdown code was written (and then subsequently
99  *	rewritten some years later, borrowing some ideas from
100  *	the x86 pmap) by Jason R. Thorpe.
101  *
102  *	Multiprocessor modifications by Andrew Doran and
103  *	Jason R. Thorpe.
104  *
105  * Notes:
106  *
107  *	All user page table access is done via K0SEG.  Kernel
108  *	page table access is done via the recursive Virtual Page
109  *	Table because kernel PT pages are pre-allocated and never
110  *	freed, so no VPT fault handling is required.
111  */
112 
113 /*
114  *	Manages physical address maps.
115  *
116  *	Since the information managed by this module is
117  *	also stored by the logical address mapping module,
118  *	this module may throw away valid virtual-to-physical
119  *	mappings at almost any time.  However, invalidations
120  *	of virtual-to-physical mappings must be done as
121  *	requested.
122  *
123  *	In order to cope with hardware architectures which
124  *	make virtual-to-physical map invalidates expensive,
125  *	this module may delay invalidation or reduced-protection
126  *	operations until such time as they are actually
127  *	necessary.  This module is given full information as
128  *	to which processors are currently using which maps,
129  *	and to when physical maps must be made correct.
130  */
131 
132 #include "opt_lockdebug.h"
133 #include "opt_sysv.h"
134 #include "opt_multiprocessor.h"
135 
136 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
137 
138 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.307 2022/04/09 23:38:31 riastradh Exp $");
139 
140 #include <sys/param.h>
141 #include <sys/systm.h>
142 #include <sys/kernel.h>
143 #include <sys/proc.h>
144 #include <sys/pool.h>
145 #include <sys/buf.h>
146 #include <sys/evcnt.h>
147 #include <sys/atomic.h>
148 #include <sys/cpu.h>
149 
150 #include <uvm/uvm.h>
151 
152 #if defined(MULTIPROCESSOR)
153 #include <machine/rpb.h>
154 #endif
155 
156 #ifdef DEBUG
157 #define	PDB_FOLLOW	0x0001
158 #define	PDB_INIT	0x0002
159 #define	PDB_ENTER	0x0004
160 #define	PDB_REMOVE	0x0008
161 #define	PDB_CREATE	0x0010
162 #define	PDB_PTPAGE	0x0020
163 #define	PDB_ASN		0x0040
164 #define	PDB_BITS	0x0080
165 #define	PDB_COLLECT	0x0100
166 #define	PDB_PROTECT	0x0200
167 #define	PDB_BOOTSTRAP	0x1000
168 #define	PDB_PARANOIA	0x2000
169 #define	PDB_WIRING	0x4000
170 #define	PDB_PVDUMP	0x8000
171 
172 int debugmap = 0;
173 int pmapdebug = PDB_PARANOIA;
174 #endif
175 
176 #if defined(MULTIPROCESSOR)
177 #define	PMAP_MP(x)	x
178 #else
179 #define	PMAP_MP(x)	__nothing
180 #endif /* MULTIPROCESSOR */
181 
182 /*
183  * Given a map and a machine independent protection code,
184  * convert to an alpha protection code.
185  */
186 #define pte_prot(m, p)	(protection_codes[m == pmap_kernel() ? 0 : 1][p])
187 static int	protection_codes[2][8] __read_mostly;
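
/*
 * Example (editor's sketch, not part of the original source): for a
 * hypothetical user pmap "pmap", the following picks row 1 (user) and
 * column VM_PROT_READ|VM_PROT_WRITE of the table filled in by
 * alpha_protection_init(), yielding the Alpha access bits appropriate
 * for a writable user mapping.
 */
#if 0	/* illustration only */
	int prot = pte_prot(pmap, VM_PROT_READ | VM_PROT_WRITE);
#endif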
188 
189 /*
190  * kernel_lev1map:
191  *
192  *	Kernel level 1 page table.  This maps all kernel level 2
193  *	page table pages, and is used as a template for all user
194  *	pmap level 1 page tables.  When a new user level 1 page
195  *	table is allocated, all kernel_lev1map PTEs for kernel
196  *	addresses are copied to the new map.
197  *
198  *	The kernel also has an initial set of kernel level 2 page
199  *	table pages.  These map the kernel level 3 page table pages.
200  *	As kernel level 3 page table pages are added, more level 2
201  *	page table pages may be added to map them.  These pages are
202  *	never freed.
203  *
204  *	Finally, the kernel also has an initial set of kernel level
205  *	3 page table pages.  These map pages in K1SEG.  More level
206  *	3 page table pages may be added at run-time if additional
207  *	K1SEG address space is required.  These pages are never freed.
208  *
209  * NOTE: When mappings are inserted into the kernel pmap, all
210  * level 2 and level 3 page table pages must already be allocated
211  * and mapped into the parent page table.
212  */
213 pt_entry_t	*kernel_lev1map __read_mostly;
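
/*
 * Editor's sketch (not part of the original source): resolving a
 * hypothetical kernel VA "va" through the three levels described
 * above, using the same helper macros the DEBUG variant of
 * PMAP_KERNEL_PTE() uses later in this file (validity checks omitted).
 */
#if 0	/* illustration only */
	pt_entry_t *l1pte = pmap_l1pte(kernel_lev1map, va);
	pt_entry_t *l2pte = pmap_l2pte(kernel_lev1map, va, l1pte);
	pt_entry_t *l3pte = pmap_l3pte(kernel_lev1map, va, l2pte);
#endif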
214 
215 /*
216  * Virtual Page Table.
217  */
218 static pt_entry_t *VPT __read_mostly;
219 
220 static struct {
221 	struct pmap k_pmap;
222 } kernel_pmap_store __cacheline_aligned;
223 
224 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store.k_pmap;
225 
226 /* PA of first available physical page */
227 paddr_t    	avail_start __read_mostly;
228 
229 /* PA of last available physical page */
230 paddr_t		avail_end __read_mostly;
231 
232 /* VA of last avail page (end of kernel AS) */
233 static vaddr_t	virtual_end __read_mostly;
234 
235 /* Has pmap_init completed? */
236 static bool pmap_initialized __read_mostly;
237 
238 /* Instrumentation */
239 u_long		pmap_pages_stolen __read_mostly;
240 
241 /*
242  * This variable contains the number of CPU IDs we need to allocate
243  * space for when allocating the pmap structure.  It is used to
244  * size the per-CPU array of ASN and ASN generation numbers.
245  */
246 static u_long 	pmap_ncpuids __read_mostly;
247 
248 #ifndef PMAP_PV_LOWAT
249 #define	PMAP_PV_LOWAT	16
250 #endif
251 int		pmap_pv_lowat __read_mostly = PMAP_PV_LOWAT;
252 
253 /*
254  * List of all pmaps, used to update them when e.g. additional kernel
255  * page tables are allocated.  This list is kept LRU-ordered by
256  * pmap_activate().
257  */
258 static TAILQ_HEAD(, pmap) pmap_all_pmaps __cacheline_aligned;
259 
260 /*
261  * Instrument the number of calls to pmap_growkernel().
262  */
263 static struct evcnt pmap_growkernel_evcnt __read_mostly;
264 
265 /*
266  * The pools from which pmap structures and sub-structures are allocated.
267  */
268 static struct pool_cache pmap_pmap_cache __read_mostly;
269 static struct pool_cache pmap_l1pt_cache __read_mostly;
270 static struct pool_cache pmap_pv_cache __read_mostly;
271 
272 CTASSERT(offsetof(struct pmap, pm_percpu[0]) == COHERENCY_UNIT);
273 CTASSERT(PMAP_SIZEOF(ALPHA_MAXPROCS) < ALPHA_PGBYTES);
274 CTASSERT(sizeof(struct pmap_percpu) == COHERENCY_UNIT);
275 
276 /*
277  * Address Space Numbers.
278  *
279  * On many implementations of the Alpha architecture, the TLB entries and
280  * I-cache blocks are tagged with a unique number within an implementation-
281  * specified range.  When a process context becomes active, the ASN is used
282  * to match TLB entries; if a TLB entry for a particular VA does not match
283  * the current ASN, it is ignored (one could think of the processor as
284  * having a collection of <max ASN> separate TLBs).  This allows operating
285  * system software to skip the TLB flush that would otherwise be necessary
286  * at context switch time.
287  *
288  * Alpha PTEs have a bit in them (PG_ASM - Address Space Match) that
289  * causes TLB entries to match any ASN.  The PALcode also provides
290  * a TBI (Translation Buffer Invalidate) operation that flushes all
291  * TLB entries that _do not_ have PG_ASM.  We use this bit for kernel
292  * mappings, so that invalidation of all user mappings does not invalidate
293  * kernel mappings (which are consistent across all processes).
294  *
295  * pmap_next_asn always indicates the next ASN to use.  When
296  * pmap_next_asn exceeds pmap_max_asn, we start a new ASN generation.
297  *
298  * When a new ASN generation is created, the per-process (i.e. non-PG_ASM)
299  * TLB entries and the I-cache are flushed, the generation number is bumped,
300  * and pmap_next_asn is changed to indicate the first non-reserved ASN.
301  *
302  * We reserve ASN #0 for pmaps that use the global kernel_lev1map.  This
303  * prevents LWPs that use the kernel pmap from making accidental
304  * accesses to stale user-space mappings.  This is important because
305  * the PALcode may use the recursive VPT to service TLB misses.
306  *
307  * By reserving an ASN for the kernel, we are guaranteeing that an lwp
308  * will not see any valid user space TLB entries until it passes through
309  * pmap_activate() for the first time.
310  *
311  * On processors that do not support ASNs, the PALcode invalidates
312  * non-ASM TLB entries automatically on swpctx.  We completely skip
313  * the ASN machinery in this case because the PALcode neither reads
314  * nor writes that field of the HWPCB.
315  */
316 
317 /* max ASN supported by the system */
318 static u_int	pmap_max_asn __read_mostly;
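
/*
 * Editor's sketch (not part of the original source): the essence of
 * the per-CPU ASN check performed at activation time by
 * pmap_asn_alloc().  The real code also handles generation wrap,
 * lazy I-stream syncs, and the no-ASN case; "pmap" and "ci" are
 * hypothetical placeholders here.
 */
#if 0	/* illustration only */
	struct pmap_percpu * const pmc = &pmap->pm_percpu[ci->ci_cpuid];

	if (pmc->pmc_asngen != ci->ci_asn_gen) {
		/* Stale generation: this pmap needs a fresh ASN here. */
		if (ci->ci_next_asn > pmap_max_asn) {
			/*
			 * Out of ASNs: flush the non-ASM TLB entries and
			 * the I-cache, then start a new generation.
			 */
			ALPHA_TBIAP();
			alpha_pal_imb();
			ci->ci_asn_gen++;
			ci->ci_next_asn = PMAP_ASN_FIRST_USER;
		}
		pmc->pmc_asn = ci->ci_next_asn++;
		pmc->pmc_asngen = ci->ci_asn_gen;
	}
#endif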
319 
320 /*
321  * Locking:
322  *
323  *	READ/WRITE LOCKS
324  *	----------------
325  *
326  *	* pmap_main_lock - This lock is used to prevent deadlock and/or
327  *	  provide mutex access to the pmap module.  Most operations lock
328  *	  the pmap first, then PV lists as needed.  However, some operations,
329  *	  such as pmap_page_protect(), lock the PV lists before locking
330  *	  the pmaps.  To prevent deadlock, we require a mutex lock on the
331  *	  pmap module if locking in the PV->pmap direction.  This is
332  *	  implemented by acquiring a (shared) read lock on pmap_main_lock
333  *	  if locking pmap->PV and a (exclusive) write lock if locking in
334  *	  the PV->pmap direction.  Since only one thread can hold a write
335  *	  lock at a time, this provides the mutex.
336  *
337  *	MUTEXES
338  *	-------
339  *
340  *	* pmap lock (global hash) - These locks protect the pmap structures.
341  *
342  *	* pmap activation lock (global hash) - These IPL_SCHED spin locks
343  *	  synchronize pmap_activate() and TLB shootdowns.  This has a lock
344  *	  ordering constraint with the tlb_lock:
345  *
346  *		tlb_lock -> pmap activation lock
347  *
348  *	* pvh_lock (global hash) - These locks protect the PV lists for
349  *	  managed pages.
350  *
351  *	* tlb_lock - This IPL_VM lock serializes local and remote TLB
352  *	  invalidation.
353  *
354  *	* pmap_all_pmaps_lock - This lock protects the global list of
355  *	  all pmaps.
356  *
357  *	* pmap_growkernel_lock - This lock protects pmap_growkernel()
358  *	  and the virtual_end variable.
359  *
360  *	  There is a lock ordering constraint for pmap_growkernel_lock.
361  *	  pmap_growkernel() acquires the locks in the following order:
362  *
363  *		pmap_growkernel_lock (write) -> pmap_all_pmaps_lock ->
364  *		    pmap lock
365  *
366  *	  We need to ensure consistency between user pmaps and the
367  *	  kernel_lev1map.  For this reason, pmap_growkernel_lock must
368  *	  be held to prevent kernel_lev1map changing across pmaps
369  *	  being added to / removed from the global pmaps list.
370  *
371  *	Address space number management (global ASN counters and per-pmap
372  *	ASN state) are not locked; they use arrays of values indexed
373  *	per-processor.
374  *
375  *	All internal functions which operate on a pmap are called
376  *	with the pmap already locked by the caller (which will be
377  *	an interface function).
378  */
379 static krwlock_t pmap_main_lock __cacheline_aligned;
380 static kmutex_t pmap_all_pmaps_lock __cacheline_aligned;
381 static krwlock_t pmap_growkernel_lock __cacheline_aligned;
382 
383 #define	PMAP_MAP_TO_HEAD_LOCK()		rw_enter(&pmap_main_lock, RW_READER)
384 #define	PMAP_MAP_TO_HEAD_UNLOCK()	rw_exit(&pmap_main_lock)
385 #define	PMAP_HEAD_TO_MAP_LOCK()		rw_enter(&pmap_main_lock, RW_WRITER)
386 #define	PMAP_HEAD_TO_MAP_UNLOCK()	rw_exit(&pmap_main_lock)
387 
388 static union {
389 	kmutex_t	lock;
390 	uint8_t		pad[COHERENCY_UNIT];
391 } pmap_pvh_locks[64] __cacheline_aligned;
392 
393 #define	PVH_LOCK_HASH(pg)						\
394 	((((uintptr_t)(pg)) >> 6) & 63)
395 
396 static inline kmutex_t *
397 pmap_pvh_lock(struct vm_page *pg)
398 {
399 	return &pmap_pvh_locks[PVH_LOCK_HASH(pg)].lock;
400 }
401 
402 static union {
403 	struct {
404 		kmutex_t	lock;
405 		kmutex_t	activation_lock;
406 	} locks;
407 	uint8_t		pad[COHERENCY_UNIT];
408 } pmap_pmap_locks[64] __cacheline_aligned;
409 
410 #define	PMAP_LOCK_HASH(pm)						\
411 	((((uintptr_t)(pm)) >> 6) & 63)
412 
413 static inline kmutex_t *
414 pmap_pmap_lock(pmap_t const pmap)
415 {
416 	return &pmap_pmap_locks[PMAP_LOCK_HASH(pmap)].locks.lock;
417 }
418 
419 static inline kmutex_t *
420 pmap_activation_lock(pmap_t const pmap)
421 {
422 	return &pmap_pmap_locks[PMAP_LOCK_HASH(pmap)].locks.activation_lock;
423 }
424 
425 #define	PMAP_LOCK(pmap)		mutex_enter(pmap_pmap_lock(pmap))
426 #define	PMAP_UNLOCK(pmap)	mutex_exit(pmap_pmap_lock(pmap))
427 
428 #define	PMAP_ACT_LOCK(pmap)	mutex_spin_enter(pmap_activation_lock(pmap))
429 #define	PMAP_ACT_TRYLOCK(pmap)	mutex_tryenter(pmap_activation_lock(pmap))
430 #define	PMAP_ACT_UNLOCK(pmap)	mutex_spin_exit(pmap_activation_lock(pmap))
431 
432 #if defined(MULTIPROCESSOR)
433 #define	pmap_all_cpus()		cpus_running
434 #else
435 #define	pmap_all_cpus()		~0UL
436 #endif /* MULTIPROCESSOR */
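
/*
 * Editor's sketch (not part of the original source): roughly how the
 * two locking directions described above appear in callers.  "pmap",
 * "pg", and the elided work are hypothetical placeholders.
 */
#if 0	/* illustration only */
	/* pmap -> PV direction (e.g. removing mappings from a pmap): */
	PMAP_MAP_TO_HEAD_LOCK();		/* shared */
	PMAP_LOCK(pmap);
	/* ... walk PTEs, taking pmap_pvh_lock(pg) as pages are visited ... */
	PMAP_UNLOCK(pmap);
	PMAP_MAP_TO_HEAD_UNLOCK();

	/* PV -> pmap direction (e.g. pmap_page_protect()): */
	PMAP_HEAD_TO_MAP_LOCK();		/* exclusive */
	/* ... walk the page's PV list, locking each owning pmap in turn ... */
	PMAP_HEAD_TO_MAP_UNLOCK();
#endif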
437 
438 /*
439  * TLB context structure; see description in "TLB management" section
440  * below.
441  */
442 #define	TLB_CTX_MAXVA		8
443 #define	TLB_CTX_ALLVA		PAGE_MASK
444 struct pmap_tlb_context {
445 	uintptr_t		t_addrdata[TLB_CTX_MAXVA];
446 	pmap_t			t_pmap;
447 	struct pmap_pagelist	t_freeptq;
448 	struct pmap_pvlist	t_freepvq;
449 };
450 
451 /*
452  * Internal routines
453  */
454 static void	alpha_protection_init(void);
455 static pt_entry_t pmap_remove_mapping(pmap_t, vaddr_t, pt_entry_t *, bool,
456 				      pv_entry_t *,
457 				      struct pmap_tlb_context *);
458 static void	pmap_changebit(struct vm_page *, pt_entry_t, pt_entry_t,
459 			       struct pmap_tlb_context *);
460 
461 /*
462  * PT page management functions.
463  */
464 static int	pmap_ptpage_alloc(pmap_t, pt_entry_t *, int);
465 static void	pmap_ptpage_free(pmap_t, pt_entry_t *,
466 				 struct pmap_tlb_context *);
467 static void	pmap_l3pt_delref(pmap_t, vaddr_t, pt_entry_t *,
468 		     struct pmap_tlb_context *);
469 static void	pmap_l2pt_delref(pmap_t, pt_entry_t *, pt_entry_t *,
470 		     struct pmap_tlb_context *);
471 static void	pmap_l1pt_delref(pmap_t, pt_entry_t *);
472 
473 static void	*pmap_l1pt_alloc(struct pool *, int);
474 static void	pmap_l1pt_free(struct pool *, void *);
475 
476 static struct pool_allocator pmap_l1pt_allocator = {
477 	pmap_l1pt_alloc, pmap_l1pt_free, 0,
478 };
479 
480 static int	pmap_l1pt_ctor(void *, void *, int);
481 
482 /*
483  * PV table management functions.
484  */
485 static int	pmap_pv_enter(pmap_t, struct vm_page *, vaddr_t, pt_entry_t *,
486 			      bool, pv_entry_t);
487 static void	pmap_pv_remove(pmap_t, struct vm_page *, vaddr_t, bool,
488 			       pv_entry_t *, struct pmap_tlb_context *);
489 static void	*pmap_pv_page_alloc(struct pool *, int);
490 static void	pmap_pv_page_free(struct pool *, void *);
491 
492 static struct pool_allocator pmap_pv_page_allocator = {
493 	pmap_pv_page_alloc, pmap_pv_page_free, 0,
494 };
495 
496 #ifdef DEBUG
497 void	pmap_pv_dump(paddr_t);
498 #endif
499 
500 #define	pmap_pv_alloc()		pool_cache_get(&pmap_pv_cache, PR_NOWAIT)
501 #define	pmap_pv_free(pv)	pool_cache_put(&pmap_pv_cache, (pv))
502 
503 /*
504  * Generic routine for freeing pages on a pmap_pagelist back to
505  * the system.
506  */
507 static void
508 pmap_pagelist_free(struct pmap_pagelist * const list)
509 {
510 	struct vm_page *pg;
511 
512 	while ((pg = LIST_FIRST(list)) != NULL) {
513 		LIST_REMOVE(pg, pageq.list);
514 		/* Fix up ref count; it's not always 0 when we get here. */
515 		PHYSPAGE_REFCNT_SET(pg, 0);
516 		uvm_pagefree(pg);
517 	}
518 }
519 
520 /*
521  * Generic routine for freeing a list of PV entries back to the
522  * system.
523  */
524 static void
525 pmap_pvlist_free(struct pmap_pvlist * const list)
526 {
527 	pv_entry_t pv;
528 
529 	while ((pv = LIST_FIRST(list)) != NULL) {
530 		LIST_REMOVE(pv, pv_link);
531 		pmap_pv_free(pv);
532 	}
533 }
534 
535 /*
536  * TLB management.
537  *
538  * TLB invalidations need to be performed on local and remote CPUs
539  * whenever parts of the PTE that the hardware or PALcode understands
540  * change.  In order to amortize the cost of these operations, we will
541  * queue up to 8 addresses to invalidate in a batch.  Any more than
542  * that, and we will hit the entire TLB.
543  *
544  * Some things that add complexity:
545  *
546  * ==> ASNs. A CPU may have valid TLB entries for other than the current
547  *     address space.  We can only invalidate TLB entries for the current
548  *     address space, so when asked to invalidate a VA for the non-current
549  *     pmap on a given CPU, we simply invalidate the ASN for that pmap/CPU
550  *     tuple so that a new one is allocated on the next activation on that
551  *     CPU.  N.B. that for CPUs that don't implement ASNs, SWPCTX does all
552  *     the work necessary, so we can skip some work in the pmap module
553  *     itself.
554  *
555  *     When a pmap is activated on a given CPU, we set a corresponding
556  *     bit in pmap::pm_cpus, indicating that it potentially has valid
557  *     TLB entries for that address space.  This bitmap is then used to
558  *     determine which remote CPUs need to be notified of invalidations.
559  *     The bit is cleared when the ASN is invalidated on that CPU.
560  *
561  *     In order to serialize with activating an address space on a
562  *     given CPU (so that we can reliably send notifications only to the
563  *     relevant remote CPUs), we acquire the pmap activation lock in
564  *     pmap_activate() and also hold it while remote shootdowns take place.
565  *     This does not apply to the kernel pmap; all CPUs are notified about
566  *     invalidations for the kernel pmap, and the pmap lock is not held
567  *     in pmap_activate() for the kernel pmap.
568  *
569  * ==> P->V operations (e.g. pmap_page_protect()) may require sending
570  *     invalidations for multiple address spaces.  We only track one
571  *     address space at a time, and if we encounter more than one, then
572  *     the notification each CPU gets is to hit the entire TLB.  Note
573  *     also that we can't serialize with pmap_activate() in this case,
574  *     so all CPUs will get the notification, and they check when
575  *     processing the notification if the pmap is current on that CPU.
576  *
577  * Invalidation information is gathered into a pmap_tlb_context structure
578  * that includes room for 8 VAs, the pmap the VAs belong to, a bitmap of
579  * CPUs to be notified, and a list for PT pages that are freed during
580  * removal of mappings.  The number of valid addresses in the list as
581  * well as flags are squeezed into the lower bits of the first two VAs.
582  * Storage for this structure is allocated on the stack.  We need to be
583  * careful to keep the size of this structure under control.
584  *
585  * When notifying remote CPUs, we acquire the tlb_lock (which also
586  * blocks IPIs), record the pointer to our context structure, set a
587  * global bitmap of CPUs to be notified, and then send the IPIs to
588  * each victim.  While the other CPUs are in-flight, we then perform
589  * any invalidations necessary on the local CPU.  Once that is done,
590  * we then wait for the global context pointer to be cleared, which
591  * will be done by the final remote CPU to complete its work.  This
592  * method reduces cache line contention during processing.
593  *
594  * When removing mappings in user pmaps, this implementation frees page
595  * table pages back to the VM system once they contain no valid mappings.
596  * As we do this, we must ensure to invalidate TLB entries that the
597  * CPU might hold for the respective recursive VPT mappings.  This must
598  * be done whenever an L1 or L2 PTE is invalidated.  Until these VPT
599  * translations are invalidated, the PT pages must not be reused.  For
600  * this reason, we keep a list of freed PT pages in the context structure
601  * and drain them off once all invalidations are complete.
602  *
603  * NOTE: The value of TLB_CTX_MAXVA is tuned to accommodate the UBC
604  * window size (defined as 64KB on alpha in <machine/vmparam.h>).
605  */
606 
607 #define	TLB_CTX_F_ASM		__BIT(0)
608 #define	TLB_CTX_F_IMB		__BIT(1)
609 #define	TLB_CTX_F_KIMB		__BIT(2)
610 #define	TLB_CTX_F_PV		__BIT(3)
611 #define	TLB_CTX_F_MULTI		__BIT(4)
612 
613 #define	TLB_CTX_COUNT(ctx)	((ctx)->t_addrdata[0] & PAGE_MASK)
614 #define	TLB_CTX_INC_COUNT(ctx)	 (ctx)->t_addrdata[0]++
615 #define	TLB_CTX_SET_ALLVA(ctx)	 (ctx)->t_addrdata[0] |= TLB_CTX_ALLVA
616 
617 #define	TLB_CTX_FLAGS(ctx)	((ctx)->t_addrdata[1] & PAGE_MASK)
618 #define	TLB_CTX_SET_FLAG(ctx, f) (ctx)->t_addrdata[1] |= (f)
619 
620 #define	TLB_CTX_VA(ctx, i)	((ctx)->t_addrdata[(i)] & ~PAGE_MASK)
621 #define	TLB_CTX_SETVA(ctx, i, va)					\
622 	(ctx)->t_addrdata[(i)] = (va) | ((ctx)->t_addrdata[(i)] & PAGE_MASK)
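
/*
 * Editor's illustration (not part of the original source): all VAs
 * queued in t_addrdata[] are page-aligned, so their low PAGE_MASK
 * bits are free; slot 0 also carries the count and slot 1 the flags,
 * which is what the macros above pack and unpack.  "va0" is a
 * hypothetical page-aligned address.
 */
#if 0	/* illustration only */
	struct pmap_tlb_context ctx;

	ctx.t_addrdata[0] = 0;		/* count = 0, VA slot 0 empty */
	ctx.t_addrdata[1] = 0;		/* flags = 0, VA slot 1 empty */
	TLB_CTX_SETVA(&ctx, 0, va0);
	TLB_CTX_INC_COUNT(&ctx);
	TLB_CTX_SET_FLAG(&ctx, TLB_CTX_F_ASM);

	KASSERT(TLB_CTX_COUNT(&ctx) == 1);
	KASSERT(TLB_CTX_VA(&ctx, 0) == va0);
	KASSERT(TLB_CTX_FLAGS(&ctx) == TLB_CTX_F_ASM);
#endif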
623 
624 static struct {
625 	kmutex_t	lock;
626 	struct evcnt	events;
627 } tlb_shootdown __cacheline_aligned;
628 #define	tlb_lock	tlb_shootdown.lock
629 #define	tlb_evcnt	tlb_shootdown.events
630 #if defined(MULTIPROCESSOR)
631 static const struct pmap_tlb_context *tlb_context __cacheline_aligned;
632 static unsigned long tlb_pending __cacheline_aligned;
633 #endif /* MULTIPROCESSOR */
634 
635 #if defined(TLB_STATS)
636 #define	TLB_COUNT_DECL(cnt)	static struct evcnt tlb_stat_##cnt
637 #define	TLB_COUNT(cnt)		atomic_inc_64(&tlb_stat_##cnt .ev_count)
638 #define	TLB_COUNT_ATTACH(cnt)						\
639 	evcnt_attach_dynamic_nozero(&tlb_stat_##cnt, EVCNT_TYPE_MISC,	\
640 	    NULL, "TLB", #cnt)
641 
642 TLB_COUNT_DECL(invalidate_multi_tbia);
643 TLB_COUNT_DECL(invalidate_multi_tbiap);
644 TLB_COUNT_DECL(invalidate_multi_imb);
645 
646 TLB_COUNT_DECL(invalidate_kern_tbia);
647 TLB_COUNT_DECL(invalidate_kern_tbis);
648 TLB_COUNT_DECL(invalidate_kern_imb);
649 
650 TLB_COUNT_DECL(invalidate_user_not_current);
651 TLB_COUNT_DECL(invalidate_user_lazy_imb);
652 TLB_COUNT_DECL(invalidate_user_tbiap);
653 TLB_COUNT_DECL(invalidate_user_tbis);
654 
655 TLB_COUNT_DECL(shootdown_kernel);
656 TLB_COUNT_DECL(shootdown_user);
657 TLB_COUNT_DECL(shootdown_imb);
658 TLB_COUNT_DECL(shootdown_kimb);
659 TLB_COUNT_DECL(shootdown_overflow);
660 
661 TLB_COUNT_DECL(shootdown_all_user);
662 TLB_COUNT_DECL(shootdown_all_user_imb);
663 
664 TLB_COUNT_DECL(shootdown_pv);
665 TLB_COUNT_DECL(shootdown_pv_multi);
666 
667 TLB_COUNT_DECL(shootnow_over_notify);
668 TLB_COUNT_DECL(shootnow_remote);
669 
670 TLB_COUNT_DECL(reason_remove_kernel);
671 TLB_COUNT_DECL(reason_remove_user);
672 TLB_COUNT_DECL(reason_remove_all_user);
673 TLB_COUNT_DECL(reason_page_protect_read);
674 TLB_COUNT_DECL(reason_page_protect_none);
675 TLB_COUNT_DECL(reason_protect);
676 TLB_COUNT_DECL(reason_enter_kernel);
677 TLB_COUNT_DECL(reason_enter_user);
678 TLB_COUNT_DECL(reason_kenter);
679 TLB_COUNT_DECL(reason_enter_l2pt_delref);
680 TLB_COUNT_DECL(reason_enter_l3pt_delref);
681 TLB_COUNT_DECL(reason_kremove);
682 TLB_COUNT_DECL(reason_clear_modify);
683 TLB_COUNT_DECL(reason_clear_reference);
684 TLB_COUNT_DECL(reason_emulate_reference);
685 
686 TLB_COUNT_DECL(asn_reuse);
687 TLB_COUNT_DECL(asn_newgen);
688 TLB_COUNT_DECL(asn_assign);
689 
690 TLB_COUNT_DECL(activate_both_change);
691 TLB_COUNT_DECL(activate_asn_change);
692 TLB_COUNT_DECL(activate_ptbr_change);
693 TLB_COUNT_DECL(activate_swpctx);
694 TLB_COUNT_DECL(activate_skip_swpctx);
695 
696 #else /* ! TLB_STATS */
697 #define	TLB_COUNT(cnt)		__nothing
698 #define	TLB_COUNT_ATTACH(cnt)	__nothing
699 #endif /* TLB_STATS */
700 
701 static void
702 pmap_tlb_init(void)
703 {
704 	/* mutex is initialized in pmap_bootstrap(). */
705 
706 	evcnt_attach_dynamic_nozero(&tlb_evcnt, EVCNT_TYPE_MISC,
707 	    NULL, "TLB", "shootdown");
708 
709 	TLB_COUNT_ATTACH(invalidate_multi_tbia);
710 	TLB_COUNT_ATTACH(invalidate_multi_tbiap);
711 	TLB_COUNT_ATTACH(invalidate_multi_imb);
712 
713 	TLB_COUNT_ATTACH(invalidate_kern_tbia);
714 	TLB_COUNT_ATTACH(invalidate_kern_tbis);
715 	TLB_COUNT_ATTACH(invalidate_kern_imb);
716 
717 	TLB_COUNT_ATTACH(invalidate_user_not_current);
718 	TLB_COUNT_ATTACH(invalidate_user_lazy_imb);
719 	TLB_COUNT_ATTACH(invalidate_user_tbiap);
720 	TLB_COUNT_ATTACH(invalidate_user_tbis);
721 
722 	TLB_COUNT_ATTACH(shootdown_kernel);
723 	TLB_COUNT_ATTACH(shootdown_user);
724 	TLB_COUNT_ATTACH(shootdown_imb);
725 	TLB_COUNT_ATTACH(shootdown_kimb);
726 	TLB_COUNT_ATTACH(shootdown_overflow);
727 
728 	TLB_COUNT_ATTACH(shootdown_all_user);
729 	TLB_COUNT_ATTACH(shootdown_all_user_imb);
730 
731 	TLB_COUNT_ATTACH(shootdown_pv);
732 	TLB_COUNT_ATTACH(shootdown_pv_multi);
733 
734 	TLB_COUNT_ATTACH(shootnow_over_notify);
735 	TLB_COUNT_ATTACH(shootnow_remote);
736 
737 	TLB_COUNT_ATTACH(reason_remove_kernel);
738 	TLB_COUNT_ATTACH(reason_remove_user);
739 	TLB_COUNT_ATTACH(reason_remove_all_user);
740 	TLB_COUNT_ATTACH(reason_page_protect_read);
741 	TLB_COUNT_ATTACH(reason_page_protect_none);
742 	TLB_COUNT_ATTACH(reason_protect);
743 	TLB_COUNT_ATTACH(reason_enter_kernel);
744 	TLB_COUNT_ATTACH(reason_enter_user);
745 	TLB_COUNT_ATTACH(reason_kenter);
746 	TLB_COUNT_ATTACH(reason_enter_l2pt_delref);
747 	TLB_COUNT_ATTACH(reason_enter_l3pt_delref);
748 	TLB_COUNT_ATTACH(reason_kremove);
749 	TLB_COUNT_ATTACH(reason_clear_modify);
750 	TLB_COUNT_ATTACH(reason_clear_reference);
751 
752 	TLB_COUNT_ATTACH(asn_reuse);
753 	TLB_COUNT_ATTACH(asn_newgen);
754 	TLB_COUNT_ATTACH(asn_assign);
755 
756 	TLB_COUNT_ATTACH(activate_both_change);
757 	TLB_COUNT_ATTACH(activate_asn_change);
758 	TLB_COUNT_ATTACH(activate_ptbr_change);
759 	TLB_COUNT_ATTACH(activate_swpctx);
760 	TLB_COUNT_ATTACH(activate_skip_swpctx);
761 }
762 
763 static inline void
764 pmap_tlb_context_init(struct pmap_tlb_context * const tlbctx, uintptr_t flags)
765 {
766 	/* Initialize the minimum number of fields. */
767 	tlbctx->t_addrdata[0] = 0;
768 	tlbctx->t_addrdata[1] = flags;
769 	tlbctx->t_pmap = NULL;
770 	LIST_INIT(&tlbctx->t_freeptq);
771 	LIST_INIT(&tlbctx->t_freepvq);
772 }
773 
774 static void
775 pmap_tlb_shootdown_internal(pmap_t const pmap, vaddr_t const va,
776     pt_entry_t const pte_bits, struct pmap_tlb_context * const tlbctx)
777 {
778 	KASSERT(pmap != NULL);
779 	KASSERT((va & PAGE_MASK) == 0);
780 
781 	/*
782 	 * Figure out who needs to hear about this, and the scope
783 	 * of an all-entries invalidate.
784 	 */
785 	if (pmap == pmap_kernel()) {
786 		TLB_COUNT(shootdown_kernel);
787 		KASSERT(pte_bits & PG_ASM);
788 		TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_ASM);
789 
790 		/* Note if an I-stream sync is also needed. */
791 		if (pte_bits & PG_EXEC) {
792 			TLB_COUNT(shootdown_kimb);
793 			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_KIMB);
794 		}
795 	} else {
796 		TLB_COUNT(shootdown_user);
797 		KASSERT((pte_bits & PG_ASM) == 0);
798 
799 		/* Note if an I-stream sync is also needed. */
800 		if (pte_bits & PG_EXEC) {
801 			TLB_COUNT(shootdown_imb);
802 			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_IMB);
803 		}
804 	}
805 
806 	KASSERT(tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap);
807 	tlbctx->t_pmap = pmap;
808 
809 	/*
810 	 * If we're already at the max, just tell each active CPU
811 	 * to nail everything.
812 	 */
813 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
814 	if (count > TLB_CTX_MAXVA) {
815 		return;
816 	}
817 	if (count == TLB_CTX_MAXVA) {
818 		TLB_COUNT(shootdown_overflow);
819 		TLB_CTX_SET_ALLVA(tlbctx);
820 		return;
821 	}
822 
823 	TLB_CTX_SETVA(tlbctx, count, va);
824 	TLB_CTX_INC_COUNT(tlbctx);
825 }
826 
827 static void
828 pmap_tlb_shootdown(pmap_t const pmap, vaddr_t const va,
829     pt_entry_t const pte_bits, struct pmap_tlb_context * const tlbctx)
830 {
831 	KASSERT((TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV) == 0);
832 	pmap_tlb_shootdown_internal(pmap, va, pte_bits, tlbctx);
833 }
834 
835 static void
836 pmap_tlb_shootdown_all_user(pmap_t const pmap, pt_entry_t const pte_bits,
837     struct pmap_tlb_context * const tlbctx)
838 {
839 	KASSERT(pmap != pmap_kernel());
840 
841 	TLB_COUNT(shootdown_all_user);
842 
843 	/* Note if an I-stream sync is also needed. */
844 	if (pte_bits & PG_EXEC) {
845 		TLB_COUNT(shootdown_all_user_imb);
846 		TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_IMB);
847 	}
848 
849 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV) {
850 		if (tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap) {
851 			if (tlbctx->t_pmap == NULL) {
852 				pmap_reference(pmap);
853 				tlbctx->t_pmap = pmap;
854 			}
855 		} else {
856 			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_MULTI);
857 		}
858 	} else {
859 		KASSERT(tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap);
860 		tlbctx->t_pmap = pmap;
861 	}
862 
863 	TLB_CTX_SET_ALLVA(tlbctx);
864 }
865 
866 static void
867 pmap_tlb_shootdown_pv(pmap_t const pmap, vaddr_t const va,
868     pt_entry_t const pte_bits, struct pmap_tlb_context * const tlbctx)
869 {
870 
871 	KASSERT(TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV);
872 
873 	TLB_COUNT(shootdown_pv);
874 
875 	if (tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap) {
876 		if (tlbctx->t_pmap == NULL) {
877 			pmap_reference(pmap);
878 			tlbctx->t_pmap = pmap;
879 		}
880 		pmap_tlb_shootdown_internal(pmap, va, pte_bits, tlbctx);
881 	} else {
882 		TLB_COUNT(shootdown_pv_multi);
883 		uintptr_t flags = TLB_CTX_F_MULTI;
884 		if (pmap == pmap_kernel()) {
885 			KASSERT(pte_bits & PG_ASM);
886 			flags |= TLB_CTX_F_ASM;
887 		} else {
888 			KASSERT((pte_bits & PG_ASM) == 0);
889 		}
890 
891 		/*
892 		 * No need to distinguish between kernel and user IMB
893 		 * here; see pmap_tlb_invalidate_multi().
894 		 */
895 		if (pte_bits & PG_EXEC) {
896 			flags |= TLB_CTX_F_IMB;
897 		}
898 		TLB_CTX_SET_ALLVA(tlbctx);
899 		TLB_CTX_SET_FLAG(tlbctx, flags);
900 	}
901 }
902 
903 static void
904 pmap_tlb_invalidate_multi(const struct pmap_tlb_context * const tlbctx)
905 {
906 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) {
907 		TLB_COUNT(invalidate_multi_tbia);
908 		ALPHA_TBIA();
909 	} else {
910 		TLB_COUNT(invalidate_multi_tbiap);
911 		ALPHA_TBIAP();
912 	}
913 	if (TLB_CTX_FLAGS(tlbctx) & (TLB_CTX_F_IMB | TLB_CTX_F_KIMB)) {
914 		TLB_COUNT(invalidate_multi_imb);
915 		alpha_pal_imb();
916 	}
917 }
918 
919 static void
920 pmap_tlb_invalidate_kernel(const struct pmap_tlb_context * const tlbctx)
921 {
922 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
923 
924 	if (count == TLB_CTX_ALLVA) {
925 		TLB_COUNT(invalidate_kern_tbia);
926 		ALPHA_TBIA();
927 	} else {
928 		TLB_COUNT(invalidate_kern_tbis);
929 		for (uintptr_t i = 0; i < count; i++) {
930 			ALPHA_TBIS(TLB_CTX_VA(tlbctx, i));
931 		}
932 	}
933 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_KIMB) {
934 		TLB_COUNT(invalidate_kern_imb);
935 		alpha_pal_imb();
936 	}
937 }
938 
939 static void
940 pmap_tlb_invalidate(const struct pmap_tlb_context * const tlbctx,
941     const struct cpu_info * const ci)
942 {
943 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
944 
945 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_MULTI) {
946 		pmap_tlb_invalidate_multi(tlbctx);
947 		return;
948 	}
949 
950 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) {
951 		pmap_tlb_invalidate_kernel(tlbctx);
952 		return;
953 	}
954 
955 	KASSERT(kpreempt_disabled());
956 
957 	pmap_t const pmap = tlbctx->t_pmap;
958 	KASSERT(pmap != NULL);
959 
960 	if (__predict_false(pmap != ci->ci_pmap)) {
961 		TLB_COUNT(invalidate_user_not_current);
962 
963 		/*
964 		 * For CPUs that don't implement ASNs, the SWPCTX call
965 		 * does all of the TLB invalidation work for us.
966 		 */
967 		if (__predict_false(pmap_max_asn == 0)) {
968 			return;
969 		}
970 
971 		const u_long cpu_mask = 1UL << ci->ci_cpuid;
972 
973 		/*
974 		 * We cannot directly invalidate the TLB in this case,
975 		 * so force allocation of a new ASN when the pmap becomes
976 		 * active again.
977 		 */
978 		pmap->pm_percpu[ci->ci_cpuid].pmc_asngen = PMAP_ASNGEN_INVALID;
979 		atomic_and_ulong(&pmap->pm_cpus, ~cpu_mask);
980 
981 		/*
982 		 * This isn't strictly necessary; when we allocate a
983 		 * new ASN, we're going to clear this bit and skip
984 		 * syncing the I-stream.  But we will keep this bit
985 		 * of accounting for internal consistency.
986 		 */
987 		if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_IMB) {
988 			pmap->pm_percpu[ci->ci_cpuid].pmc_needisync = 1;
989 		}
990 		return;
991 	}
992 
993 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_IMB) {
994 		TLB_COUNT(invalidate_user_lazy_imb);
995 		pmap->pm_percpu[ci->ci_cpuid].pmc_needisync = 1;
996 	}
997 
998 	if (count == TLB_CTX_ALLVA) {
999 		/*
1000 		 * Another option here for CPUs that implement ASNs is
1001 		 * to allocate a new ASN and do a SWPCTX.  That's almost
1002 		 * certainly faster than a TBIAP, but would require us
1003 		 * to synchronize against IPIs in pmap_activate().
1004 		 */
1005 		TLB_COUNT(invalidate_user_tbiap);
1006 		KASSERT((TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) == 0);
1007 		ALPHA_TBIAP();
1008 	} else {
1009 		TLB_COUNT(invalidate_user_tbis);
1010 		for (uintptr_t i = 0; i < count; i++) {
1011 			ALPHA_TBIS(TLB_CTX_VA(tlbctx, i));
1012 		}
1013 	}
1014 }
1015 
1016 static void
1017 pmap_tlb_shootnow(const struct pmap_tlb_context * const tlbctx)
1018 {
1019 
1020 	if (TLB_CTX_COUNT(tlbctx) == 0) {
1021 		/* No work to do. */
1022 		return;
1023 	}
1024 
1025 	/*
1026 	 * Acquire the shootdown mutex.  This will also block IPL_VM
1027 	 * interrupts and disable preemption.  It is critically important
1028 	 * that IPIs not be blocked in this routine.
1029 	 */
1030 	KASSERT(alpha_pal_rdps() < ALPHA_PSL_IPL_CLOCK);
1031 	mutex_spin_enter(&tlb_lock);
1032 	tlb_evcnt.ev_count++;
1033 
1034 	const struct cpu_info *ci = curcpu();
1035 	const u_long this_cpu = 1UL << ci->ci_cpuid;
1036 	u_long active_cpus;
1037 	bool activation_locked, activation_lock_tried;
1038 
1039 	/*
1040 	 * Figure out who to notify.  If it's for the kernel or
1041 	 * multiple address spaces, we notify everybody.  If
1042 	 * it's a single user pmap, then we try to acquire the
1043 	 * activation lock so we can get an accurate accounting
1044 	 * of who needs to be notified.  If we can't acquire
1045 	 * the activation lock, then just notify everyone and
1046 	 * let them sort it out when they process the IPI.
1047 	 */
1048 	if (TLB_CTX_FLAGS(tlbctx) & (TLB_CTX_F_ASM | TLB_CTX_F_MULTI)) {
1049 		active_cpus = pmap_all_cpus();
1050 		activation_locked = false;
1051 		activation_lock_tried = false;
1052 	} else {
1053 		KASSERT(tlbctx->t_pmap != NULL);
1054 		activation_locked = PMAP_ACT_TRYLOCK(tlbctx->t_pmap);
1055 		if (__predict_true(activation_locked)) {
1056 			active_cpus = tlbctx->t_pmap->pm_cpus;
1057 		} else {
1058 			TLB_COUNT(shootnow_over_notify);
1059 			active_cpus = pmap_all_cpus();
1060 		}
1061 		activation_lock_tried = true;
1062 	}
1063 
1064 #if defined(MULTIPROCESSOR)
1065 	/*
1066 	 * If there are remote CPUs that need to do work, get them
1067 	 * started now.
1068 	 */
1069 	const u_long remote_cpus = active_cpus & ~this_cpu;
1070 	KASSERT(tlb_context == NULL);
1071 	if (remote_cpus) {
1072 		TLB_COUNT(shootnow_remote);
1073 		tlb_context = tlbctx;
1074 		tlb_pending = remote_cpus;
1075 		alpha_multicast_ipi(remote_cpus, ALPHA_IPI_SHOOTDOWN);
1076 	}
1077 #endif /* MULTIPROCESSOR */
1078 
1079 	/*
1080 	 * Now that the remotes have been notified, release the
1081 	 * activation lock.
1082 	 */
1083 	if (activation_lock_tried) {
1084 		if (activation_locked) {
1085 			KASSERT(tlbctx->t_pmap != NULL);
1086 			PMAP_ACT_UNLOCK(tlbctx->t_pmap);
1087 		}
1088 		/*
1089 		 * When we tried to acquire the activation lock, we
1090 		 * raised IPL to IPL_SCHED (even if we ultimately
1091 		 * failed to acquire the lock), which blocks out IPIs.
1092 		 * Force our IPL back down to IPL_VM so that we can
1093 		 * receive IPIs.
1094 		 */
1095 		alpha_pal_swpipl(IPL_VM);
1096 	}
1097 
1098 	/*
1099 	 * Do any work that we might need to do.  We don't need to
1100 	 * synchronize with activation here because we know that
1101 	 * for the current CPU, activation status will not change.
1102 	 */
1103 	if (active_cpus & this_cpu) {
1104 		pmap_tlb_invalidate(tlbctx, ci);
1105 	}
1106 
1107 #if defined(MULTIPROCESSOR)
1108 	/* Wait for remote CPUs to finish. */
1109 	if (remote_cpus) {
1110 		int backoff = SPINLOCK_BACKOFF_MIN;
1111 		u_int spins = 0;
1112 
1113 		while (atomic_load_acquire(&tlb_context) != NULL) {
1114 			SPINLOCK_BACKOFF(backoff);
1115 			if (spins++ > 0x0fffffff) {
1116 				printf("TLB LOCAL MASK  = 0x%016lx\n",
1117 				    this_cpu);
1118 				printf("TLB REMOTE MASK = 0x%016lx\n",
1119 				    remote_cpus);
1120 				printf("TLB REMOTE PENDING = 0x%016lx\n",
1121 				    tlb_pending);
1122 				printf("TLB CONTEXT = %p\n", tlb_context);
1123 				printf("TLB LOCAL IPL = %lu\n",
1124 				    alpha_pal_rdps());
1125 				panic("pmap_tlb_shootnow");
1126 			}
1127 		}
1128 	}
1129 	KASSERT(tlb_context == NULL);
1130 #endif /* MULTIPROCESSOR */
1131 
1132 	mutex_spin_exit(&tlb_lock);
1133 
1134 	if (__predict_false(TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV)) {
1135 		/*
1136 		 * P->V TLB operations may operate on multiple pmaps.
1137 		 * The shootdown takes a reference on the first pmap it
1138 		 * encounters, in order to prevent it from disappearing,
1139 		 * in the hope that we end up with a single-pmap P->V
1140 		 * operation (instrumentation shows this is not rare).
1141 		 *
1142 		 * Once this shootdown is finished globally, we need to
1143 		 * release this extra reference.
1144 		 */
1145 		KASSERT(tlbctx->t_pmap != NULL);
1146 		pmap_destroy(tlbctx->t_pmap);
1147 	}
1148 }
1149 
1150 #if defined(MULTIPROCESSOR)
1151 void
1152 pmap_tlb_shootdown_ipi(struct cpu_info * const ci,
1153 
1154     struct trapframe * const tf __unused)
1155 {
1156 	KASSERT(tlb_context != NULL);
1157 	pmap_tlb_invalidate(tlb_context, ci);
1158 	if (atomic_and_ulong_nv(&tlb_pending, ~(1UL << ci->ci_cpuid)) == 0) {
1159 		atomic_store_release(&tlb_context, NULL);
1160 	}
1161 }
1162 #endif /* MULTIPROCESSOR */
1163 
1164 static inline void
1165 pmap_tlb_context_drain(struct pmap_tlb_context * const tlbctx)
1166 {
1167 	if (! LIST_EMPTY(&tlbctx->t_freeptq)) {
1168 		pmap_pagelist_free(&tlbctx->t_freeptq);
1169 	}
1170 	if (! LIST_EMPTY(&tlbctx->t_freepvq)) {
1171 		pmap_pvlist_free(&tlbctx->t_freepvq);
1172 	}
1173 }
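
/*
 * Editor's usage sketch (not part of the original source): the typical
 * shootdown life cycle followed by the removal/protection code later
 * in this file.  "pmap", "va", and "opte" (the old PTE bits) are
 * hypothetical placeholders.
 */
#if 0	/* illustration only */
	struct pmap_tlb_context tlbctx;

	pmap_tlb_context_init(&tlbctx, 0);
	PMAP_LOCK(pmap);
	/* ... invalidate the PTE, remembering its old bits in opte ... */
	pmap_tlb_shootdown(pmap, va, opte, &tlbctx);
	PMAP_UNLOCK(pmap);
	pmap_tlb_shootnow(&tlbctx);
	pmap_tlb_context_drain(&tlbctx);	/* free queued PT pages and PVs */
#endif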
1174 
1175 /*
1176  * ASN management functions.
1177  */
1178 static u_int	pmap_asn_alloc(pmap_t, struct cpu_info *);
1179 
1180 /*
1181  * Misc. functions.
1182  */
1183 static struct vm_page *pmap_physpage_alloc(int);
1184 static void	pmap_physpage_free(paddr_t);
1185 static int	pmap_physpage_addref(void *);
1186 static int	pmap_physpage_delref(void *);
1187 
1188 static bool	vtophys_internal(vaddr_t, paddr_t *p);
1189 
1190 /*
1191  * PMAP_KERNEL_PTE:
1192  *
1193  *	Get a kernel PTE.
1194  *
1195  *	If debugging, do a table walk.  If not debugging, just use
1196  *	the Virtual Page Table, since all kernel page tables are
1197  *	pre-allocated and mapped in.
1198  */
1199 #ifdef DEBUG
1200 #define	PMAP_KERNEL_PTE(va)						\
1201 ({									\
1202 	pt_entry_t *l1pte_, *l2pte_;					\
1203 									\
1204 	l1pte_ = pmap_l1pte(kernel_lev1map, va);			\
1205 	if (pmap_pte_v(l1pte_) == 0) {					\
1206 		printf("kernel level 1 PTE not valid, va 0x%lx "	\
1207 		    "(line %d)\n", (va), __LINE__);			\
1208 		panic("PMAP_KERNEL_PTE");				\
1209 	}								\
1210 	l2pte_ = pmap_l2pte(kernel_lev1map, va, l1pte_);		\
1211 	if (pmap_pte_v(l2pte_) == 0) {					\
1212 		printf("kernel level 2 PTE not valid, va 0x%lx "	\
1213 		    "(line %d)\n", (va), __LINE__);			\
1214 		panic("PMAP_KERNEL_PTE");				\
1215 	}								\
1216 	pmap_l3pte(kernel_lev1map, va, l2pte_);				\
1217 })
1218 #else
1219 #define	PMAP_KERNEL_PTE(va)	(&VPT[VPT_INDEX((va))])
1220 #endif
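
/*
 * Editor's note (not part of the original source): both variants
 * return the same level 3 PTE.  The recursive VPT mapping installed
 * by pmap_bootstrap() (kernel_lev1map maps itself at VPTBASE) is what
 * lets the single VPT[VPT_INDEX(va)] lookup stand in for the explicit
 * three-level walk performed by the DEBUG version.
 */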
1221 
1222 /*
1223  * PMAP_STAT_{INCR,DECR}:
1224  *
1225  *	Increment or decrement a pmap statistic.
1226  */
1227 #define	PMAP_STAT_INCR(s, v)	atomic_add_long((unsigned long *)(&(s)), (v))
1228 #define	PMAP_STAT_DECR(s, v)	atomic_add_long((unsigned long *)(&(s)), -(v))
1229 
1230 /*
1231  * pmap_init_cpu:
1232  *
1233  *	Initialize pmap data in the cpu_info.
1234  */
1235 void
1236 pmap_init_cpu(struct cpu_info * const ci)
1237 {
1238 	pmap_t const pmap = pmap_kernel();
1239 
1240 	/* All CPUs start out using the kernel pmap. */
1241 	atomic_or_ulong(&pmap->pm_cpus, 1UL << ci->ci_cpuid);
1242 	pmap_reference(pmap);
1243 	ci->ci_pmap = pmap;
1244 
1245 	/* Initialize ASN allocation logic. */
1246 	ci->ci_next_asn = PMAP_ASN_FIRST_USER;
1247 	ci->ci_asn_gen = PMAP_ASNGEN_INITIAL;
1248 }
1249 
1250 /*
1251  * pmap_bootstrap:
1252  *
1253  *	Bootstrap the system to run with virtual memory.
1254  *
1255  *	Note: no locking is necessary in this function.
1256  */
1257 void
1258 pmap_bootstrap(paddr_t ptaddr, u_int maxasn, u_long ncpuids)
1259 {
1260 	vsize_t lev2mapsize, lev3mapsize;
1261 	pt_entry_t *lev2map, *lev3map;
1262 	pt_entry_t pte;
1263 	vsize_t bufsz;
1264 	struct pcb *pcb;
1265 	int i;
1266 
1267 #ifdef DEBUG
1268 	if (pmapdebug & (PDB_FOLLOW|PDB_BOOTSTRAP))
1269 		printf("pmap_bootstrap(0x%lx, %u)\n", ptaddr, maxasn);
1270 #endif
1271 
1272 	/*
1273 	 * Compute the number of pages kmem_arena will have.
1274 	 */
1275 	kmeminit_nkmempages();
1276 
1277 	/*
1278 	 * Figure out how many initial PTE's are necessary to map the
1279 	 * kernel.  We also reserve space for kmem_alloc_pageable()
1280 	 * for vm_fork().
1281 	 */
1282 
1283 	/* Get size of buffer cache and set an upper limit */
1284 	bufsz = buf_memcalc();
1285 	buf_setvalimit(bufsz);
1286 
1287 	lev3mapsize =
1288 		(VM_PHYS_SIZE + (ubc_nwins << ubc_winshift) +
1289 		 bufsz + 16 * NCARGS + pager_map_size) / PAGE_SIZE +
1290 		(maxproc * UPAGES) + nkmempages;
1291 
1292 	lev3mapsize = roundup(lev3mapsize, NPTEPG);
1293 
1294 	/*
1295 	 * Initialize `FYI' variables.  Note we're relying on
1296 	 * the fact that BSEARCH sorts the vm_physmem[] array
1297 	 * for us.
1298 	 */
1299 	avail_start = ptoa(uvm_physseg_get_avail_start(uvm_physseg_get_first()));
1300 	avail_end = ptoa(uvm_physseg_get_avail_end(uvm_physseg_get_last()));
1301 	virtual_end = VM_MIN_KERNEL_ADDRESS + lev3mapsize * PAGE_SIZE;
1302 
1303 #if 0
1304 	printf("avail_start = 0x%lx\n", avail_start);
1305 	printf("avail_end = 0x%lx\n", avail_end);
1306 	printf("virtual_end = 0x%lx\n", virtual_end);
1307 #endif
1308 
1309 	/*
1310 	 * Allocate a level 1 PTE table for the kernel.
1311 	 * This is always one page long.
1312 	 * IF THIS IS NOT A MULTIPLE OF PAGE_SIZE, ALL WILL GO TO HELL.
1313 	 */
1314 	kernel_lev1map = (pt_entry_t *)
1315 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * NPTEPG);
1316 
1317 	/*
1318 	 * Allocate a level 2 PTE table for the kernel.
1319 	 * These must map all of the level3 PTEs.
1320 	 * IF THIS IS NOT A MULTIPLE OF PAGE_SIZE, ALL WILL GO TO HELL.
1321 	 */
1322 	lev2mapsize = roundup(howmany(lev3mapsize, NPTEPG), NPTEPG);
1323 	lev2map = (pt_entry_t *)
1324 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * lev2mapsize);
1325 
1326 	/*
1327 	 * Allocate a level 3 PTE table for the kernel.
1328 	 * Contains lev3mapsize PTEs.
1329 	 */
1330 	lev3map = (pt_entry_t *)
1331 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * lev3mapsize);
1332 
1333 	/*
1334 	 * Set up level 1 page table
1335 	 */
1336 
1337 	/* Map all of the level 2 pte pages */
1338 	for (i = 0; i < howmany(lev2mapsize, NPTEPG); i++) {
1339 		pte = (ALPHA_K0SEG_TO_PHYS(((vaddr_t)lev2map) +
1340 		    (i*PAGE_SIZE)) >> PGSHIFT) << PG_SHIFT;
1341 		pte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
1342 		kernel_lev1map[l1pte_index(VM_MIN_KERNEL_ADDRESS +
1343 		    (i*PAGE_SIZE*NPTEPG*NPTEPG))] = pte;
1344 	}
1345 
1346 	/* Map the virtual page table */
1347 	pte = (ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT)
1348 	    << PG_SHIFT;
1349 	pte |= PG_V | PG_KRE | PG_KWE; /* NOTE NO ASM */
1350 	kernel_lev1map[l1pte_index(VPTBASE)] = pte;
1351 	VPT = (pt_entry_t *)VPTBASE;
1352 
1353 	/*
1354 	 * Set up level 2 page table.
1355 	 */
1356 	/* Map all of the level 3 pte pages */
1357 	for (i = 0; i < howmany(lev3mapsize, NPTEPG); i++) {
1358 		pte = (ALPHA_K0SEG_TO_PHYS(((vaddr_t)lev3map) +
1359 		    (i*PAGE_SIZE)) >> PGSHIFT) << PG_SHIFT;
1360 		pte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
1361 		lev2map[l2pte_index(VM_MIN_KERNEL_ADDRESS+
1362 		    (i*PAGE_SIZE*NPTEPG))] = pte;
1363 	}
1364 
1365 	/* Initialize the pmap_growkernel_lock. */
1366 	rw_init(&pmap_growkernel_lock);
1367 
1368 	/*
1369 	 * Set up level three page table (lev3map)
1370 	 */
1371 	/* Nothing to do; it's already zero'd */
1372 
1373 	/*
1374 	 * Initialize the pmap pools and list.
1375 	 */
1376 	pmap_ncpuids = ncpuids;
1377 	pool_cache_bootstrap(&pmap_pmap_cache, PMAP_SIZEOF(pmap_ncpuids),
1378 	    COHERENCY_UNIT, 0, 0, "pmap", NULL, IPL_NONE, NULL, NULL, NULL);
1379 	pool_cache_bootstrap(&pmap_l1pt_cache, PAGE_SIZE, 0, 0, 0, "pmapl1pt",
1380 	    &pmap_l1pt_allocator, IPL_NONE, pmap_l1pt_ctor, NULL, NULL);
1381 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1382 	    PR_LARGECACHE, "pmappv", &pmap_pv_page_allocator, IPL_NONE, NULL,
1383 	    NULL, NULL);
1384 
1385 	TAILQ_INIT(&pmap_all_pmaps);
1386 
1387 	/* Initialize the ASN logic.  See also pmap_init_cpu(). */
1388 	pmap_max_asn = maxasn;
1389 
1390 	/*
1391 	 * Initialize the locks.
1392 	 */
1393 	rw_init(&pmap_main_lock);
1394 	mutex_init(&pmap_all_pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1395 	for (i = 0; i < __arraycount(pmap_pvh_locks); i++) {
1396 		mutex_init(&pmap_pvh_locks[i].lock, MUTEX_DEFAULT, IPL_NONE);
1397 	}
1398 	for (i = 0; i < __arraycount(pmap_pvh_locks); i++) {
1399 		mutex_init(&pmap_pmap_locks[i].locks.lock,
1400 		    MUTEX_DEFAULT, IPL_NONE);
1401 		mutex_init(&pmap_pmap_locks[i].locks.activation_lock,
1402 		    MUTEX_SPIN, IPL_SCHED);
1403 	}
1404 
1405 	/*
1406 	 * This must block any interrupt from which a TLB shootdown
1407 	 * could be issued, but must NOT block IPIs.
1408 	 */
1409 	mutex_init(&tlb_lock, MUTEX_SPIN, IPL_VM);
1410 
1411 	/*
1412 	 * Initialize kernel pmap.  Note that all kernel mappings
1413 	 * have PG_ASM set, so the ASN doesn't really matter for
1414 	 * the kernel pmap.  Also, since the kernel pmap always
1415 	 * references kernel_lev1map, it always has an invalid ASN
1416 	 * generation.
1417 	 */
1418 	memset(pmap_kernel(), 0, sizeof(struct pmap));
1419 	LIST_INIT(&pmap_kernel()->pm_ptpages);
1420 	LIST_INIT(&pmap_kernel()->pm_pvents);
1421 	atomic_store_relaxed(&pmap_kernel()->pm_count, 1);
1422 	/* Kernel pmap does not have per-CPU info. */
1423 	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap_kernel(), pm_list);
1424 
1425 	/*
1426 	 * Set up lwp0's PCB such that the ptbr points to the right place
1427 	 * and has the kernel pmap's (really unused) ASN.
1428 	 */
1429 	pcb = lwp_getpcb(&lwp0);
1430 	pcb->pcb_hw.apcb_ptbr =
1431 	    ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT;
1432 	pcb->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
1433 
1434 	struct cpu_info * const ci = curcpu();
1435 	pmap_init_cpu(ci);
1436 }
1437 
1438 /*
1439  * pmap_virtual_space:		[ INTERFACE ]
1440  *
1441  *	Define the initial bounds of the kernel virtual address space.
1442  */
1443 void
1444 pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
1445 {
1446 
1447 	*vstartp = VM_MIN_KERNEL_ADDRESS;	/* kernel is in K0SEG */
1448 	*vendp = VM_MAX_KERNEL_ADDRESS;		/* we use pmap_growkernel */
1449 }
1450 
1451 /*
1452  * pmap_steal_memory:		[ INTERFACE ]
1453  *
1454  *	Bootstrap memory allocator (alternative to vm_bootstrap_steal_memory()).
1455  *	This function allows for early dynamic memory allocation until the
1456  *	virtual memory system has been bootstrapped.  After that point, either
1457  *	kmem_alloc or malloc should be used.  This function works by stealing
1458  *	pages from the (to be) managed page pool, then implicitly mapping the
1459  *	pages (by using their k0seg addresses) and zeroing them.
1460  *
1461  *	It may be used once the physical memory segments have been pre-loaded
1462  *	into the vm_physmem[] array.  Early memory allocation MUST use this
1463  *	interface!  This cannot be used after vm_page_startup(), and will
1464  *	generate a panic if tried.
1465  *
1466  *	Note that this memory will never be freed, and in essence it is wired
1467  *	down.
1468  *
1469  *	We must adjust *vstartp and/or *vendp iff we use address space
1470  *	from the kernel virtual address range defined by pmap_virtual_space().
1471  *
1472  *	Note: no locking is necessary in this function.
1473  */
1474 vaddr_t
1475 pmap_steal_memory(vsize_t size, vaddr_t *vstartp, vaddr_t *vendp)
1476 {
1477 	int npgs;
1478 	vaddr_t va;
1479 	paddr_t pa;
1480 
1481 	uvm_physseg_t bank;
1482 
1483 	size = round_page(size);
1484 	npgs = atop(size);
1485 
1486 #if 0
1487 	printf("PSM: size 0x%lx (npgs 0x%x)\n", size, npgs);
1488 #endif
1489 
1490 	for (bank = uvm_physseg_get_first();
1491 	     uvm_physseg_valid_p(bank);
1492 	     bank = uvm_physseg_get_next(bank)) {
1493 		if (uvm.page_init_done == true)
1494 			panic("pmap_steal_memory: called _after_ bootstrap");
1495 
1496 #if 0
1497 		printf("     bank %d: avail_start 0x%"PRIxPADDR", start 0x%"PRIxPADDR", "
1498 		    "avail_end 0x%"PRIxPADDR"\n", bank, uvm_physseg_get_avail_start(bank),
1499 		    uvm_physseg_get_start(bank), uvm_physseg_get_avail_end(bank));
1500 #endif
1501 
1502 		if (uvm_physseg_get_avail_start(bank) != uvm_physseg_get_start(bank) ||
1503 		    uvm_physseg_get_avail_start(bank) >= uvm_physseg_get_avail_end(bank))
1504 			continue;
1505 
1506 #if 0
1507 		printf("             avail_end - avail_start = 0x%"PRIxPADDR"\n",
1508 		    uvm_physseg_get_avail_end(bank) - uvm_physseg_get_avail_start(bank));
1509 #endif
1510 
1511 		if (uvm_physseg_get_avail_end(bank) - uvm_physseg_get_avail_start(bank)
1512 		    < npgs)
1513 			continue;
1514 
1515 		/*
1516 		 * There are enough pages here; steal them!
1517 		 */
1518 		pa = ptoa(uvm_physseg_get_start(bank));
1519 		uvm_physseg_unplug(atop(pa), npgs);
1520 
1521 		va = ALPHA_PHYS_TO_K0SEG(pa);
1522 		memset((void *)va, 0, size);
1523 		pmap_pages_stolen += npgs;
1524 		return (va);
1525 	}
1526 
1527 	/*
1528 	 * If we got here, there was no memory left.
1529 	 */
1530 	panic("pmap_steal_memory: no memory to steal");
1531 }
1532 
1533 /*
1534  * pmap_init:			[ INTERFACE ]
1535  *
1536  *	Initialize the pmap module.  Called by vm_init(), to initialize any
1537  *	structures that the pmap system needs to map virtual memory.
1538  *
1539  *	Note: no locking is necessary in this function.
1540  */
1541 void
1542 pmap_init(void)
1543 {
1544 
1545 #ifdef DEBUG
1546 	if (pmapdebug & PDB_FOLLOW)
1547 	        printf("pmap_init()\n");
1548 #endif
1549 
1550 	/* initialize protection array */
1551 	alpha_protection_init();
1552 
1553 	/* Initialize TLB handling. */
1554 	pmap_tlb_init();
1555 
1556 	/* Instrument pmap_growkernel(). */
1557 	evcnt_attach_dynamic_nozero(&pmap_growkernel_evcnt, EVCNT_TYPE_MISC,
1558 	    NULL, "pmap", "growkernel");
1559 
1560 	/*
1561 	 * Set a low water mark on the pv_entry pool, so that we are
1562 	 * more likely to have these around even in extreme memory
1563 	 * starvation.
1564 	 */
1565 	pool_cache_setlowat(&pmap_pv_cache, pmap_pv_lowat);
1566 
1567 	/*
1568 	 * Now it is safe to enable pv entry recording.
1569 	 */
1570 	pmap_initialized = true;
1571 
1572 #if 0
1573 	for (uvm_physseg_t bank = uvm_physseg_get_first();
1574 	    uvm_physseg_valid_p(bank);
1575 	    bank = uvm_physseg_get_next(bank)) {
1576 		printf("bank %d\n", bank);
1577 		printf("\tstart = 0x%lx\n", ptoa(uvm_physseg_get_start(bank)));
1578 		printf("\tend = 0x%lx\n", ptoa(uvm_physseg_get_end(bank)));
1579 		printf("\tavail_start = 0x%lx\n",
1580 		    ptoa(uvm_physseg_get_avail_start(bank)));
1581 		printf("\tavail_end = 0x%lx\n",
1582 		    ptoa(uvm_physseg_get_avail_end(bank)));
1583 	}
1584 #endif
1585 }
1586 
1587 /*
1588  * pmap_create:			[ INTERFACE ]
1589  *
1590  *	Create and return a physical map.
1591  *
1592  *	Note: no locking is necessary in this function.
1593  */
1594 pmap_t
1595 pmap_create(void)
1596 {
1597 	pmap_t pmap;
1598 	pt_entry_t *lev1map;
1599 	int i;
1600 
1601 #ifdef DEBUG
1602 	if (pmapdebug & (PDB_FOLLOW|PDB_CREATE))
1603 		printf("pmap_create()\n");
1604 #endif
1605 
1606 	pmap = pool_cache_get(&pmap_pmap_cache, PR_WAITOK);
1607 	memset(pmap, 0, sizeof(*pmap));
1608 	LIST_INIT(&pmap->pm_ptpages);
1609 	LIST_INIT(&pmap->pm_pvents);
1610 
1611 	atomic_store_relaxed(&pmap->pm_count, 1);
1612 
1613  try_again:
1614 	rw_enter(&pmap_growkernel_lock, RW_READER);
1615 
1616 	lev1map = pool_cache_get(&pmap_l1pt_cache, PR_NOWAIT);
1617 	if (__predict_false(lev1map == NULL)) {
1618 		rw_exit(&pmap_growkernel_lock);
1619 		(void) kpause("pmap_create", false, hz >> 2, NULL);
1620 		goto try_again;
1621 	}
1622 
1623 	/*
1624 	 * There are only kernel mappings at this point; give the pmap
1625 	 * the kernel ASN.  This will be initialized to correct values
1626 	 * when the pmap is activated.
1627 	 *
1628 	 * We stash a pointer to the pmap's lev1map in each CPU's
1629 	 * private data.  It remains constant for the life of the
1630 	 * pmap, and gives us more room in the shared pmap structure.
1631 	 */
1632 	for (i = 0; i < pmap_ncpuids; i++) {
1633 		pmap->pm_percpu[i].pmc_asn = PMAP_ASN_KERNEL;
1634 		pmap->pm_percpu[i].pmc_asngen = PMAP_ASNGEN_INVALID;
1635 		pmap->pm_percpu[i].pmc_lev1map = lev1map;
1636 	}
1637 
1638 	mutex_enter(&pmap_all_pmaps_lock);
1639 	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap, pm_list);
1640 	mutex_exit(&pmap_all_pmaps_lock);
1641 
1642 	rw_exit(&pmap_growkernel_lock);
1643 
1644 	return (pmap);
1645 }
1646 
1647 /*
1648  * pmap_destroy:		[ INTERFACE ]
1649  *
1650  *	Drop the reference count on the specified pmap, releasing
1651  *	all resources if the reference count drops to zero.
1652  */
1653 void
1654 pmap_destroy(pmap_t pmap)
1655 {
1656 
1657 #ifdef DEBUG
1658 	if (pmapdebug & PDB_FOLLOW)
1659 		printf("pmap_destroy(%p)\n", pmap);
1660 #endif
1661 
1662 	PMAP_MP(membar_release());
1663 	KASSERT(atomic_load_relaxed(&pmap->pm_count) > 0);
1664 	if (atomic_dec_uint_nv(&pmap->pm_count) > 0)
1665 		return;
1666 	PMAP_MP(membar_acquire());
1667 
1668 	pt_entry_t *lev1map = pmap_lev1map(pmap);
1669 
1670 	rw_enter(&pmap_growkernel_lock, RW_READER);
1671 
1672 	/*
1673 	 * Remove it from the global list of all pmaps.
1674 	 */
1675 	mutex_enter(&pmap_all_pmaps_lock);
1676 	TAILQ_REMOVE(&pmap_all_pmaps, pmap, pm_list);
1677 	mutex_exit(&pmap_all_pmaps_lock);
1678 
1679 	pool_cache_put(&pmap_l1pt_cache, lev1map);
1680 #ifdef DIAGNOSTIC
1681 	int i;
1682 	for (i = 0; i < pmap_ncpuids; i++) {
1683 		pmap->pm_percpu[i].pmc_lev1map = (pt_entry_t *)0xdeadbeefUL;
1684 	}
1685 #endif /* DIAGNOSTIC */
1686 
1687 	rw_exit(&pmap_growkernel_lock);
1688 
1689 	pool_cache_put(&pmap_pmap_cache, pmap);
1690 }
1691 
1692 /*
1693  * pmap_reference:		[ INTERFACE ]
1694  *
1695  *	Add a reference to the specified pmap.
1696  */
1697 void
1698 pmap_reference(pmap_t pmap)
1699 {
1700 	unsigned int newcount __diagused;
1701 
1702 #ifdef DEBUG
1703 	if (pmapdebug & PDB_FOLLOW)
1704 		printf("pmap_reference(%p)\n", pmap);
1705 #endif
1706 
1707 	newcount = atomic_inc_uint_nv(&pmap->pm_count);
1708 	KASSERT(newcount != 0);
1709 }
1710 
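/*
 * Illustrative sketch (not part of the original source): the reference-
 * counting lifecycle implemented by pmap_create(), pmap_reference() and
 * pmap_destroy().  pmap_create() returns a map with pm_count == 1; each
 * pmap_reference() must be balanced by a pmap_destroy(), and resources
 * are released only when the count drops to zero.
 */
#if 0	/* example usage sketch -- not compiled */
	pmap_t pm = pmap_create();	/* pm_count == 1 */
	pmap_reference(pm);		/* pm_count == 2 (second holder) */
	pmap_destroy(pm);		/* pm_count == 1; nothing freed yet */
	pmap_destroy(pm);		/* pm_count == 0; lev1map + pmap freed */
#endif
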
1711 /*
1712  * pmap_remove:			[ INTERFACE ]
1713  *
1714  *	Remove the given range of addresses from the specified map.
1715  *
1716  *	It is assumed that the start and end are properly
1717  *	rounded to the page size.
1718  */
1719 static void
1720 pmap_remove_internal(pmap_t pmap, vaddr_t sva, vaddr_t eva,
1721     struct pmap_tlb_context * const tlbctx)
1722 {
1723 	pt_entry_t *l1pte, *l2pte, *l3pte;
1724 	pt_entry_t *saved_l2pte, *saved_l3pte;
1725 	vaddr_t l1eva, l2eva, l3vptva;
1726 	pt_entry_t pte_bits;
1727 
1728 #ifdef DEBUG
1729 	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
1730 		printf("pmap_remove(%p, %lx, %lx)\n", pmap, sva, eva);
1731 #endif
1732 
1733 	/*
1734 	 * If this is the kernel pmap, we can use a faster method
1735 	 * for accessing the PTEs (since the PT pages are always
1736 	 * resident).
1737 	 *
1738 	 * Note that this routine should NEVER be called from an
1739 	 * interrupt context; pmap_kremove() is used for that.
1740 	 */
1741 	if (pmap == pmap_kernel()) {
1742 		PMAP_MAP_TO_HEAD_LOCK();
1743 		PMAP_LOCK(pmap);
1744 
1745 		while (sva < eva) {
1746 			l3pte = PMAP_KERNEL_PTE(sva);
1747 			if (pmap_pte_v(l3pte)) {
1748 				pte_bits = pmap_remove_mapping(pmap, sva,
1749 				    l3pte, true, NULL, tlbctx);
1750 				pmap_tlb_shootdown(pmap, sva, pte_bits,
1751 				    tlbctx);
1752 			}
1753 			sva += PAGE_SIZE;
1754 		}
1755 
1756 		PMAP_MAP_TO_HEAD_UNLOCK();
1757 		PMAP_UNLOCK(pmap);
1758 		pmap_tlb_shootnow(tlbctx);
1759 		/* kernel PT pages are never freed. */
1760 		KASSERT(LIST_EMPTY(&tlbctx->t_freeptq));
1761 		/* ...but we might have freed PV entries. */
1762 		pmap_tlb_context_drain(tlbctx);
1763 		TLB_COUNT(reason_remove_kernel);
1764 
1765 		return;
1766 	}
1767 
1768 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
1769 
1770 	KASSERT(sva < VM_MAXUSER_ADDRESS);
1771 	KASSERT(eva <= VM_MAXUSER_ADDRESS);
1772 	KASSERT(lev1map != kernel_lev1map);
1773 
1774 	PMAP_MAP_TO_HEAD_LOCK();
1775 	PMAP_LOCK(pmap);
1776 
1777 	l1pte = pmap_l1pte(lev1map, sva);
1778 
1779 	for (; sva < eva; sva = l1eva, l1pte++) {
1780 		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
1781 		if (pmap_pte_v(l1pte)) {
1782 			saved_l2pte = l2pte = pmap_l2pte(lev1map, sva, l1pte);
1783 
1784 			/*
1785 			 * Add a reference to the L2 table so it won't
1786 			 * get removed from under us.
1787 			 */
1788 			pmap_physpage_addref(saved_l2pte);
1789 
1790 			for (; sva < l1eva && sva < eva; sva = l2eva, l2pte++) {
1791 				l2eva =
1792 				    alpha_trunc_l2seg(sva) + ALPHA_L2SEG_SIZE;
1793 				if (pmap_pte_v(l2pte)) {
1794 					saved_l3pte = l3pte =
1795 					    pmap_l3pte(lev1map, sva, l2pte);
1796 
1797 					/*
1798 					 * Add a reference to the L3 table so
1799 					 * it won't get removed from under us.
1800 					 */
1801 					pmap_physpage_addref(saved_l3pte);
1802 
1803 					/*
1804 					 * Remember this sva; if the L3 table
1805 					 * gets removed, we need to invalidate
1806 					 * the VPT TLB entry for it.
1807 					 */
1808 					l3vptva = sva;
1809 
1810 					for (; sva < l2eva && sva < eva;
1811 					     sva += PAGE_SIZE, l3pte++) {
1812 						if (!pmap_pte_v(l3pte)) {
1813 							continue;
1814 						}
1815 						pte_bits =
1816 						    pmap_remove_mapping(
1817 							pmap, sva,
1818 							l3pte, true,
1819 							NULL, tlbctx);
1820 						pmap_tlb_shootdown(pmap,
1821 						    sva, pte_bits, tlbctx);
1822 					}
1823 
1824 					/*
1825 					 * Remove the reference to the L3
1826 					 * table that we added above.  This
1827 					 * may free the L3 table.
1828 					 */
1829 					pmap_l3pt_delref(pmap, l3vptva,
1830 					    saved_l3pte, tlbctx);
1831 				}
1832 			}
1833 
1834 			/*
1835 			 * Remove the reference to the L2 table that we
1836 			 * added above.  This may free the L2 table.
1837 			 */
1838 			pmap_l2pt_delref(pmap, l1pte, saved_l2pte, tlbctx);
1839 		}
1840 	}
1841 
1842 	PMAP_MAP_TO_HEAD_UNLOCK();
1843 	PMAP_UNLOCK(pmap);
1844 	pmap_tlb_shootnow(tlbctx);
1845 	pmap_tlb_context_drain(tlbctx);
1846 	TLB_COUNT(reason_remove_user);
1847 }
1848 
1849 void
1850 pmap_remove(pmap_t pmap, vaddr_t sva, vaddr_t eva)
1851 {
1852 	struct pmap_tlb_context tlbctx;
1853 
1854 	pmap_tlb_context_init(&tlbctx, 0);
1855 	pmap_remove_internal(pmap, sva, eva, &tlbctx);
1856 }
1857 
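/*
 * Illustrative sketch (not part of the original source): the deferred
 * TLB shootdown pattern used throughout this file, in the shape that
 * pmap_remove() itself follows.  A pmap_tlb_context batches the
 * invalidations and any PT pages / PV entries to free; pmap_tlb_shootnow()
 * issues them, and pmap_tlb_context_drain() releases the queued resources
 * once no CPU can still be using them.
 */
#if 0	/* pattern sketch -- not compiled */
	struct pmap_tlb_context tlbctx;

	pmap_tlb_context_init(&tlbctx, 0);
	/* ... invalidate PTEs, queueing via pmap_tlb_shootdown() ... */
	pmap_tlb_shootnow(&tlbctx);		/* perform batched shootdowns */
	pmap_tlb_context_drain(&tlbctx);	/* free deferred PT/PV resources */
#endif
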
1858 /*
1859  * pmap_remove_all:		[ INTERFACE ]
1860  *
1861  *	Remove all mappings from a pmap in bulk.  This is only called
1862  *	when it's known that the address space is no longer visible to
1863  *	any user process (e.g. during exit or exec).
1864  */
1865 bool
1866 pmap_remove_all(pmap_t pmap)
1867 {
1868 	struct pmap_tlb_context tlbctx;
1869 	struct vm_page *pg;
1870 	pv_entry_t pv;
1871 
1872 	KASSERT(pmap != pmap_kernel());
1873 
1874 	/*
1875 	 * This process is pretty simple:
1876 	 *
1877 	 * ==> (1) Zero out the user-space portion of the lev1map.
1878 	 *
1879 	 * ==> (2) Copy the PT page list to the tlbctx and re-init.
1880 	 *
1881 	 * ==> (3) Walk the PV entry list and remove each entry.
1882 	 *
1883 	 * ==> (4) Zero the wired and resident count.
1884 	 *
1885 	 * Once we've done that, we just need to free everything
1886 	 * back to the system.
1887 	 */
1888 
1889 	pmap_tlb_context_init(&tlbctx, 0);
1890 
1891 	PMAP_MAP_TO_HEAD_LOCK();
1892 	PMAP_LOCK(pmap);
1893 
1894 	/* Step 1 */
1895 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
1896 	memset(lev1map, 0,
1897 	       l1pte_index(VM_MAXUSER_ADDRESS) * sizeof(pt_entry_t));
1898 
1899 	/* Step 2 */
1900 	LIST_MOVE(&pmap->pm_ptpages, &tlbctx.t_freeptq, pageq.list);
1901 
1902 	/* Fix up the reference count on the lev1map page. */
1903 	pg = PHYS_TO_VM_PAGE(ALPHA_K0SEG_TO_PHYS((vaddr_t)lev1map));
1904 	PHYSPAGE_REFCNT_SET(pg, 0);
1905 
1906 	/* Step 3 */
1907 	while ((pv = LIST_FIRST(&pmap->pm_pvents)) != NULL) {
1908 		KASSERT(pv->pv_pmap == pmap);
1909 		pmap_pv_remove(pmap, PHYS_TO_VM_PAGE(pmap_pte_pa(pv->pv_pte)),
1910 		    pv->pv_va, true, NULL, &tlbctx);
1911 	}
1912 
1913 	/* Step 4 */
1914 	atomic_store_relaxed(&pmap->pm_stats.wired_count, 0);
1915 	atomic_store_relaxed(&pmap->pm_stats.resident_count, 0);
1916 
1917 	pmap_tlb_shootdown_all_user(pmap, PG_EXEC, &tlbctx);
1918 
1919 	PMAP_UNLOCK(pmap);
1920 	PMAP_MAP_TO_HEAD_UNLOCK();
1921 
1922 	pmap_tlb_shootnow(&tlbctx);
1923 	pmap_tlb_context_drain(&tlbctx);
1924 	TLB_COUNT(reason_remove_all_user);
1925 
1926 	return true;
1927 }
1928 
1929 /*
1930  * pmap_page_protect:		[ INTERFACE ]
1931  *
1932  *	Lower the permission for all mappings to a given page to
1933  *	the permissions specified.
1934  */
1935 void
1936 pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
1937 {
1938 	pv_entry_t pv, nextpv;
1939 	pt_entry_t opte;
1940 	kmutex_t *lock;
1941 	struct pmap_tlb_context tlbctx;
1942 
1943 #ifdef DEBUG
1944 	if ((pmapdebug & (PDB_FOLLOW|PDB_PROTECT)) ||
1945 	    (prot == VM_PROT_NONE && (pmapdebug & PDB_REMOVE)))
1946 		printf("pmap_page_protect(%p, %x)\n", pg, prot);
1947 #endif
1948 
1949 	pmap_tlb_context_init(&tlbctx, TLB_CTX_F_PV);
1950 
1951 	switch (prot) {
1952 	case VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE:
1953 	case VM_PROT_READ|VM_PROT_WRITE:
1954 		return;
1955 
1956 	/* copy_on_write */
1957 	case VM_PROT_READ|VM_PROT_EXECUTE:
1958 	case VM_PROT_READ:
1959 		PMAP_HEAD_TO_MAP_LOCK();
1960 		lock = pmap_pvh_lock(pg);
1961 		mutex_enter(lock);
1962 		for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
1963 			PMAP_LOCK(pv->pv_pmap);
1964 			opte = atomic_load_relaxed(pv->pv_pte);
1965 			if (opte & (PG_KWE | PG_UWE)) {
1966 				atomic_store_relaxed(pv->pv_pte,
1967 				    opte & ~(PG_KWE | PG_UWE));
1968 				pmap_tlb_shootdown_pv(pv->pv_pmap, pv->pv_va,
1969 				    opte, &tlbctx);
1970 			}
1971 			PMAP_UNLOCK(pv->pv_pmap);
1972 		}
1973 		mutex_exit(lock);
1974 		PMAP_HEAD_TO_MAP_UNLOCK();
1975 		pmap_tlb_shootnow(&tlbctx);
1976 		TLB_COUNT(reason_page_protect_read);
1977 		return;
1978 
1979 	/* remove_all */
1980 	default:
1981 		break;
1982 	}
1983 
1984 	PMAP_HEAD_TO_MAP_LOCK();
1985 	lock = pmap_pvh_lock(pg);
1986 	mutex_enter(lock);
1987 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = nextpv) {
1988 		pt_entry_t pte_bits;
1989 		pmap_t pmap;
1990 		vaddr_t va;
1991 
1992 		nextpv = pv->pv_next;
1993 
1994 		PMAP_LOCK(pv->pv_pmap);
1995 		pmap = pv->pv_pmap;
1996 		va = pv->pv_va;
1997 		pte_bits = pmap_remove_mapping(pmap, va, pv->pv_pte,
1998 		    false, NULL, &tlbctx);
1999 		pmap_tlb_shootdown_pv(pmap, va, pte_bits, &tlbctx);
2000 		PMAP_UNLOCK(pv->pv_pmap);
2001 	}
2002 	mutex_exit(lock);
2003 	PMAP_HEAD_TO_MAP_UNLOCK();
2004 	pmap_tlb_shootnow(&tlbctx);
2005 	pmap_tlb_context_drain(&tlbctx);
2006 	TLB_COUNT(reason_page_protect_none);
2007 }
2008 
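/*
 * Illustrative sketch (not part of the original source): the two
 * interesting requests a caller can make of pmap_page_protect(),
 * where "pg" is a hypothetical managed page.  Read-only protections
 * clear the write-enable bits on every mapping of the page; VM_PROT_NONE
 * removes every mapping outright.
 */
#if 0	/* example usage sketch -- not compiled */
	pmap_page_protect(pg, VM_PROT_READ);	/* write-protect all mappings */
	pmap_page_protect(pg, VM_PROT_NONE);	/* remove all mappings of pg */
#endif
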
2009 /*
2010  * pmap_protect:		[ INTERFACE ]
2011  *
2012  *	Set the physical protection on the specified range of this map
2013  *	as requested.
2014  */
2015 void
2016 pmap_protect(pmap_t pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
2017 {
2018 	pt_entry_t *l1pte, *l2pte, *l3pte, opte;
2019 	vaddr_t l1eva, l2eva;
2020 	struct pmap_tlb_context tlbctx;
2021 
2022 #ifdef DEBUG
2023 	if (pmapdebug & (PDB_FOLLOW|PDB_PROTECT))
2024 		printf("pmap_protect(%p, %lx, %lx, %x)\n",
2025 		    pmap, sva, eva, prot);
2026 #endif
2027 
2028 	pmap_tlb_context_init(&tlbctx, 0);
2029 
2030 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2031 		pmap_remove_internal(pmap, sva, eva, &tlbctx);
2032 		return;
2033 	}
2034 
2035 	const pt_entry_t bits = pte_prot(pmap, prot);
2036 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
2037 
2038 	PMAP_LOCK(pmap);
2039 
2040 	l1pte = pmap_l1pte(lev1map, sva);
2041 	for (; sva < eva; sva = l1eva, l1pte++) {
2042 		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
2043 		if (pmap_pte_v(l1pte)) {
2044 			l2pte = pmap_l2pte(lev1map, sva, l1pte);
2045 			for (; sva < l1eva && sva < eva; sva = l2eva, l2pte++) {
2046 				l2eva =
2047 				    alpha_trunc_l2seg(sva) + ALPHA_L2SEG_SIZE;
2048 				if (pmap_pte_v(l2pte)) {
2049 					l3pte = pmap_l3pte(lev1map, sva, l2pte);
2050 					for (; sva < l2eva && sva < eva;
2051 					     sva += PAGE_SIZE, l3pte++) {
2052 						if (pmap_pte_v(l3pte) &&
2053 						    pmap_pte_prot_chg(l3pte,
2054 								      bits)) {
2055 							opte = atomic_load_relaxed(l3pte);
2056 							pmap_pte_set_prot(l3pte,
2057 							   bits);
2058 							pmap_tlb_shootdown(pmap,
2059 							    sva, opte, &tlbctx);
2060 						}
2061 					}
2062 				}
2063 			}
2064 		}
2065 	}
2066 
2067 	PMAP_UNLOCK(pmap);
2068 	pmap_tlb_shootnow(&tlbctx);
2069 	TLB_COUNT(reason_protect);
2070 }
2071 
2072 /*
2073  * pmap_enter_tlb_shootdown:
2074  *
2075  *	Carry out a TLB shootdown on behalf of a pmap_enter()
2076  *	or a pmap_kenter_pa().  This is factored out separately
2077  *	because we expect it to be not a common case.
2078  */
2079 static void __noinline
2080 pmap_enter_tlb_shootdown(pmap_t const pmap, vaddr_t const va,
2081     pt_entry_t const pte_bits, bool locked)
2082 {
2083 	struct pmap_tlb_context tlbctx;
2084 
2085 	pmap_tlb_context_init(&tlbctx, 0);
2086 	pmap_tlb_shootdown(pmap, va, pte_bits, &tlbctx);
2087 	if (locked) {
2088 		PMAP_UNLOCK(pmap);
2089 	}
2090 	pmap_tlb_shootnow(&tlbctx);
2091 }
2092 
2093 /*
2094  * pmap_enter_l2pt_delref:
2095  *
2096  *	Release a reference on an L2 PT page for pmap_enter().
2097  *	This is factored out separately because we expect it
2098  *	to be a rare case.
2099  */
2100 static void __noinline
2101 pmap_enter_l2pt_delref(pmap_t const pmap, pt_entry_t * const l1pte,
2102     pt_entry_t * const l2pte)
2103 {
2104 	struct pmap_tlb_context tlbctx;
2105 
2106 	/*
2107 	 * PALcode may have tried to service a TLB miss with
2108 	 * this L2 PTE, so we need to make sure we don't actually
2109 	 * free the PT page until we've shot down any TLB entries
2110 	 * for this VPT index.
2111 	 */
2112 
2113 	pmap_tlb_context_init(&tlbctx, 0);
2114 	pmap_l2pt_delref(pmap, l1pte, l2pte, &tlbctx);
2115 	PMAP_UNLOCK(pmap);
2116 	pmap_tlb_shootnow(&tlbctx);
2117 	pmap_tlb_context_drain(&tlbctx);
2118 	TLB_COUNT(reason_enter_l2pt_delref);
2119 }
2120 
2121 /*
2122  * pmap_enter_l3pt_delref:
2123  *
2124  *	Release a reference on an L3 PT page for pmap_enter().
2125  *	This is factored out separately because we expect it
2126  *	to be a rare case.
2127  */
2128 static void __noinline
2129 pmap_enter_l3pt_delref(pmap_t const pmap, vaddr_t const va,
2130     pt_entry_t * const pte)
2131 {
2132 	struct pmap_tlb_context tlbctx;
2133 
2134 	/*
2135 	 * PALcode may have tried to service a TLB miss with
2136 	 * this PTE, so we need to make sure we don't actually
2137 	 * free the PT page until we've shot down any TLB entries
2138 	 * for this VPT index.
2139 	 */
2140 
2141 	pmap_tlb_context_init(&tlbctx, 0);
2142 	pmap_l3pt_delref(pmap, va, pte, &tlbctx);
2143 	PMAP_UNLOCK(pmap);
2144 	pmap_tlb_shootnow(&tlbctx);
2145 	pmap_tlb_context_drain(&tlbctx);
2146 	TLB_COUNT(reason_enter_l3pt_delref);
2147 }
2148 
2149 /*
2150  * pmap_enter:			[ INTERFACE ]
2151  *
2152  *	Insert the given physical page (p) at
2153  *	the specified virtual address (v) in the
2154  *	target physical map with the protection requested.
2155  *
2156  *	If specified, the page will be wired down, meaning
2157  *	that the related pte can not be reclaimed.
2158  *
2159  *	Note:  This is the only routine which MAY NOT lazy-evaluate
2160  *	or lose information.  That is, this routine must actually
2161  *	insert this page into the given map NOW.
2162  */
2163 int
2164 pmap_enter(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
2165 {
2166 	pt_entry_t *pte, npte, opte;
2167 	pv_entry_t opv = NULL;
2168 	paddr_t opa;
2169 	bool tflush = false;
2170 	int error = 0;
2171 	kmutex_t *lock;
2172 
2173 #ifdef DEBUG
2174 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2175 		printf("pmap_enter(%p, %lx, %lx, %x, %x)\n",
2176 		       pmap, va, pa, prot, flags);
2177 #endif
2178 	struct vm_page * const pg = PHYS_TO_VM_PAGE(pa);
2179 	const bool wired = (flags & PMAP_WIRED) != 0;
2180 
2181 	PMAP_MAP_TO_HEAD_LOCK();
2182 	PMAP_LOCK(pmap);
2183 
2184 	if (pmap == pmap_kernel()) {
2185 		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2186 		pte = PMAP_KERNEL_PTE(va);
2187 	} else {
2188 		pt_entry_t *l1pte, *l2pte;
2189 		pt_entry_t * const lev1map = pmap_lev1map(pmap);
2190 
2191 		KASSERT(va < VM_MAXUSER_ADDRESS);
2192 		KASSERT(lev1map != kernel_lev1map);
2193 
2194 		/*
2195 		 * Check to see if the level 1 PTE is valid, and
2196 		 * allocate a new level 2 page table page if it's not.
2197 		 * A reference will be added to the level 2 table when
2198 		 * the level 3 table is created.
2199 		 */
2200 		l1pte = pmap_l1pte(lev1map, va);
2201 		if (pmap_pte_v(l1pte) == 0) {
2202 			pmap_physpage_addref(l1pte);
2203 			error = pmap_ptpage_alloc(pmap, l1pte, PGU_L2PT);
2204 			if (error) {
2205 				pmap_l1pt_delref(pmap, l1pte);
2206 				if (flags & PMAP_CANFAIL)
2207 					goto out;
2208 				panic("pmap_enter: unable to create L2 PT "
2209 				    "page");
2210 			}
2211 #ifdef DEBUG
2212 			if (pmapdebug & PDB_PTPAGE)
2213 				printf("pmap_enter: new level 2 table at "
2214 				    "0x%lx\n", pmap_pte_pa(l1pte));
2215 #endif
2216 		}
2217 
2218 		/*
2219 		 * Check to see if the level 2 PTE is valid, and
2220 		 * allocate a new level 3 page table page if it's not.
2221 		 * A reference will be added to the level 3 table when
2222 		 * the mapping is validated.
2223 		 */
2224 		l2pte = pmap_l2pte(lev1map, va, l1pte);
2225 		if (pmap_pte_v(l2pte) == 0) {
2226 			pmap_physpage_addref(l2pte);
2227 			error = pmap_ptpage_alloc(pmap, l2pte, PGU_L3PT);
2228 			if (error) {
2229 				/* unlocks pmap */
2230 				pmap_enter_l2pt_delref(pmap, l1pte, l2pte);
2231 				if (flags & PMAP_CANFAIL) {
2232 					PMAP_LOCK(pmap);
2233 					goto out;
2234 				}
2235 				panic("pmap_enter: unable to create L3 PT "
2236 				    "page");
2237 			}
2238 #ifdef DEBUG
2239 			if (pmapdebug & PDB_PTPAGE)
2240 				printf("pmap_enter: new level 3 table at "
2241 				    "0x%lx\n", pmap_pte_pa(l2pte));
2242 #endif
2243 		}
2244 
2245 		/*
2246 		 * Get the PTE that will map the page.
2247 		 */
2248 		pte = pmap_l3pte(lev1map, va, l2pte);
2249 	}
2250 
2251 	/* Remember all of the old PTE; used for TBI check later. */
2252 	opte = atomic_load_relaxed(pte);
2253 
2254 	/*
2255 	 * Check to see if the old mapping is valid.  If not, validate the
2256 	 * new one immediately.
2257 	 */
2258 	if ((opte & PG_V) == 0) {
2259 		/* No TLB invalidations needed for new mappings. */
2260 
2261 		if (pmap != pmap_kernel()) {
2262 			/*
2263 			 * New mappings gain a reference on the level 3
2264 			 * table.
2265 			 */
2266 			pmap_physpage_addref(pte);
2267 		}
2268 		goto validate_enterpv;
2269 	}
2270 
2271 	opa = pmap_pte_pa(pte);
2272 
2273 	if (opa == pa) {
2274 		/*
2275 		 * Mapping has not changed; must be a protection or
2276 		 * wiring change.
2277 		 */
2278 		if (pmap_pte_w_chg(pte, wired ? PG_WIRED : 0)) {
2279 #ifdef DEBUG
2280 			if (pmapdebug & PDB_ENTER)
2281 				printf("pmap_enter: wiring change -> %d\n",
2282 				    wired);
2283 #endif
2284 			/* Adjust the wiring count. */
2285 			if (wired)
2286 				PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2287 			else
2288 				PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2289 		}
2290 
2291 		/* Set the PTE. */
2292 		goto validate;
2293 	}
2294 
2295 	/*
2296 	 * The mapping has changed.  We need to invalidate the
2297 	 * old mapping before creating the new one.
2298 	 */
2299 #ifdef DEBUG
2300 	if (pmapdebug & PDB_ENTER)
2301 		printf("pmap_enter: removing old mapping 0x%lx\n", va);
2302 #endif
2303 	if (pmap != pmap_kernel()) {
2304 		/*
2305 		 * Gain an extra reference on the level 3 table.
2306 		 * pmap_remove_mapping() will delete a reference,
2307 		 * and we don't want the table to be erroneously
2308 		 * freed.
2309 		 */
2310 		pmap_physpage_addref(pte);
2311 	}
2312 	/* Already have the bits from opte above. */
2313 	(void) pmap_remove_mapping(pmap, va, pte, true, &opv, NULL);
2314 
2315  validate_enterpv:
2316 	/* Enter the mapping into the pv_table if appropriate. */
2317 	if (pg != NULL) {
2318 		error = pmap_pv_enter(pmap, pg, va, pte, true, opv);
2319 		if (error) {
2320 			/* This can only fail if opv == NULL */
2321 			KASSERT(opv == NULL);
2322 
2323 			/* unlocks pmap */
2324 			pmap_enter_l3pt_delref(pmap, va, pte);
2325 			if (flags & PMAP_CANFAIL) {
2326 				PMAP_LOCK(pmap);
2327 				goto out;
2328 			}
2329 			panic("pmap_enter: unable to enter mapping in PV "
2330 			    "table");
2331 		}
2332 		opv = NULL;
2333 	}
2334 
2335 	/* Increment counters. */
2336 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
2337 	if (wired)
2338 		PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2339 
2340  validate:
2341 	/* Build the new PTE. */
2342 	npte = ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap, prot) | PG_V;
2343 	if (pg != NULL) {
2344 		struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2345 		uintptr_t attrs = 0;
2346 
2347 		KASSERT(((flags & VM_PROT_ALL) & ~prot) == 0);
2348 
2349 		if (flags & VM_PROT_WRITE)
2350 			attrs |= (PGA_REFERENCED|PGA_MODIFIED);
2351 		else if (flags & VM_PROT_ALL)
2352 			attrs |= PGA_REFERENCED;
2353 
2354 		lock = pmap_pvh_lock(pg);
2355 		mutex_enter(lock);
2356 		attrs = (md->pvh_listx |= attrs);
2357 		mutex_exit(lock);
2358 
2359 		/* Set up referenced/modified emulation for new mapping. */
2360 		if ((attrs & PGA_REFERENCED) == 0)
2361 			npte |= PG_FOR | PG_FOW | PG_FOE;
2362 		else if ((attrs & PGA_MODIFIED) == 0)
2363 			npte |= PG_FOW;
2364 
2365 		/*
2366 		 * Mapping was entered on PV list.
2367 		 */
2368 		npte |= PG_PVLIST;
2369 	}
2370 	if (wired)
2371 		npte |= PG_WIRED;
2372 #ifdef DEBUG
2373 	if (pmapdebug & PDB_ENTER)
2374 		printf("pmap_enter: new pte = 0x%lx\n", npte);
2375 #endif
2376 
2377 	/*
2378 	 * If the HW / PALcode portion of the new PTE is the same as the
2379 	 * old PTE, no TBI is necessary.
2380 	 */
2381 	if (opte & PG_V) {
2382 		tflush = PG_PALCODE(opte) != PG_PALCODE(npte);
2383 	}
2384 
2385 	/* Set the new PTE. */
2386 	atomic_store_relaxed(pte, npte);
2387 
2388 out:
2389 	PMAP_MAP_TO_HEAD_UNLOCK();
2390 
2391 	/*
2392 	 * Invalidate the TLB entry for this VA and any appropriate
2393 	 * caches.
2394 	 */
2395 	if (tflush) {
2396 		/* unlocks pmap */
2397 		pmap_enter_tlb_shootdown(pmap, va, opte, true);
2398 		if (pmap == pmap_kernel()) {
2399 			TLB_COUNT(reason_enter_kernel);
2400 		} else {
2401 			TLB_COUNT(reason_enter_user);
2402 		}
2403 	} else {
2404 		PMAP_UNLOCK(pmap);
2405 	}
2406 
2407 	if (opv)
2408 		pmap_pv_free(opv);
2409 
2410 	return error;
2411 }
2412 
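/*
 * Illustrative sketch (not part of the original source): a caller-side
 * use of PMAP_CANFAIL.  With that flag, resource shortages (PV entries,
 * PT pages) come back as an error return instead of a panic, and the
 * caller is expected to back off and retry.  "pmap", "va" and "pa" are
 * assumed to be set up by the caller.
 */
#if 0	/* example usage sketch -- not compiled */
	int error;

	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_WRITE | PMAP_CANFAIL | PMAP_WIRED);
	if (error) {
		/* ENOMEM from PV/PT allocation; wait for memory and retry. */
	}
#endif
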
2413 /*
2414  * pmap_kenter_pa:		[ INTERFACE ]
2415  *
2416  *	Enter a va -> pa mapping into the kernel pmap without any
2417  *	physical->virtual tracking.
2418  *
2419  *	Note: no locking is necessary in this function.
2420  */
2421 void
2422 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
2423 {
2424 	pmap_t const pmap = pmap_kernel();
2425 
2426 #ifdef DEBUG
2427 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2428 		printf("pmap_kenter_pa(%lx, %lx, %x)\n",
2429 		    va, pa, prot);
2430 #endif
2431 
2432 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2433 
2434 	pt_entry_t * const pte = PMAP_KERNEL_PTE(va);
2435 
2436 	/* Build the new PTE. */
2437 	const pt_entry_t npte =
2438 	    ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap_kernel(), prot) |
2439 	    PG_V | PG_WIRED;
2440 
2441 	/* Set the new PTE. */
2442 	const pt_entry_t opte = atomic_load_relaxed(pte);
2443 	atomic_store_relaxed(pte, npte);
2444 
2445 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
2446 	PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2447 
2448 	/*
2449 	 * There should not have been anything here, previously,
2450 	 * so we can skip TLB shootdowns, etc. in the common case.
2451 	 */
2452 	if (__predict_false(opte & PG_V)) {
2453 		const pt_entry_t diff = npte ^ opte;
2454 
2455 		printf_nolog("%s: mapping already present\n", __func__);
2456 		PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
2457 		if (diff & PG_WIRED)
2458 			PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2459 		/* XXX Can't handle this case. */
2460 		if (diff & PG_PVLIST)
2461 			panic("pmap_kenter_pa: old mapping was managed");
2462 
2463 		pmap_enter_tlb_shootdown(pmap_kernel(), va, opte, false);
2464 		TLB_COUNT(reason_kenter);
2465 	}
2466 }
2467 
2468 /*
2469  * pmap_kremove:		[ INTERFACE ]
2470  *
2471  *	Remove a mapping entered with pmap_kenter_pa() starting at va,
2472  *	for size bytes (assumed to be page rounded).
2473  */
2474 void
2475 pmap_kremove(vaddr_t va, vsize_t size)
2476 {
2477 	pt_entry_t *pte, opte;
2478 	pmap_t const pmap = pmap_kernel();
2479 	struct pmap_tlb_context tlbctx;
2480 	int count = 0;
2481 
2482 #ifdef DEBUG
2483 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2484 		printf("pmap_kremove(%lx, %lx)\n",
2485 		    va, size);
2486 #endif
2487 
2488 	pmap_tlb_context_init(&tlbctx, 0);
2489 
2490 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2491 
2492 	for (; size != 0; size -= PAGE_SIZE, va += PAGE_SIZE) {
2493 		pte = PMAP_KERNEL_PTE(va);
2494 		opte = atomic_load_relaxed(pte);
2495 		if (opte & PG_V) {
2496 			KASSERT((opte & PG_PVLIST) == 0);
2497 
2498 			/* Zap the mapping. */
2499 			atomic_store_relaxed(pte, PG_NV);
2500 			pmap_tlb_shootdown(pmap, va, opte, &tlbctx);
2501 
2502 			count++;
2503 		}
2504 	}
2505 
2506 	/* Update stats. */
2507 	if (__predict_true(count != 0)) {
2508 		PMAP_STAT_DECR(pmap->pm_stats.resident_count, count);
2509 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, count);
2510 	}
2511 
2512 	pmap_tlb_shootnow(&tlbctx);
2513 	TLB_COUNT(reason_kremove);
2514 }
2515 
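/*
 * Illustrative sketch (not part of the original source): pairing
 * pmap_kenter_pa() and pmap_kremove() for an unmanaged, wired kernel
 * mapping.  "kva", "pa" and "npages" are hypothetical; pmap_update()
 * is shown for MI convention even though it is a no-op on this port.
 */
#if 0	/* example usage sketch -- not compiled */
	for (int i = 0; i < npages; i++) {
		pmap_kenter_pa(kva + ptoa(i), pa + ptoa(i),
		    VM_PROT_READ | VM_PROT_WRITE, 0);
	}
	pmap_update(pmap_kernel());

	/* ... use the mapping ... */

	pmap_kremove(kva, ptoa(npages));
	pmap_update(pmap_kernel());
#endif
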
2516 /*
2517  * pmap_unwire:			[ INTERFACE ]
2518  *
2519  *	Clear the wired attribute for a map/virtual-address pair.
2520  *
2521  *	The mapping must already exist in the pmap.
2522  */
2523 void
2524 pmap_unwire(pmap_t pmap, vaddr_t va)
2525 {
2526 	pt_entry_t *pte;
2527 
2528 #ifdef DEBUG
2529 	if (pmapdebug & PDB_FOLLOW)
2530 		printf("pmap_unwire(%p, %lx)\n", pmap, va);
2531 #endif
2532 
2533 	PMAP_LOCK(pmap);
2534 
2535 	pte = pmap_l3pte(pmap_lev1map(pmap), va, NULL);
2536 
2537 	KASSERT(pte != NULL);
2538 	KASSERT(pmap_pte_v(pte));
2539 
2540 	/*
2541 	 * If wiring actually changed (always?) clear the wire bit and
2542 	 * update the wire count.  Note that wiring is not a hardware
2543 	 * characteristic so there is no need to invalidate the TLB.
2544 	 */
2545 	if (pmap_pte_w_chg(pte, 0)) {
2546 		pmap_pte_set_w(pte, false);
2547 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2548 	}
2549 #ifdef DEBUG
2550 	else {
2551 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
2552 		    "didn't change!\n", pmap, va);
2553 	}
2554 #endif
2555 
2556 	PMAP_UNLOCK(pmap);
2557 }
2558 
2559 /*
2560  * pmap_extract:		[ INTERFACE ]
2561  *
2562  *	Extract the physical address associated with the given
2563  *	pmap/virtual address pair.
2564  */
2565 bool
2566 pmap_extract(pmap_t pmap, vaddr_t va, paddr_t *pap)
2567 {
2568 	pt_entry_t *l1pte, *l2pte, *l3pte;
2569 	paddr_t pa;
2570 
2571 #ifdef DEBUG
2572 	if (pmapdebug & PDB_FOLLOW)
2573 		printf("pmap_extract(%p, %lx) -> ", pmap, va);
2574 #endif
2575 
2576 	/*
2577 	 * Take a faster path for the kernel pmap.  Avoids locking,
2578 	 * handles K0SEG.
2579 	 */
2580 	if (__predict_true(pmap == pmap_kernel())) {
2581 #ifdef DEBUG
2582 		bool address_is_valid = vtophys_internal(va, pap);
2583 		if (pmapdebug & PDB_FOLLOW) {
2584 			if (address_is_valid) {
2585 				printf("0x%lx (kernel vtophys)\n", *pap);
2586 			} else {
2587 				printf("failed (kernel vtophys)\n");
2588 			}
2589 		}
2590 		return address_is_valid;
2591 #else
2592 		return vtophys_internal(va, pap);
2593 #endif
2594 	}
2595 
2596 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
2597 
2598 	PMAP_LOCK(pmap);
2599 
2600 	l1pte = pmap_l1pte(lev1map, va);
2601 	if (pmap_pte_v(l1pte) == 0)
2602 		goto out;
2603 
2604 	l2pte = pmap_l2pte(lev1map, va, l1pte);
2605 	if (pmap_pte_v(l2pte) == 0)
2606 		goto out;
2607 
2608 	l3pte = pmap_l3pte(lev1map, va, l2pte);
2609 	if (pmap_pte_v(l3pte) == 0)
2610 		goto out;
2611 
2612 	pa = pmap_pte_pa(l3pte) | (va & PGOFSET);
2613 	PMAP_UNLOCK(pmap);
2614 	if (pap != NULL)
2615 		*pap = pa;
2616 #ifdef DEBUG
2617 	if (pmapdebug & PDB_FOLLOW)
2618 		printf("0x%lx\n", pa);
2619 #endif
2620 	return (true);
2621 
2622  out:
2623 	PMAP_UNLOCK(pmap);
2624 #ifdef DEBUG
2625 	if (pmapdebug & PDB_FOLLOW)
2626 		printf("failed\n");
2627 #endif
2628 	return (false);
2629 }
2630 
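/*
 * Illustrative sketch (not part of the original source): a typical
 * lookup, assuming "buf" is a mapped kernel virtual address.
 */
#if 0	/* example usage sketch -- not compiled */
	paddr_t pa;

	if (pmap_extract(pmap_kernel(), (vaddr_t)buf, &pa)) {
		/* pa now holds the physical address backing buf. */
	}
#endif
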
2631 /*
2632  * pmap_copy:			[ INTERFACE ]
2633  *
2634  *	Copy the mapping range specified by src_addr/len
2635  *	from the source map to the range dst_addr/len
2636  *	in the destination map.
2637  *
2638  *	This routine is only advisory and need not do anything.
2639  */
2640 /* call deleted in <machine/pmap.h> */
2641 
2642 /*
2643  * pmap_update:			[ INTERFACE ]
2644  *
2645  *	Require that all active physical maps contain no
2646  *	incorrect entries NOW, by processing any deferred
2647  *	pmap operations.
2648  */
2649 /* call deleted in <machine/pmap.h> */
2650 
2651 /*
2652  * pmap_activate:		[ INTERFACE ]
2653  *
2654  *	Activate the pmap used by the specified process.  This includes
2655  *	reloading the MMU context of the current process, and marking
2656  *	the pmap in use by the processor.
2657  */
2658 void
2659 pmap_activate(struct lwp *l)
2660 {
2661 	struct pmap * const pmap = l->l_proc->p_vmspace->vm_map.pmap;
2662 	struct pcb * const pcb = lwp_getpcb(l);
2663 
2664 #ifdef DEBUG
2665 	if (pmapdebug & PDB_FOLLOW)
2666 		printf("pmap_activate(%p)\n", l);
2667 #endif
2668 
2669 	KASSERT(kpreempt_disabled());
2670 
2671 	struct cpu_info * const ci = curcpu();
2672 
2673 	KASSERT(l == ci->ci_curlwp);
2674 
2675 	u_long const old_ptbr = pcb->pcb_hw.apcb_ptbr;
2676 	u_int const old_asn = pcb->pcb_hw.apcb_asn;
2677 
2678 	/*
2679 	 * We hold the activation lock to synchronize with TLB shootdown.
2680 	 * The kernel pmap does not require those tests because shootdowns
2681 	 * for the kernel pmap are always sent to all CPUs.
2682 	 */
2683 	if (pmap != pmap_kernel()) {
2684 		PMAP_ACT_LOCK(pmap);
2685 		pcb->pcb_hw.apcb_asn = pmap_asn_alloc(pmap, ci);
2686 		atomic_or_ulong(&pmap->pm_cpus, (1UL << ci->ci_cpuid));
2687 	} else {
2688 		pcb->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
2689 	}
2690 	pcb->pcb_hw.apcb_ptbr =
2691 	    ALPHA_K0SEG_TO_PHYS((vaddr_t)pmap_lev1map(pmap)) >> PGSHIFT;
2692 
2693 	/*
2694 	 * Check to see if the ASN or page table base has changed; if
2695 	 * so, switch to our own context again so that it will take
2696 	 * effect.
2697 	 *
2698 	 * We test ASN first because it's the most likely value to change.
2699 	 */
2700 	if (old_asn != pcb->pcb_hw.apcb_asn ||
2701 	    old_ptbr != pcb->pcb_hw.apcb_ptbr) {
2702 		if (old_asn != pcb->pcb_hw.apcb_asn &&
2703 		    old_ptbr != pcb->pcb_hw.apcb_ptbr) {
2704 			TLB_COUNT(activate_both_change);
2705 		} else if (old_asn != pcb->pcb_hw.apcb_asn) {
2706 			TLB_COUNT(activate_asn_change);
2707 		} else {
2708 			TLB_COUNT(activate_ptbr_change);
2709 		}
2710 		(void) alpha_pal_swpctx((u_long)l->l_md.md_pcbpaddr);
2711 		TLB_COUNT(activate_swpctx);
2712 	} else {
2713 		TLB_COUNT(activate_skip_swpctx);
2714 	}
2715 
2716 	pmap_reference(pmap);
2717 	ci->ci_pmap = pmap;
2718 
2719 	if (pmap != pmap_kernel()) {
2720 		PMAP_ACT_UNLOCK(pmap);
2721 	}
2722 }
2723 
2724 /*
2725  * pmap_deactivate:		[ INTERFACE ]
2726  *
2727  *	Mark that the pmap used by the specified process is no longer
2728  *	in use by the processor.
2729  */
2730 void
2731 pmap_deactivate(struct lwp *l)
2732 {
2733 	struct pmap * const pmap = l->l_proc->p_vmspace->vm_map.pmap;
2734 
2735 #ifdef DEBUG
2736 	if (pmapdebug & PDB_FOLLOW)
2737 		printf("pmap_deactivate(%p)\n", l);
2738 #endif
2739 
2740 	KASSERT(kpreempt_disabled());
2741 
2742 	struct cpu_info * const ci = curcpu();
2743 
2744 	KASSERT(l == ci->ci_curlwp);
2745 	KASSERT(pmap == ci->ci_pmap);
2746 
2747 	/*
2748 	 * There is no need to switch to a different PTBR here,
2749 	 * because a pmap_activate() or SWPCTX is guaranteed
2750 	 * before whatever lev1map we're on now is invalidated
2751 	 * or before user space is accessed again.
2752 	 *
2753 	 * Because only kernel mappings will be accessed before the
2754 	 * next pmap_activate() call, we consider our CPU to be on
2755 	 * the kernel pmap.
2756 	 */
2757 	ci->ci_pmap = pmap_kernel();
2758 	KASSERT(atomic_load_relaxed(&pmap->pm_count) > 1);
2759 	pmap_destroy(pmap);
2760 }
2761 
2762 /* pmap_zero_page() is in pmap_subr.s */
2763 
2764 /* pmap_copy_page() is in pmap_subr.s */
2765 
2766 /*
2767  * pmap_pageidlezero:		[ INTERFACE ]
2768  *
2769  *	Page zero'er for the idle loop.  Returns true if the
2770  *	page was zero'd, FALSE if we aborted for some reason.
2771  */
2772 bool
2773 pmap_pageidlezero(paddr_t pa)
2774 {
2775 	u_long *ptr;
2776 	int i, cnt = PAGE_SIZE / sizeof(u_long);
2777 
2778 	for (i = 0, ptr = (u_long *) ALPHA_PHYS_TO_K0SEG(pa); i < cnt; i++) {
2779 		if (sched_curcpu_runnable_p()) {
2780 			/*
2781 			 * An LWP has become ready.  Abort now,
2782 			 * so we don't keep it waiting while we
2783 			 * finish zeroing the page.
2784 			 */
2785 			return (false);
2786 		}
2787 		*ptr++ = 0;
2788 	}
2789 
2790 	return (true);
2791 }
2792 
2793 /*
2794  * pmap_clear_modify:		[ INTERFACE ]
2795  *
2796  *	Clear the modify bits on the specified physical page.
2797  */
2798 bool
2799 pmap_clear_modify(struct vm_page *pg)
2800 {
2801 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2802 	bool rv = false;
2803 	kmutex_t *lock;
2804 	struct pmap_tlb_context tlbctx;
2805 
2806 #ifdef DEBUG
2807 	if (pmapdebug & PDB_FOLLOW)
2808 		printf("pmap_clear_modify(%p)\n", pg);
2809 #endif
2810 
2811 	pmap_tlb_context_init(&tlbctx, TLB_CTX_F_PV);
2812 
2813 	PMAP_HEAD_TO_MAP_LOCK();
2814 	lock = pmap_pvh_lock(pg);
2815 	mutex_enter(lock);
2816 
2817 	if (md->pvh_listx & PGA_MODIFIED) {
2818 		rv = true;
2819 		pmap_changebit(pg, PG_FOW, ~0UL, &tlbctx);
2820 		md->pvh_listx &= ~PGA_MODIFIED;
2821 	}
2822 
2823 	mutex_exit(lock);
2824 	PMAP_HEAD_TO_MAP_UNLOCK();
2825 
2826 	pmap_tlb_shootnow(&tlbctx);
2827 	TLB_COUNT(reason_clear_modify);
2828 
2829 	return (rv);
2830 }
2831 
2832 /*
2833  * pmap_clear_reference:	[ INTERFACE ]
2834  *
2835  *	Clear the reference bit on the specified physical page.
2836  */
2837 bool
2838 pmap_clear_reference(struct vm_page *pg)
2839 {
2840 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2841 	bool rv = false;
2842 	kmutex_t *lock;
2843 	struct pmap_tlb_context tlbctx;
2844 
2845 #ifdef DEBUG
2846 	if (pmapdebug & PDB_FOLLOW)
2847 		printf("pmap_clear_reference(%p)\n", pg);
2848 #endif
2849 
2850 	pmap_tlb_context_init(&tlbctx, TLB_CTX_F_PV);
2851 
2852 	PMAP_HEAD_TO_MAP_LOCK();
2853 	lock = pmap_pvh_lock(pg);
2854 	mutex_enter(lock);
2855 
2856 	if (md->pvh_listx & PGA_REFERENCED) {
2857 		rv = true;
2858 		pmap_changebit(pg, PG_FOR | PG_FOW | PG_FOE, ~0UL, &tlbctx);
2859 		md->pvh_listx &= ~PGA_REFERENCED;
2860 	}
2861 
2862 	mutex_exit(lock);
2863 	PMAP_HEAD_TO_MAP_UNLOCK();
2864 
2865 	pmap_tlb_shootnow(&tlbctx);
2866 	TLB_COUNT(reason_clear_reference);
2867 
2868 	return (rv);
2869 }
2870 
2871 /*
2872  * pmap_is_referenced:		[ INTERFACE ]
2873  *
2874  *	Return whether or not the specified physical page is referenced
2875  *	by any physical maps.
2876  */
2877 /* See <machine/pmap.h> */
2878 
2879 /*
2880  * pmap_is_modified:		[ INTERFACE ]
2881  *
2882  *	Return whether or not the specified physical page is modified
2883  *	by any physical maps.
2884  */
2885 /* See <machine/pmap.h> */
2886 
2887 /*
2888  * pmap_phys_address:		[ INTERFACE ]
2889  *
2890  *	Return the physical address corresponding to the specified
2891  *	cookie.  Used by the device pager to decode a device driver's
2892  *	mmap entry point return value.
2893  *
2894  *	Note: no locking is necessary in this function.
2895  */
2896 paddr_t
2897 pmap_phys_address(paddr_t ppn)
2898 {
2899 
2900 	return (alpha_ptob(ppn));
2901 }
2902 
2903 /*
2904  * Miscellaneous support routines follow
2905  */
2906 
2907 /*
2908  * alpha_protection_init:
2909  *
2910  *	Initialize Alpha protection code array.
2911  *
2912  *	Note: no locking is necessary in this function.
2913  */
2914 static void
2915 alpha_protection_init(void)
2916 {
2917 	int prot, *kp, *up;
2918 
2919 	kp = protection_codes[0];
2920 	up = protection_codes[1];
2921 
2922 	for (prot = 0; prot < 8; prot++) {
2923 		kp[prot] = PG_ASM;
2924 		up[prot] = 0;
2925 
2926 		if (prot & VM_PROT_READ) {
2927 			kp[prot] |= PG_KRE;
2928 			up[prot] |= PG_KRE | PG_URE;
2929 		}
2930 		if (prot & VM_PROT_WRITE) {
2931 			kp[prot] |= PG_KWE;
2932 			up[prot] |= PG_KWE | PG_UWE;
2933 		}
2934 		if (prot & VM_PROT_EXECUTE) {
2935 			kp[prot] |= PG_EXEC | PG_KRE;
2936 			up[prot] |= PG_EXEC | PG_KRE | PG_URE;
2937 		} else {
2938 			kp[prot] |= PG_FOE;
2939 			up[prot] |= PG_FOE;
2940 		}
2941 	}
2942 }
2943 
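/*
 * Illustrative sketch (not part of the original source): two entries
 * the loop above produces, written out.  protection_codes[0] is the
 * kernel array ("kp"), protection_codes[1] the user array ("up").
 */
#if 0	/* worked examples of the table above -- not compiled */
	/* Kernel, read + execute: ASM, kernel-readable, executable. */
	KASSERT(protection_codes[0][VM_PROT_READ | VM_PROT_EXECUTE] ==
	    (PG_ASM | PG_KRE | PG_EXEC));

	/* User, read + write (no execute): fault-on-execute is set. */
	KASSERT(protection_codes[1][VM_PROT_READ | VM_PROT_WRITE] ==
	    (PG_KRE | PG_URE | PG_KWE | PG_UWE | PG_FOE));
#endif
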
2944 /*
2945  * pmap_remove_mapping:
2946  *
2947  *	Invalidate a single page denoted by pmap/va.
2948  *
2949  *	If (pte != NULL), it is the already computed PTE for the page.
2950  *
2951  *	Note: locking in this function is complicated by the fact
2952  *	that it can be called when the PV list is already locked.
2953  *	(pmap_page_protect()).  In this case, the caller must be
2954  *	careful to get the next PV entry while we remove this entry
2955  *	from beneath it.  We assume that the pmap itself is already
2956  *	locked; dolock applies only to the PV list.
2957  *
2958  *	Returns important PTE bits that the caller needs to check for
2959  *	TLB / I-stream invalidation purposes.
2960  */
2961 static pt_entry_t
2962 pmap_remove_mapping(pmap_t pmap, vaddr_t va, pt_entry_t *pte,
2963     bool dolock, pv_entry_t *opvp, struct pmap_tlb_context * const tlbctx)
2964 {
2965 	pt_entry_t opte;
2966 	paddr_t pa;
2967 	struct vm_page *pg;		/* if != NULL, page is managed */
2968 
2969 #ifdef DEBUG
2970 	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
2971 		printf("pmap_remove_mapping(%p, %lx, %p, %d, %p, %p)\n",
2972 		       pmap, va, pte, dolock, opvp, tlbctx);
2973 #endif
2974 
2975 	/*
2976 	 * PTE not provided, compute it from pmap and va.
2977 	 */
2978 	if (pte == NULL) {
2979 		pte = pmap_l3pte(pmap_lev1map(pmap), va, NULL);
2980 		if (pmap_pte_v(pte) == 0)
2981 			return 0;
2982 	}
2983 
2984 	opte = *pte;
2985 
2986 	pa = PG_PFNUM(opte) << PGSHIFT;
2987 
2988 	/*
2989 	 * Update statistics
2990 	 */
2991 	if (pmap_pte_w(pte))
2992 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2993 	PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
2994 
2995 	/*
2996 	 * Invalidate the PTE after saving the reference modify info.
2997 	 */
2998 #ifdef DEBUG
2999 	if (pmapdebug & PDB_REMOVE)
3000 		printf("remove: invalidating pte at %p\n", pte);
3001 #endif
3002 	atomic_store_relaxed(pte, PG_NV);
3003 
3004 	/*
3005 	 * If we're removing a user mapping, check to see if we
3006 	 * can free page table pages.
3007 	 */
3008 	if (pmap != pmap_kernel()) {
3009 		/*
3010 		 * Delete the reference on the level 3 table.  It will
3011 		 * delete references on the level 2 and 1 tables as
3012 		 * appropriate.
3013 		 */
3014 		pmap_l3pt_delref(pmap, va, pte, tlbctx);
3015 	}
3016 
3017 	if (opte & PG_PVLIST) {
3018 		/*
3019 		 * Remove it from the PV table.
3020 		 */
3021 		pg = PHYS_TO_VM_PAGE(pa);
3022 		KASSERT(pg != NULL);
3023 		pmap_pv_remove(pmap, pg, va, dolock, opvp, tlbctx);
3024 		KASSERT(opvp == NULL || *opvp != NULL);
3025 	}
3026 
3027 	return opte & (PG_V | PG_ASM | PG_EXEC);
3028 }
3029 
3030 /*
3031  * pmap_changebit:
3032  *
3033  *	Set or clear the specified PTE bits for all mappings on the
3034  *	specified page.
3035  *
3036  *	Note: we assume that the pv_head is already locked, and that
3037  *	the caller has acquired a PV->pmap mutex so that we can lock
3038  *	the pmaps as we encounter them.
3039  */
3040 static void
3041 pmap_changebit(struct vm_page *pg, pt_entry_t set, pt_entry_t mask,
3042     struct pmap_tlb_context * const tlbctx)
3043 {
3044 	pv_entry_t pv;
3045 	pt_entry_t *pte, npte, opte;
3046 
3047 #ifdef DEBUG
3048 	if (pmapdebug & PDB_BITS)
3049 		printf("pmap_changebit(%p, 0x%lx, 0x%lx)\n",
3050 		    pg, set, mask);
3051 #endif
3052 
3053 	/*
3054 	 * Loop over all current mappings setting/clearing as apropos.
3055 	 */
3056 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
3057 		PMAP_LOCK(pv->pv_pmap);
3058 
3059 		pte = pv->pv_pte;
3060 
3061 		opte = atomic_load_relaxed(pte);
3062 		npte = (opte | set) & mask;
3063 		if (npte != opte) {
3064 			atomic_store_relaxed(pte, npte);
3065 			pmap_tlb_shootdown_pv(pv->pv_pmap, pv->pv_va,
3066 			    opte, tlbctx);
3067 		}
3068 		PMAP_UNLOCK(pv->pv_pmap);
3069 	}
3070 }
3071 
3072 /*
3073  * pmap_emulate_reference:
3074  *
3075  *	Emulate reference and/or modified bit hits.
3076  *	Return 1 if this was an execute fault on a non-exec mapping,
3077  *	otherwise return 0.
3078  */
3079 int
3080 pmap_emulate_reference(struct lwp *l, vaddr_t v, int user, int type)
3081 {
3082 	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
3083 	pt_entry_t faultoff, *pte;
3084 	struct vm_page *pg;
3085 	paddr_t pa;
3086 	bool didlock = false;
3087 	bool exec = false;
3088 	kmutex_t *lock;
3089 
3090 #ifdef DEBUG
3091 	if (pmapdebug & PDB_FOLLOW)
3092 		printf("pmap_emulate_reference: %p, 0x%lx, %d, %d\n",
3093 		    l, v, user, type);
3094 #endif
3095 
3096 	/*
3097 	 * Convert process and virtual address to physical address.
3098 	 */
3099 	if (v >= VM_MIN_KERNEL_ADDRESS) {
3100 		if (user)
3101 			panic("pmap_emulate_reference: user ref to kernel");
3102 		/*
3103 		 * No need to lock here; kernel PT pages never go away.
3104 		 */
3105 		pte = PMAP_KERNEL_PTE(v);
3106 	} else {
3107 #ifdef DIAGNOSTIC
3108 		if (l == NULL)
3109 			panic("pmap_emulate_reference: bad proc");
3110 		if (l->l_proc->p_vmspace == NULL)
3111 			panic("pmap_emulate_reference: bad p_vmspace");
3112 #endif
3113 		PMAP_LOCK(pmap);
3114 		didlock = true;
3115 		pte = pmap_l3pte(pmap_lev1map(pmap), v, NULL);
3116 		/*
3117 		 * We'll unlock below where we're done with the PTE.
3118 		 */
3119 	}
3120 	exec = pmap_pte_exec(pte);
3121 	if (!exec && type == ALPHA_MMCSR_FOE) {
3122 		if (didlock)
3123 			PMAP_UNLOCK(pmap);
3124 	       return (1);
3125 	}
3126 #ifdef DEBUG
3127 	if (pmapdebug & PDB_FOLLOW) {
3128 		printf("\tpte = %p, ", pte);
3129 		printf("*pte = 0x%lx\n", *pte);
3130 	}
3131 #endif
3132 
3133 	pa = pmap_pte_pa(pte);
3134 
3135 	/*
3136 	 * We're now done with the PTE.  If it was a user pmap, unlock
3137 	 * it now.
3138 	 */
3139 	if (didlock)
3140 		PMAP_UNLOCK(pmap);
3141 
3142 #ifdef DEBUG
3143 	if (pmapdebug & PDB_FOLLOW)
3144 		printf("\tpa = 0x%lx\n", pa);
3145 #endif
3146 #ifdef DIAGNOSTIC
3147 	if (!uvm_pageismanaged(pa))
3148 		panic("pmap_emulate_reference(%p, 0x%lx, %d, %d): "
3149 		      "pa 0x%lx not managed", l, v, user, type, pa);
3150 #endif
3151 
3152 	/*
3153 	 * Twiddle the appropriate bits to reflect the reference
3154 	 * and/or modification..
3155 	 *
3156 	 * The rules:
3157 	 * 	(1) always mark page as used, and
3158 	 *	(2) if it was a write fault, mark page as modified.
3159 	 */
3160 	pg = PHYS_TO_VM_PAGE(pa);
3161 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3162 	struct pmap_tlb_context tlbctx;
3163 
3164 	pmap_tlb_context_init(&tlbctx, TLB_CTX_F_PV);
3165 
3166 	PMAP_HEAD_TO_MAP_LOCK();
3167 	lock = pmap_pvh_lock(pg);
3168 	mutex_enter(lock);
3169 
3170 	if (type == ALPHA_MMCSR_FOW) {
3171 		md->pvh_listx |= (PGA_REFERENCED|PGA_MODIFIED);
3172 		faultoff = PG_FOR | PG_FOW;
3173 	} else {
3174 		md->pvh_listx |= PGA_REFERENCED;
3175 		faultoff = PG_FOR;
3176 		if (exec) {
3177 			faultoff |= PG_FOE;
3178 		}
3179 	}
3180 	pmap_changebit(pg, 0, ~faultoff, &tlbctx);
3181 
3182 	mutex_exit(lock);
3183 	PMAP_HEAD_TO_MAP_UNLOCK();
3184 
3185 	pmap_tlb_shootnow(&tlbctx);
3186 	TLB_COUNT(reason_emulate_reference);
3187 
3188 	return (0);
3189 }
3190 
3191 #ifdef DEBUG
3192 /*
3193  * pmap_pv_dump:
3194  *
3195  *	Dump the physical->virtual data for the specified page.
3196  */
3197 void
3198 pmap_pv_dump(paddr_t pa)
3199 {
3200 	struct vm_page *pg;
3201 	struct vm_page_md *md;
3202 	pv_entry_t pv;
3203 	kmutex_t *lock;
3204 
3205 	pg = PHYS_TO_VM_PAGE(pa);
3206 	md = VM_PAGE_TO_MD(pg);
3207 
3208 	lock = pmap_pvh_lock(pg);
3209 	mutex_enter(lock);
3210 
3211 	printf("pa 0x%lx (attrs = 0x%lx):\n", pa, md->pvh_listx & PGA_ATTRS);
3212 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next)
3213 		printf("     pmap %p, va 0x%lx\n",
3214 		    pv->pv_pmap, pv->pv_va);
3215 	printf("\n");
3216 
3217 	mutex_exit(lock);
3218 }
3219 #endif
3220 
3221 /*
3222  * vtophys:
3223  *
3224  *	Return the physical address corresponding to the K0SEG or
3225  *	K1SEG address provided.
3226  *
3227  *	Note: no locking is necessary in this function.
3228  */
3229 static bool
3230 vtophys_internal(vaddr_t const vaddr, paddr_t * const pap)
3231 {
3232 	paddr_t pa;
3233 
3234 	KASSERT(vaddr >= ALPHA_K0SEG_BASE);
3235 
3236 	if (vaddr <= ALPHA_K0SEG_END) {
3237 		pa = ALPHA_K0SEG_TO_PHYS(vaddr);
3238 	} else {
3239 		pt_entry_t * const pte = PMAP_KERNEL_PTE(vaddr);
3240 		if (__predict_false(! pmap_pte_v(pte))) {
3241 			return false;
3242 		}
3243 		pa = pmap_pte_pa(pte) | (vaddr & PGOFSET);
3244 	}
3245 
3246 	if (pap != NULL) {
3247 		*pap = pa;
3248 	}
3249 
3250 	return true;
3251 }
3252 
3253 paddr_t
3254 vtophys(vaddr_t const vaddr)
3255 {
3256 	paddr_t pa;
3257 
3258 	if (__predict_false(! vtophys_internal(vaddr, &pa)))
3259 		pa = 0;
3260 	return pa;
3261 }
3262 
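/*
 * Illustrative sketch (not part of the original source): both kinds of
 * kernel address vtophys() accepts.  "pa" is assumed to be a valid
 * physical address and "kva" a currently mapped kernel virtual address
 * outside K0SEG.
 */
#if 0	/* example usage sketch -- not compiled */
	paddr_t p0 = vtophys(ALPHA_PHYS_TO_K0SEG(pa));	/* == pa, by arithmetic */
	paddr_t p1 = vtophys(kva);	/* PTE lookup; 0 if kva is not mapped */
#endif
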
3263 /******************** pv_entry management ********************/
3264 
3265 /*
3266  * pmap_pv_enter:
3267  *
3268  *	Add a physical->virtual entry to the pv_table.
3269  */
3270 static int
3271 pmap_pv_enter(pmap_t pmap, struct vm_page *pg, vaddr_t va, pt_entry_t *pte,
3272     bool dolock, pv_entry_t newpv)
3273 {
3274 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3275 	kmutex_t *lock;
3276 
3277 	/*
3278 	 * Allocate and fill in the new pv_entry.
3279 	 */
3280 	if (newpv == NULL) {
3281 		newpv = pmap_pv_alloc();
3282 		if (newpv == NULL)
3283 			return ENOMEM;
3284 	}
3285 	newpv->pv_va = va;
3286 	newpv->pv_pmap = pmap;
3287 	newpv->pv_pte = pte;
3288 
3289 	if (dolock) {
3290 		lock = pmap_pvh_lock(pg);
3291 		mutex_enter(lock);
3292 	}
3293 
3294 #ifdef DEBUG
3295     {
3296 	pv_entry_t pv;
3297 	/*
3298 	 * Make sure the entry doesn't already exist.
3299 	 */
3300 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
3301 		if (pmap == pv->pv_pmap && va == pv->pv_va) {
3302 			printf("pmap = %p, va = 0x%lx\n", pmap, va);
3303 			panic("pmap_pv_enter: already in pv table");
3304 		}
3305 	}
3306     }
3307 #endif
3308 
3309 	/*
3310 	 * ...and put it in the list.
3311 	 */
3312 	uintptr_t const attrs = md->pvh_listx & PGA_ATTRS;
3313 	newpv->pv_next = (struct pv_entry *)(md->pvh_listx & ~PGA_ATTRS);
3314 	md->pvh_listx = (uintptr_t)newpv | attrs;
3315 	LIST_INSERT_HEAD(&pmap->pm_pvents, newpv, pv_link);
3316 
3317 	if (dolock) {
3318 		mutex_exit(lock);
3319 	}
3320 
3321 	return 0;
3322 }
3323 
3324 /*
3325  * pmap_pv_remove:
3326  *
3327  *	Remove a physical->virtual entry from the pv_table.
3328  */
3329 static void
3330 pmap_pv_remove(pmap_t pmap, struct vm_page *pg, vaddr_t va, bool dolock,
3331     pv_entry_t *opvp, struct pmap_tlb_context * const tlbctx)
3332 {
3333 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3334 	pv_entry_t pv, *pvp;
3335 	kmutex_t *lock;
3336 
3337 	if (dolock) {
3338 		lock = pmap_pvh_lock(pg);
3339 		mutex_enter(lock);
3340 	} else {
3341 		lock = NULL; /* XXX stupid gcc */
3342 	}
3343 
3344 	/*
3345 	 * Find the entry to remove.
3346 	 */
3347 	for (pvp = (struct pv_entry **)&md->pvh_listx, pv = VM_MDPAGE_PVS(pg);
3348 	     pv != NULL; pvp = &pv->pv_next, pv = *pvp)
3349 		if (pmap == pv->pv_pmap && va == pv->pv_va)
3350 			break;
3351 
3352 	KASSERT(pv != NULL);
3353 
3354 	/*
3355 	 * The page attributes are in the lower 2 bits of the first
3356 	 * PV entry pointer.  Rather than comparing the pointer address
3357 	 * and branching, we just always preserve what might be there
3358 	 * (either attribute bits or zero bits).
3359 	 */
3360 	*pvp = (pv_entry_t)((uintptr_t)pv->pv_next |
3361 			    (((uintptr_t)*pvp) & PGA_ATTRS));
3362 	LIST_REMOVE(pv, pv_link);
3363 
3364 	if (dolock) {
3365 		mutex_exit(lock);
3366 	}
3367 
3368 	if (opvp != NULL) {
3369 		*opvp = pv;
3370 	} else {
3371 		KASSERT(tlbctx != NULL);
3372 		LIST_INSERT_HEAD(&tlbctx->t_freepvq, pv, pv_link);
3373 	}
3374 }
3375 
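/*
 * Illustrative sketch (not part of the original source): how the PV
 * list head and the page attribute bits share md->pvh_listx, as the
 * code above relies on.  The attributes live in the low PGA_ATTRS bits
 * of the first-entry pointer, which are otherwise zero because pv_entry
 * allocations are suitably aligned.  "pg" is a hypothetical page.
 */
#if 0	/* decoding sketch -- not compiled */
	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
	pv_entry_t first = (pv_entry_t)(md->pvh_listx & ~PGA_ATTRS);
	uintptr_t attrs = md->pvh_listx & PGA_ATTRS;	/* PGA_REFERENCED, ... */
#endif
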
3376 /*
3377  * pmap_pv_page_alloc:
3378  *
3379  *	Allocate a page for the pv_entry pool.
3380  */
3381 static void *
3382 pmap_pv_page_alloc(struct pool *pp, int flags)
3383 {
3384 	struct vm_page * const pg = pmap_physpage_alloc(PGU_PVENT);
3385 	if (__predict_false(pg == NULL)) {
3386 		return NULL;
3387 	}
3388 	return (void *)ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(pg));
3389 }
3390 
3391 /*
3392  * pmap_pv_page_free:
3393  *
3394  *	Free a pv_entry pool page.
3395  */
3396 static void
3397 pmap_pv_page_free(struct pool *pp, void *v)
3398 {
3399 
3400 	pmap_physpage_free(ALPHA_K0SEG_TO_PHYS((vaddr_t)v));
3401 }
3402 
3403 /******************** misc. functions ********************/
3404 
3405 /*
3406  * pmap_physpage_alloc:
3407  *
3408  *	Allocate a single page from the VM system and return the
3409  *	physical address for that page.
3410  */
3411 static struct vm_page *
3412 pmap_physpage_alloc(int usage)
3413 {
3414 	struct vm_page *pg;
3415 
3416 	/*
3417 	 * Don't ask for a zero'd page in the L1PT case -- we will
3418 	 * properly initialize it in the constructor.
3419 	 */
3420 
3421 	pg = uvm_pagealloc(NULL, 0, NULL, usage == PGU_L1PT ?
3422 	    UVM_PGA_USERESERVE : UVM_PGA_USERESERVE|UVM_PGA_ZERO);
3423 	if (pg != NULL) {
3424 		KASSERT(PHYSPAGE_REFCNT(pg) == 0);
3425 	}
3426 	return pg;
3427 }
3428 
3429 /*
3430  * pmap_physpage_free:
3431  *
3432  *	Free the single page table page at the specified physical address.
3433  */
3434 static void
3435 pmap_physpage_free(paddr_t pa)
3436 {
3437 	struct vm_page *pg;
3438 
3439 	if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL)
3440 		panic("pmap_physpage_free: bogus physical page address");
3441 
3442 	KASSERT(PHYSPAGE_REFCNT(pg) == 0);
3443 
3444 	uvm_pagefree(pg);
3445 }
3446 
3447 /*
3448  * pmap_physpage_addref:
3449  *
3450  *	Add a reference to the specified special use page.
3451  */
3452 static int
3453 pmap_physpage_addref(void *kva)
3454 {
3455 	struct vm_page *pg;
3456 	paddr_t pa;
3457 
3458 	pa = ALPHA_K0SEG_TO_PHYS(trunc_page((vaddr_t)kva));
3459 	pg = PHYS_TO_VM_PAGE(pa);
3460 
3461 	KASSERT(PHYSPAGE_REFCNT(pg) < UINT32_MAX);
3462 
3463 	return PHYSPAGE_REFCNT_INC(pg);
3464 }
3465 
3466 /*
3467  * pmap_physpage_delref:
3468  *
3469  *	Delete a reference to the specified special use page.
3470  */
3471 static int
3472 pmap_physpage_delref(void *kva)
3473 {
3474 	struct vm_page *pg;
3475 	paddr_t pa;
3476 
3477 	pa = ALPHA_K0SEG_TO_PHYS(trunc_page((vaddr_t)kva));
3478 	pg = PHYS_TO_VM_PAGE(pa);
3479 
3480 	KASSERT(PHYSPAGE_REFCNT(pg) != 0);
3481 
3482 	return PHYSPAGE_REFCNT_DEC(pg);
3483 }
3484 
3485 /******************** page table page management ********************/
3486 
3487 static bool
3488 pmap_kptpage_alloc(paddr_t *pap)
3489 {
3490 	if (uvm.page_init_done == false) {
3491 		/*
3492 		 * We're growing the kernel pmap early (from
3493 		 * uvm_pageboot_alloc()).  This case must
3494 		 * be handled a little differently.
3495 		 */
3496 		*pap = ALPHA_K0SEG_TO_PHYS(
3497 		    pmap_steal_memory(PAGE_SIZE, NULL, NULL));
3498 		return true;
3499 	}
3500 
3501 	struct vm_page * const pg = pmap_physpage_alloc(PGU_NORMAL);
3502 	if (__predict_true(pg != NULL)) {
3503 		*pap = VM_PAGE_TO_PHYS(pg);
3504 		return true;
3505 	}
3506 	return false;
3507 }
3508 
3509 /*
3510  * pmap_growkernel:		[ INTERFACE ]
3511  *
3512  *	Grow the kernel address space.  This is a hint from the
3513  *	upper layer to pre-allocate more kernel PT pages.
3514  */
3515 vaddr_t
3516 pmap_growkernel(vaddr_t maxkvaddr)
3517 {
3518 	struct pmap *pm;
3519 	paddr_t ptaddr;
3520 	pt_entry_t *l1pte, *l2pte, pte;
3521 	pt_entry_t *lev1map;
3522 	vaddr_t va;
3523 	int l1idx;
3524 
3525 	rw_enter(&pmap_growkernel_lock, RW_WRITER);
3526 
3527 	if (maxkvaddr <= virtual_end)
3528 		goto out;		/* we are OK */
3529 
3530 	pmap_growkernel_evcnt.ev_count++;
3531 
3532 	va = virtual_end;
3533 
3534 	while (va < maxkvaddr) {
3535 		/*
3536 		 * If there is no valid L1 PTE (i.e. no L2 PT page),
3537 		 * allocate a new L2 PT page and insert it into the
3538 		 * L1 map.
3539 		 */
3540 		l1pte = pmap_l1pte(kernel_lev1map, va);
3541 		if (pmap_pte_v(l1pte) == 0) {
3542 			if (!pmap_kptpage_alloc(&ptaddr))
3543 				goto die;
3544 			pte = (atop(ptaddr) << PG_SHIFT) |
3545 			    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
3546 			*l1pte = pte;
3547 
3548 			l1idx = l1pte_index(va);
3549 
3550 			/* Update all the user pmaps. */
3551 			mutex_enter(&pmap_all_pmaps_lock);
3552 			for (pm = TAILQ_FIRST(&pmap_all_pmaps);
3553 			     pm != NULL; pm = TAILQ_NEXT(pm, pm_list)) {
3554 				/* Skip the kernel pmap. */
3555 				if (pm == pmap_kernel())
3556 					continue;
3557 
3558 				/*
3559 				 * Any pmaps published on the global list
3560 				 * should never be referencing kernel_lev1map.
3561 				 */
3562 				lev1map = pmap_lev1map(pm);
3563 				KASSERT(lev1map != kernel_lev1map);
3564 
3565 				PMAP_LOCK(pm);
3566 				lev1map[l1idx] = pte;
3567 				PMAP_UNLOCK(pm);
3568 			}
3569 			mutex_exit(&pmap_all_pmaps_lock);
3570 		}
3571 
3572 		/*
3573 		 * Have an L2 PT page now, add the L3 PT page.
3574 		 */
3575 		l2pte = pmap_l2pte(kernel_lev1map, va, l1pte);
3576 		KASSERT(pmap_pte_v(l2pte) == 0);
3577 		if (!pmap_kptpage_alloc(&ptaddr))
3578 			goto die;
3579 		*l2pte = (atop(ptaddr) << PG_SHIFT) |
3580 		    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
3581 		va += ALPHA_L2SEG_SIZE;
3582 	}
3583 
3584 	/* Invalidate the L1 PT cache. */
3585 	pool_cache_invalidate(&pmap_l1pt_cache);
3586 
3587 	virtual_end = va;
3588 
3589  out:
3590 	rw_exit(&pmap_growkernel_lock);
3591 
3592 	return (virtual_end);
3593 
3594  die:
3595 	panic("pmap_growkernel: out of memory");
3596 }
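
/*
 * Illustrative sketch (not compiled): the PTE encoding pmap_growkernel()
 * uses for a new kernel PT page.  atop() converts a physical address to
 * a page frame number, PG_SHIFT positions that frame number in the
 * PTE's PFN field, and the remaining bits mark the entry valid (PG_V),
 * address-space-match (PG_ASM), kernel readable/writable (PG_KRE,
 * PG_KWE) and wired.  The helper name is hypothetical.
 */
#if 0
static pt_entry_t
example_kernel_ptpage_pte(paddr_t ptaddr)
{

	return (atop(ptaddr) << PG_SHIFT) |
	    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
}
#endif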
3597 
3598 /*
3599  * pmap_l1pt_ctor:
3600  *
3601  *	Pool cache constructor for L1 PT pages.
3602  *
3603  *	Note: The growkernel lock is held across allocations
3604  *	from our pool_cache, so we don't need to acquire it
3605  *	ourselves.
3606  */
3607 static int
3608 pmap_l1pt_ctor(void *arg, void *object, int flags)
3609 {
3610 	pt_entry_t *l1pt = object, pte;
3611 	int i;
3612 
3613 	/*
3614 	 * Initialize the new level 1 table by zeroing the
3615 	 * user portion and copying the kernel mappings into
3616 	 * the kernel portion.
3617 	 */
3618 	for (i = 0; i < l1pte_index(VM_MIN_KERNEL_ADDRESS); i++)
3619 		l1pt[i] = 0;
3620 
3621 	for (i = l1pte_index(VM_MIN_KERNEL_ADDRESS);
3622 	     i <= l1pte_index(VM_MAX_KERNEL_ADDRESS); i++)
3623 		l1pt[i] = kernel_lev1map[i];
3624 
3625 	/*
3626 	 * Now, map the new virtual page table.  NOTE: NO ASM!
3627 	 */
3628 	pte = ((ALPHA_K0SEG_TO_PHYS((vaddr_t) l1pt) >> PGSHIFT) << PG_SHIFT) |
3629 	    PG_V | PG_KRE | PG_KWE;
3630 	l1pt[l1pte_index(VPTBASE)] = pte;
3631 
3632 	return (0);
3633 }
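
/*
 * Illustrative sketch (not compiled): the layout pmap_l1pt_ctor()
 * establishes in a freshly constructed L1 table -- user slots zeroed,
 * kernel slots copied from kernel_lev1map, and the VPT slot rewritten
 * to map this table itself (without PG_ASM).  The checking function
 * is hypothetical.
 */
#if 0
static void
example_check_l1pt(const pt_entry_t *l1pt)
{
	int i;

	/* User portion: no mappings yet. */
	for (i = 0; i < l1pte_index(VM_MIN_KERNEL_ADDRESS); i++)
		KASSERT(l1pt[i] == 0);

	/* VPT slot: valid and kernel read/write, but *not* PG_ASM. */
	KASSERT((l1pt[l1pte_index(VPTBASE)] & (PG_V | PG_ASM)) == PG_V);
}
#endif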
3634 
3635 /*
3636  * pmap_l1pt_alloc:
3637  *
3638  *	Page allocator for L1 PT pages.
3639  */
3640 static void *
3641 pmap_l1pt_alloc(struct pool *pp, int flags)
3642 {
3643 	/*
3644 	 * Attempt to allocate a free page.
3645 	 */
3646 	struct vm_page * const pg = pmap_physpage_alloc(PGU_L1PT);
3647 	if (__predict_false(pg == NULL)) {
3648 		return NULL;
3649 	}
3650 	return (void *)ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(pg));
3651 }
3652 
3653 /*
3654  * pmap_l1pt_free:
3655  *
3656  *	Page freer for L1 PT pages.
3657  */
3658 static void
3659 pmap_l1pt_free(struct pool *pp, void *v)
3660 {
3661 
3662 	pmap_physpage_free(ALPHA_K0SEG_TO_PHYS((vaddr_t) v));
3663 }
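
/*
 * Illustrative sketch (not compiled): how the two routines above plug
 * into a pool allocator, so that the L1 PT pool cache hands out whole,
 * page-aligned PT pages mapped through K0SEG.  The actual
 * pool_allocator and pool_cache setup lives elsewhere in this file;
 * the initializer below only approximates it.
 */
#if 0
static struct pool_allocator example_l1pt_allocator = {
	.pa_alloc = pmap_l1pt_alloc,
	.pa_free = pmap_l1pt_free,
	.pa_pagesz = 0,		/* 0 selects the default (PAGE_SIZE) */
};
#endif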
3664 
3665 /*
3666  * pmap_ptpage_alloc:
3667  *
3668  *	Allocate a level 2 or level 3 page table page for a user
3669  *	pmap, and initialize the PTE that references it.
3670  *
3671  *	Note: the pmap must already be locked.
3672  */
3673 static int
3674 pmap_ptpage_alloc(pmap_t pmap, pt_entry_t * const pte, int const usage)
3675 {
3676 	/*
3677 	 * Allocate the page table page.
3678 	 */
3679 	struct vm_page * const pg = pmap_physpage_alloc(usage);
3680 	if (__predict_false(pg == NULL)) {
3681 		return ENOMEM;
3682 	}
3683 
3684 	LIST_INSERT_HEAD(&pmap->pm_ptpages, pg, pageq.list);
3685 
3686 	/*
3687 	 * Initialize the referencing PTE.
3688 	 */
3689 	const pt_entry_t npte = ((VM_PAGE_TO_PHYS(pg) >> PGSHIFT) << PG_SHIFT) |
3690 	    PG_V | PG_KRE | PG_KWE | PG_WIRED;
3691 
3692 	atomic_store_relaxed(pte, npte);
3693 
3694 	return (0);
3695 }
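
/*
 * Illustrative sketch (not compiled): how a caller that finds an
 * invalid L2 PTE might use pmap_ptpage_alloc() to populate it with a
 * new L3 page table page.  The function name here is hypothetical,
 * and PGU_L3PT is assumed to be the usage tag for L3 PT pages; the
 * real callers are the enter/grow paths elsewhere in this file, which
 * hold the pmap lock as required.
 */
#if 0
static int
example_ensure_l3pt(pmap_t pmap, pt_entry_t *l2pte)
{
	int error;

	/* The pmap is already locked by the caller. */
	if (pmap_pte_v(l2pte) == 0) {
		error = pmap_ptpage_alloc(pmap, l2pte, PGU_L3PT);
		if (error != 0)
			return error;	/* ENOMEM: caller must back out */
	}
	return 0;
}
#endif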
3696 
3697 /*
3698  * pmap_ptpage_free:
3699  *
3700  *	Free the level 2 or level 3 page table page referenced
3701  *	be the provided PTE.
3702  *
3703  *	Note: the pmap must already be locked.
3704  */
3705 static void
3706 pmap_ptpage_free(pmap_t pmap, pt_entry_t * const pte,
3707     struct pmap_tlb_context * const tlbctx)
3708 {
3709 
3710 	/*
3711 	 * Extract the physical address of the page from the PTE
3712 	 * and clear the entry.
3713 	 */
3714 	const paddr_t ptpa = pmap_pte_pa(pte);
3715 	atomic_store_relaxed(pte, PG_NV);
3716 
3717 	struct vm_page * const pg = PHYS_TO_VM_PAGE(ptpa);
3718 	KASSERT(pg != NULL);
3719 
3720 	KASSERT(PHYSPAGE_REFCNT(pg) == 0);
3721 #ifdef DEBUG
3722 	pmap_zero_page(ptpa);
3723 #endif
3724 
3725 	LIST_REMOVE(pg, pageq.list);
3726 	LIST_INSERT_HEAD(&tlbctx->t_freeptq, pg, pageq.list);
3727 }
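
/*
 * Illustrative sketch (not compiled): pmap_ptpage_free() does not hand
 * the PT page back to UVM immediately -- it only moves it onto the TLB
 * context's free queue.  The pages are actually released once the
 * associated TLB shootdowns have completed, along the lines of the
 * hypothetical drain loop below (the real drain logic lives elsewhere
 * in this file).
 */
#if 0
static void
example_drain_freed_ptpages(struct pmap_tlb_context *tlbctx)
{
	struct vm_page *pg;

	while ((pg = LIST_FIRST(&tlbctx->t_freeptq)) != NULL) {
		LIST_REMOVE(pg, pageq.list);
		uvm_pagefree(pg);
	}
}
#endif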
3728 
3729 /*
3730  * pmap_l3pt_delref:
3731  *
3732  *	Delete a reference on a level 3 PT page.  If the reference drops
3733  *	to zero, free it.
3734  *
3735  *	Note: the pmap must already be locked.
3736  */
3737 static void
3738 pmap_l3pt_delref(pmap_t pmap, vaddr_t va, pt_entry_t *l3pte,
3739     struct pmap_tlb_context * const tlbctx)
3740 {
3741 	pt_entry_t *l1pte, *l2pte;
3742 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
3743 
3744 	l1pte = pmap_l1pte(lev1map, va);
3745 	l2pte = pmap_l2pte(lev1map, va, l1pte);
3746 
3747 #ifdef DIAGNOSTIC
3748 	if (pmap == pmap_kernel())
3749 		panic("pmap_l3pt_delref: kernel pmap");
3750 #endif
3751 
3752 	if (pmap_physpage_delref(l3pte) == 0) {
3753 		/*
3754 		 * No more mappings; we can free the level 3 table.
3755 		 */
3756 #ifdef DEBUG
3757 		if (pmapdebug & PDB_PTPAGE)
3758 			printf("pmap_l3pt_delref: freeing level 3 table at "
3759 			    "0x%lx\n", pmap_pte_pa(l2pte));
3760 #endif
3761 		/*
3762 		 * You can pass NULL if you know the last reference won't
3763 		 * be dropped.
3764 		 */
3765 		KASSERT(tlbctx != NULL);
3766 		pmap_ptpage_free(pmap, l2pte, tlbctx);
3767 
3768 		/*
3769 		 * We've freed a level 3 table, so we must invalidate
3770 		 * any now-stale TLB entries for the corresponding VPT
3771 		 * VA range.  Easiest way to guarantee this is to hit
3772 		 * all of the user TLB entries.
3773 		 */
3774 		pmap_tlb_shootdown_all_user(pmap, PG_V, tlbctx);
3775 
3776 		/*
3777 		 * We've freed a level 3 table, so delete the reference
3778 		 * on the level 2 table.
3779 		 */
3780 		pmap_l2pt_delref(pmap, l1pte, l2pte, tlbctx);
3781 	}
3782 }
3783 
3784 /*
3785  * pmap_l2pt_delref:
3786  *
3787  *	Delete a reference on a level 2 PT page.  If the reference drops
3788  *	to zero, free it.
3789  *
3790  *	Note: the pmap must already be locked.
3791  */
3792 static void
3793 pmap_l2pt_delref(pmap_t pmap, pt_entry_t *l1pte, pt_entry_t *l2pte,
3794     struct pmap_tlb_context * const tlbctx)
3795 {
3796 
3797 #ifdef DIAGNOSTIC
3798 	if (pmap == pmap_kernel())
3799 		panic("pmap_l2pt_delref: kernel pmap");
3800 #endif
3801 
3802 	if (pmap_physpage_delref(l2pte) == 0) {
3803 		/*
3804 		 * No more mappings in this segment; we can free the
3805 		 * level 2 table.
3806 		 */
3807 #ifdef DEBUG
3808 		if (pmapdebug & PDB_PTPAGE)
3809 			printf("pmap_l2pt_delref: freeing level 2 table at "
3810 			    "0x%lx\n", pmap_pte_pa(l1pte));
3811 #endif
3812 		/*
3813 		 * You can pass NULL if you know the last reference won't
3814 		 * be dropped.
3815 		 */
3816 		KASSERT(tlbctx != NULL);
3817 		pmap_ptpage_free(pmap, l1pte, tlbctx);
3818 
3819 		/*
3820 		 * We've freed a level 2 table, so we must invalidate
3821 		 * any now-stale TLB entries for the corresponding VPT
3822 		 * VA range.  Easiest way to guarantee this is to hit
3823 		 * all of the user TLB entries.
3824 		 */
3825 		pmap_tlb_shootdown_all_user(pmap, PG_V, tlbctx);
3826 
3827 		/*
3828 		 * We've freed a level 2 table, so delete the reference
3829 		 * on the level 1 table.
3830 		 */
3831 		pmap_l1pt_delref(pmap, l1pte);
3832 	}
3833 }
3834 
3835 /*
3836  * pmap_l1pt_delref:
3837  *
3838  *	Delete a reference on a level 1 PT page.
3839  */
3840 static void
3841 pmap_l1pt_delref(pmap_t pmap, pt_entry_t *l1pte)
3842 {
3843 
3844 	KASSERT(pmap != pmap_kernel());
3845 
3846 	(void)pmap_physpage_delref(l1pte);
3847 }
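
/*
 * Illustrative sketch (not compiled): how the three delref routines
 * above cascade when the last mapping in a segment is removed.  The
 * entry point and its arguments are hypothetical, and the actual
 * removal path earlier in this file does considerably more work
 * (statistics, PV tracking, TLB shootdown of the mapping itself).
 */
#if 0
static void
example_unmap_last_page(pmap_t pmap, vaddr_t va, pt_entry_t *l3pte,
    struct pmap_tlb_context *tlbctx)
{

	/* Invalidate the mapping itself (shootdown of this VA omitted). */
	atomic_store_relaxed(l3pte, PG_NV);

	/*
	 * Dropping the reference may free the L3 PT page, which drops
	 * a reference on the L2 PT page, which may free it and drop a
	 * reference on the L1 PT page in turn.
	 */
	pmap_l3pt_delref(pmap, va, l3pte, tlbctx);
}
#endif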
3848 
3849 /******************** Address Space Number management ********************/
3850 
3851 /*
3852  * pmap_asn_alloc:
3853  *
3854  *	Allocate and assign an ASN to the specified pmap.
3855  *
3856  *	Note: the pmap must already be locked.  This may be called from
3857  *	an interprocessor interrupt, and in that case, the sender of
3858  *	the IPI has the pmap lock.
3859  */
3860 static u_int
3861 pmap_asn_alloc(pmap_t const pmap, struct cpu_info * const ci)
3862 {
3863 
3864 #ifdef DEBUG
3865 	if (pmapdebug & (PDB_FOLLOW|PDB_ASN))
3866 		printf("pmap_asn_alloc(%p)\n", pmap);
3867 #endif
3868 
3869 	KASSERT(pmap != pmap_kernel());
3870 	KASSERT(pmap->pm_percpu[ci->ci_cpuid].pmc_lev1map != kernel_lev1map);
3871 	KASSERT(kpreempt_disabled());
3872 
3873 	/* No work to do if the CPU does not implement ASNs. */
3874 	if (pmap_max_asn == 0)
3875 		return 0;
3876 
3877 	struct pmap_percpu * const pmc = &pmap->pm_percpu[ci->ci_cpuid];
3878 
3879 	/*
3880 	 * Hopefully, we can continue using the one we have...
3881 	 *
3882 	 * N.B. the generation check will fail the first time
3883 	 * any pmap is activated on a given CPU, because we start
3884 	 * the generation counter at 1, but initialize pmaps with
3885 	 * 0; this forces the first ASN allocation to occur.
3886 	 */
3887 	if (pmc->pmc_asngen == ci->ci_asn_gen) {
3888 #ifdef DEBUG
3889 		if (pmapdebug & PDB_ASN)
3890 			printf("pmap_asn_alloc: same generation, keeping %u\n",
3891 			    pmc->pmc_asn);
3892 #endif
3893 		TLB_COUNT(asn_reuse);
3894 		return pmc->pmc_asn;
3895 	}
3896 
3897 	/*
3898 	 * Need to assign a new ASN.  Grab the next one, incrementing
3899 	 * the generation number if we have to.
3900 	 */
3901 	if (ci->ci_next_asn > pmap_max_asn) {
3902 		/*
3903 		 * Invalidate all non-PG_ASM TLB entries and the
3904 		 * I-cache, and bump the generation number.
3905 		 */
3906 		ALPHA_TBIAP();
3907 		alpha_pal_imb();
3908 
3909 		ci->ci_next_asn = PMAP_ASN_FIRST_USER;
3910 		ci->ci_asn_gen++;
3911 		TLB_COUNT(asn_newgen);
3912 
3913 		/*
3914 		 * Make sure the generation number doesn't wrap.  We could
3915 		 * handle this scenario by traversing all of the pmaps,
3916 		 * and invalidating the generation number on those which
3917 		 * are not currently in use by this processor.
3918 		 *
3919 		 * However... considering that we're using an unsigned 64-bit
3920 		 * integer for generation numbers, we won't wrap for
3921 		 * approximately 75 billion years on a 128-ASN CPU
3922 		 * (assuming 1000 switch operations per second).
3923 		 *
3924 		 * So, we don't bother.
3925 		 */
3926 		KASSERT(ci->ci_asn_gen != PMAP_ASNGEN_INVALID);
3927 #ifdef DEBUG
3928 		if (pmapdebug & PDB_ASN)
3929 			printf("pmap_asn_alloc: generation bumped to %lu\n",
3930 			    ci->ci_asn_gen);
3931 #endif
3932 	}
3933 
3934 	/*
3935 	 * Assign the new ASN and validate the generation number.
3936 	 */
3937 	pmc->pmc_asn = ci->ci_next_asn++;
3938 	pmc->pmc_asngen = ci->ci_asn_gen;
3939 	TLB_COUNT(asn_assign);
3940 
3941 	/*
3942 	 * We have a new ASN, so we can skip any pending I-stream sync
3943 	 * on the way back out to user space.
3944 	 */
3945 	pmc->pmc_needisync = 0;
3946 
3947 #ifdef DEBUG
3948 	if (pmapdebug & PDB_ASN)
3949 		printf("pmap_asn_alloc: assigning %u to pmap %p\n",
3950 		    pmc->pmc_asn, pmap);
3951 #endif
3952 	return pmc->pmc_asn;
3953 }
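
/*
 * Illustrative sketch (not compiled): the generation check above in
 * miniature.  With pmap_max_asn == 127, a CPU hands out ASNs
 * PMAP_ASN_FIRST_USER..127; once they are exhausted it flushes the
 * non-PG_ASM TLB entries (ALPHA_TBIAP) and bumps ci_asn_gen, which
 * invalidates every ASN previously cached in per-CPU pmap state.  The
 * predicate below is hypothetical and merely restates that logic.
 */
#if 0
static bool
example_asn_still_valid(const struct pmap_percpu *pmc,
    const struct cpu_info *ci)
{

	/* Reusable only if it was assigned in the current generation. */
	return pmc->pmc_asngen == ci->ci_asn_gen;
}
#endif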
3954