xref: /illumos-gate/usr/src/uts/common/io/physmem.c (revision b6c3f786)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/modctl.h>
30 #include <sys/conf.h>
31 #include <sys/ddi.h>
32 #include <sys/sunddi.h>
33 #include <sys/devops.h>
34 #include <sys/stat.h>
35 #include <sys/file.h>
36 #include <sys/cred.h>
37 #include <sys/policy.h>
38 #include <sys/errno.h>
39 #include <vm/seg_dev.h>
40 #include <vm/seg_vn.h>
41 #include <vm/page.h>
42 #include <sys/fs/swapnode.h>
43 #include <sys/sysmacros.h>
44 #include <sys/fcntl.h>
45 #include <sys/vmsystm.h>
46 #include <sys/physmem.h>
47 #include <sys/vfs_opreg.h>
48 
49 static dev_info_t		*physmem_dip = NULL;
50 
51 /*
52  * Linked list element hanging off physmem_proc_hash below, which holds all
53  * the information for a given segment which has been setup for this process.
54  * This is a simple linked list as we are assuming that for a given process
55  * the setup ioctl will only be called a handful of times.  If this assumption
56  * changes in the future, a quicker to traverse data structure should be used.
57  */
58 struct physmem_hash {
59 	struct physmem_hash *ph_next;
60 	uint64_t ph_base_pa;
61 	caddr_t ph_base_va;
62 	size_t ph_seg_len;
63 	struct vnode *ph_vnode;
64 };
65 
/*
 * Per-process hash entry.  There is one of these for every process that has
 * set up mappings with the driver; it anchors that process's list of
 * physmem_hash segments.
 */
struct physmem_proc_hash {
	struct proc			*pph_proc;	/* owning process */
	struct physmem_hash		*pph_hash;	/* its segment list */
	struct physmem_proc_hash	*pph_next;	/* next in bucket */
};
75 
76 
/*
 * Bucket count of the per-process hash table.  PHYSMEM_HASH() reduces a
 * proc pointer with a mask of (PPH_SIZE - 1), so this has to stay a power
 * of two.
 */
#define	PPH_SIZE	8
struct physmem_proc_hash *pph[PPH_SIZE];
80 
81 /*
82  * Lock which protects the pph hash above.  To add an element (either a new
83  * process or a new segment) the WRITE lock must be held.  To traverse the
84  * list, only a READ lock is needed.
85  */
86 krwlock_t pph_rwlock;
87 
/*
 * Map a proc pointer to a bucket index in pph[]: drop the low 8 bits of the
 * address and mask the result down to the table size.  The argument is
 * parenthesized so that an expression (not just a plain identifier) can be
 * passed safely.
 */
#define	PHYSMEM_HASH(procp) \
	((int)((((uintptr_t)(procp)) >> 8) & (PPH_SIZE - 1)))
89 
90 /*
91  * Need to keep a reference count of how many processes have the driver
92  * open to prevent it from disappearing.
93  */
94 uint64_t physmem_vnodecnt;
95 kmutex_t physmem_mutex;		/* protects phsymem_vnodecnt */
96 
97 static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
98     uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
99     enum seg_rw rw, struct cred *cr, caller_context_t *ct);
100 
101 static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
102     caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
103     struct cred *cred, caller_context_t *ct);
104 
105 static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
106     caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
107     struct cred *cred, caller_context_t *ct);
108 
109 static void physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct);
110 
111 const fs_operation_def_t physmem_vnodeops_template[] = {
112 	VOPNAME_GETPAGE,	{ .vop_getpage = physmem_getpage },
113 	VOPNAME_ADDMAP,		{ .vop_addmap = physmem_addmap },
114 	VOPNAME_DELMAP,		{ .vop_delmap = physmem_delmap },
115 	VOPNAME_INACTIVE,	{ .vop_inactive = physmem_inactive },
116 	NULL,			NULL
117 };
118 
119 vnodeops_t *physmem_vnodeops = NULL;
120 
121 /*
122  * Removes the current process from the hash if the process has no more
123  * physmem segments active.
124  */
125 void
126 physmem_remove_hash_proc()
127 {
128 	int index;
129 	struct physmem_proc_hash **walker;
130 	struct physmem_proc_hash *victim = NULL;
131 
132 	index = PHYSMEM_HASH(curproc);
133 	rw_enter(&pph_rwlock, RW_WRITER);
134 	walker = &pph[index];
135 	while (*walker != NULL) {
136 		if ((*walker)->pph_proc == curproc &&
137 		    (*walker)->pph_hash == NULL) {
138 			victim = *walker;
139 			*walker = victim->pph_next;
140 			break;
141 		}
142 		walker = &((*walker)->pph_next);
143 	}
144 	rw_exit(&pph_rwlock);
145 	if (victim != NULL)
146 		kmem_free(victim, sizeof (struct physmem_proc_hash));
147 }
148 
149 /*
150  * Add a new entry to the hash for the given process to cache the
151  * address ranges that it is working on.  If this is the first hash
152  * item to be added for this process, we will create the head pointer
153  * for this process.
154  * Returns 0 on success, ERANGE when the physical address is already in the
155  * hash.
156  */
157 int
158 physmem_add_hash(struct physmem_hash *php)
159 {
160 	int index;
161 	struct physmem_proc_hash *iterator;
162 	struct physmem_proc_hash *newp = NULL;
163 	struct physmem_hash *temp;
164 	int ret = 0;
165 
166 	index = PHYSMEM_HASH(curproc);
167 
168 insert:
169 	rw_enter(&pph_rwlock, RW_WRITER);
170 	iterator = pph[index];
171 	while (iterator != NULL) {
172 		if (iterator->pph_proc == curproc) {
173 			/*
174 			 * check to make sure a single process does not try to
175 			 * map the same region twice.
176 			 */
177 			for (temp = iterator->pph_hash; temp != NULL;
178 			    temp = temp->ph_next) {
179 				if ((php->ph_base_pa >= temp->ph_base_pa &&
180 				    php->ph_base_pa < temp->ph_base_pa +
181 				    temp->ph_seg_len) ||
182 				    (temp->ph_base_pa >= php->ph_base_pa &&
183 				    temp->ph_base_pa < php->ph_base_pa +
184 				    php->ph_seg_len)) {
185 					ret = ERANGE;
186 					break;
187 				}
188 			}
189 			if (ret == 0) {
190 				php->ph_next = iterator->pph_hash;
191 				iterator->pph_hash = php;
192 			}
193 			rw_exit(&pph_rwlock);
194 			/* Need to check for two threads in sync */
195 			if (newp != NULL)
196 				kmem_free(newp, sizeof (*newp));
197 			return (ret);
198 		}
199 		iterator = iterator->pph_next;
200 	}
201 
202 	if (newp != NULL) {
203 		newp->pph_proc = curproc;
204 		newp->pph_next = pph[index];
205 		newp->pph_hash = php;
206 		php->ph_next = NULL;
207 		pph[index] = newp;
208 		rw_exit(&pph_rwlock);
209 		return (0);
210 	}
211 
212 	rw_exit(&pph_rwlock);
213 	/* Dropped the lock so we could use KM_SLEEP */
214 	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
215 	goto insert;
216 }
217 
218 /*
219  * Will return the pointer to the physmem_hash struct if the setup routine
220  * has previously been called for this memory.
221  * Returns NULL on failure.
222  */
223 struct physmem_hash *
224 physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
225 {
226 	int index;
227 	struct physmem_proc_hash *proc_hp;
228 	struct physmem_hash *php;
229 
230 	ASSERT(rw_lock_held(&pph_rwlock));
231 
232 	index = PHYSMEM_HASH(procp);
233 	proc_hp = pph[index];
234 	while (proc_hp != NULL) {
235 		if (proc_hp->pph_proc == procp) {
236 			php = proc_hp->pph_hash;
237 			while (php != NULL) {
238 				if ((req_paddr >= php->ph_base_pa) &&
239 				    (req_paddr + len <=
240 				    php->ph_base_pa + php->ph_seg_len)) {
241 					return (php);
242 				}
243 				php = php->ph_next;
244 			}
245 		}
246 		proc_hp = proc_hp->pph_next;
247 	}
248 	return (NULL);
249 }
250 
251 int
252 physmem_validate_cookie(uint64_t p_cookie)
253 {
254 	int index;
255 	struct physmem_proc_hash *proc_hp;
256 	struct physmem_hash *php;
257 
258 	ASSERT(rw_lock_held(&pph_rwlock));
259 
260 	index = PHYSMEM_HASH(curproc);
261 	proc_hp = pph[index];
262 	while (proc_hp != NULL) {
263 		if (proc_hp->pph_proc == curproc) {
264 			php = proc_hp->pph_hash;
265 			while (php != NULL) {
266 				if ((uint64_t)(uintptr_t)php == p_cookie) {
267 					return (1);
268 				}
269 				php = php->ph_next;
270 			}
271 		}
272 		proc_hp = proc_hp->pph_next;
273 	}
274 	return (0);
275 }
276 
277 /*
278  * Remove the given vnode from the pph hash.  If it exists in the hash the
279  * process still has to be around as the vnode is obviously still around and
280  * since it's a physmem vnode, it must be in the hash.
281  * If it is not in the hash that must mean that the setup ioctl failed.
282  * Return 0 in this instance, 1 if it is in the hash.
283  */
284 int
285 physmem_remove_vnode_hash(vnode_t *vp)
286 {
287 	int index;
288 	struct physmem_proc_hash *proc_hp;
289 	struct physmem_hash **phpp;
290 	struct physmem_hash *victim;
291 
292 	index = PHYSMEM_HASH(curproc);
293 	/* synchronize with the map routine */
294 	rw_enter(&pph_rwlock, RW_WRITER);
295 	proc_hp = pph[index];
296 	while (proc_hp != NULL) {
297 		if (proc_hp->pph_proc == curproc) {
298 			phpp = &proc_hp->pph_hash;
299 			while (*phpp != NULL) {
300 				if ((*phpp)->ph_vnode == vp) {
301 					victim = *phpp;
302 					*phpp = victim->ph_next;
303 
304 					rw_exit(&pph_rwlock);
305 					kmem_free(victim, sizeof (*victim));
306 					return (1);
307 				}
308 				phpp = &(*phpp)->ph_next;
309 			}
310 		}
311 		proc_hp = proc_hp->pph_next;
312 	}
313 	rw_exit(&pph_rwlock);
314 
315 	/* not found */
316 	return (0);
317 }
318 
319 int
320 physmem_setup_vnops()
321 {
322 	int error;
323 	char *name = "physmem";
324 	if (physmem_vnodeops != NULL)
325 		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
326 	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
327 	if (error != 0) {
328 		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
329 	}
330 	return (error);
331 }
332 
333 /*
334  * The guts of the PHYSMEM_SETUP ioctl.
335  * Create a segment in the address space with the specified parameters.
336  * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
337  * We do not do bounds checking on the requested physical addresses, if they
338  * do not exist in the system, they will not be mappable.
339  * Returns 0 on success with the following error codes on failure:
340  *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
341  *		non-NULL or the system was unable to find enough VA space for
342  *		the desired length if user_va was NULL>
343  *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
344  */
345 int
346 physmem_setup_addrs(struct physmem_setup_param *pspp)
347 {
348 	struct as *as = curproc->p_as;
349 	struct segvn_crargs vn_a;
350 	int ret = 0;
351 	uint64_t base_pa;
352 	size_t len;
353 	caddr_t uvaddr;
354 	struct vnode *vp;
355 	struct physmem_hash *php;
356 
357 	ASSERT(pspp != NULL);
358 	base_pa = pspp->req_paddr;
359 	len = pspp->len;
360 	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;
361 
362 	/* Sanity checking */
363 	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
364 		return (EINVAL);
365 	if (!IS_P2ALIGNED(len, PAGESIZE))
366 		return (EINVAL);
367 	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
368 		return (EINVAL);
369 
370 	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);
371 
372 	/* Need to bump vnode count so that the driver can not be unloaded */
373 	mutex_enter(&physmem_mutex);
374 	physmem_vnodecnt++;
375 	mutex_exit(&physmem_mutex);
376 
377 	vp = vn_alloc(KM_SLEEP);
378 	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
379 	vn_setops(vp, physmem_vnodeops);
380 
381 	php->ph_vnode = vp;
382 
383 	vn_a.vp = vp;
384 	vn_a.offset = (u_offset_t)base_pa;
385 	vn_a.type = MAP_SHARED;
386 	vn_a.prot = PROT_ALL;
387 	vn_a.maxprot = PROT_ALL;
388 	vn_a.flags = 0;
389 	vn_a.cred = NULL;
390 	vn_a.amp = NULL;
391 	vn_a.szc = 0;
392 	vn_a.lgrp_mem_policy_flags = 0;
393 
394 	as_rangelock(as);
395 	if (uvaddr != NULL) {
396 		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
397 			ret = ENOMEM;
398 fail:
399 			as_rangeunlock(as);
400 			vn_free(vp);
401 			kmem_free(php, sizeof (*php));
402 			mutex_enter(&physmem_mutex);
403 			physmem_vnodecnt--;
404 			mutex_exit(&physmem_mutex);
405 			return (ret);
406 		}
407 	} else {
408 		/* We pick the address for the user */
409 		map_addr(&uvaddr, len, 0, 1, 0);
410 		if (uvaddr == NULL) {
411 			ret = ENOMEM;
412 			goto fail;
413 		}
414 	}
415 	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);
416 
417 	if (ret == 0) {
418 		as_rangeunlock(as);
419 		php->ph_base_pa = base_pa;
420 		php->ph_base_va = uvaddr;
421 		php->ph_seg_len = len;
422 		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
423 		pspp->cookie = (uint64_t)(uintptr_t)php;
424 		ret = physmem_add_hash(php);
425 		if (ret == 0)
426 			return (0);
427 
428 		/* Note that the call to as_unmap will free the vnode */
429 		(void) as_unmap(as, uvaddr, len);
430 		kmem_free(php, sizeof (*php));
431 		return (ret);
432 	}
433 
434 	goto fail;
435 	/*NOTREACHED*/
436 }
437 
438 /*
439  * The guts of the PHYSMEM_MAP ioctl.
440  * Map the given PA to the appropriate VA if PHYSMEM_SETUP ioctl has already
441  * been called for this PA range.
442  * Returns 0 on success with the following error codes on failure:
443  *	EPERM - The requested page is long term locked, and thus repeated
444  *		requests to allocate this page will likely fail.
445  *	EAGAIN - The requested page could not be allocated, but it is believed
446  *		that future attempts could succeed.
447  *	ENOMEM - There was not enough free memory in the system to safely
448  *		map the requested page.
449  *	EINVAL - The requested paddr was not PAGESIZE aligned or the
450  *		PHYSMEM_SETUP ioctl was not called for this page.
451  *	ENOENT - The requested page was iniside the kernel cage, and the
452  *		PHYSMEM_CAGE flag was not set.
453  *	EBUSY - The requested page is retired and the PHYSMEM_RETIRE flag
454  *		was not set.
455  */
456 static int
457 physmem_map_addrs(struct physmem_map_param *pmpp)
458 {
459 	caddr_t uvaddr;
460 	page_t *pp;
461 	uint64_t req_paddr;
462 	struct vnode *vp;
463 	int ret = 0;
464 	struct physmem_hash *php;
465 	uint_t flags = 0;
466 
467 	ASSERT(pmpp != NULL);
468 	req_paddr = pmpp->req_paddr;
469 
470 	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
471 		return (EINVAL);
472 	/* Find the vnode for this map request */
473 	rw_enter(&pph_rwlock, RW_READER);
474 	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
475 	if (php == NULL) {
476 		rw_exit(&pph_rwlock);
477 		return (EINVAL);
478 	}
479 	vp = php->ph_vnode;
480 	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
481 	rw_exit(&pph_rwlock);
482 
483 	pp = page_numtopp_nolock(btop((size_t)req_paddr));
484 	if (pp == NULL) {
485 		pmpp->ret_va = NULL;
486 		return (EPERM);
487 	}
488 
489 	/*
490 	 * Check to see if page already mapped correctly.  This can happen
491 	 * when we failed to capture a page previously and it was captured
492 	 * asynchronously for us.  Return success in this case.
493 	 */
494 	if (pp->p_vnode == vp) {
495 		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
496 		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
497 		return (0);
498 	}
499 
500 	/*
501 	 * physmem should be responsible for checking for cage
502 	 * and prom pages.
503 	 */
504 	if (pmpp->flags & PHYSMEM_CAGE)
505 		flags = CAPTURE_GET_CAGE;
506 	if (pmpp->flags & PHYSMEM_RETIRED)
507 		flags |= CAPTURE_GET_RETIRED;
508 
509 	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);
510 
511 	if (ret != 0) {
512 		pmpp->ret_va = NULL;
513 		return (ret);
514 	} else {
515 		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
516 		return (0);
517 	}
518 }
519 
520 /*
521  * Map the given page into the process's address space if possible.
522  * We actually only hash the page in on the correct vnode as the page
523  * will be mapped via segvn_pagefault.
524  * returns 0 on success
525  * returns 1 if there is no need to map this page anymore (process exited)
526  * returns -1 if we failed to map the page.
527  */
528 int
529 map_page_proc(page_t *pp, void *arg, uint_t flags)
530 {
531 	struct vnode *vp;
532 	proc_t *procp = (proc_t *)arg;
533 	int ret;
534 	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
535 	struct physmem_hash *php;
536 
537 	ASSERT(pp != NULL);
538 
539 	/*
540 	 * Check against availrmem to make sure that we're not low on memory.
541 	 * We check again here as ASYNC requests do not do this check elsewhere.
542 	 * We return 1 as we don't want the page to have the PR_CAPTURE bit
543 	 * set or be on the page capture hash.
544 	 */
545 	if (swapfs_minfree > availrmem + 1) {
546 		page_free(pp, 1);
547 		return (1);
548 	}
549 
550 	/*
551 	 * If this is an asynchronous request for the current process,
552 	 * we can not map the page as it's possible that we are also in the
553 	 * process of unmapping the page which could result in a deadlock
554 	 * with the as lock.
555 	 */
556 	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
557 		page_free(pp, 1);
558 		return (-1);
559 	}
560 
561 	/* only return zeroed out pages */
562 	pagezero(pp, 0, PAGESIZE);
563 
564 	rw_enter(&pph_rwlock, RW_READER);
565 	php = physmem_get_hash(paddr, PAGESIZE, procp);
566 	if (php == NULL) {
567 		rw_exit(&pph_rwlock);
568 		/*
569 		 * Free the page as there is no longer a valid outstanding
570 		 * request for this page.
571 		 */
572 		page_free(pp, 1);
573 		return (1);
574 	}
575 
576 	vp = php->ph_vnode;
577 
578 	/*
579 	 * We need to protect against a possible deadlock here where we own
580 	 * the vnode page hash mutex and want to acquire it again as there
581 	 * are locations in the code, where we unlock a page while holding
582 	 * the mutex which can lead to the page being captured and eventually
583 	 * end up here.
584 	 */
585 	if (mutex_owned(page_vnode_mutex(vp))) {
586 		rw_exit(&pph_rwlock);
587 		page_free(pp, 1);
588 		return (-1);
589 	}
590 
591 	ret = page_hashin(pp, vp, paddr, NULL);
592 	rw_exit(&pph_rwlock);
593 	if (ret == 0) {
594 		page_free(pp, 1);
595 		return (-1);
596 	}
597 
598 	page_downgrade(pp);
599 
600 	mutex_enter(&freemem_lock);
601 	availrmem--;
602 	mutex_exit(&freemem_lock);
603 
604 	return (0);
605 }
606 
607 /*
608  * The guts of the PHYSMEM_DESTROY ioctl.
609  * The cookie passed in will provide all of the information needed to
610  * free up the address space and physical memory associated with the
611  * corresponding PHSYMEM_SETUP ioctl.
612  * Returns 0 on success with the following error codes on failure:
613  *	EINVAL - The cookie supplied is not valid.
614  */
615 int
616 physmem_destroy_addrs(uint64_t p_cookie)
617 {
618 	struct as *as = curproc->p_as;
619 	size_t len;
620 	caddr_t uvaddr;
621 
622 	rw_enter(&pph_rwlock, RW_READER);
623 	if (physmem_validate_cookie(p_cookie) == 0) {
624 		rw_exit(&pph_rwlock);
625 		return (EINVAL);
626 	}
627 
628 	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
629 	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
630 	rw_exit(&pph_rwlock);
631 
632 	(void) as_unmap(as, uvaddr, len);
633 
634 	return (0);
635 }
636 
637 /*
638  * If the page has been hashed into the physmem vnode, then just look it up
639  * and return it via pl, otherwise return ENOMEM as the map ioctl has not
640  * succeeded on the given page.
641  */
642 /*ARGSUSED*/
643 static int
644 physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
645     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
646     struct cred *cr, caller_context_t *ct)
647 {
648 	page_t *pp;
649 
650 	ASSERT(len == PAGESIZE);
651 	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));
652 
653 	/*
654 	 * If the page is in the hash, then we successfully claimed this
655 	 * page earlier, so return it to the caller.
656 	 */
657 	pp = page_lookup(vp, off, SE_SHARED);
658 	if (pp != NULL) {
659 		pl[0] = pp;
660 		pl[1] = NULL;
661 		*protp = PROT_ALL;
662 		return (0);
663 	}
664 	return (ENOMEM);
665 }
666 
667 /*
668  * We can not allow a process mapping /dev/physmem pages to fork as there can
669  * only be a single mapping to a /dev/physmem page at a given time.  Thus, the
670  * return of EINVAL when we are not working on our own address space.
671  * Otherwise we return zero as this function is required for normal operation.
672  */
673 /*ARGSUSED*/
674 static int
675 physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
676     caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
677     struct cred *cred, caller_context_t *ct)
678 {
679 	if (curproc->p_as != as) {
680 		return (EINVAL);
681 	}
682 	return (0);
683 }
684 
685 /* Will always get called for removing a whole segment. */
686 /*ARGSUSED*/
687 static int
688 physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
689     caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
690     struct cred *cred, caller_context_t *ct)
691 {
692 	/*
693 	 * Release our hold on the vnode so that the final VN_RELE will
694 	 * call physmem_inactive to clean things up.
695 	 */
696 	VN_RELE(vp);
697 
698 	return (0);
699 }
700 
701 /*
702  * Clean up all the pages belonging to this vnode and then free it.
703  */
704 /*ARGSUSED*/
705 static void
706 physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct)
707 {
708 	page_t *pp;
709 
710 	/*
711 	 * Remove the vnode from the hash now, to prevent asynchronous
712 	 * attempts to map into this vnode.  This avoids a deadlock
713 	 * where two threads try to get into this logic at the same
714 	 * time and try to map the pages they are destroying into the
715 	 * other's address space.
716 	 * If it's not in the hash, just free it.
717 	 */
718 	if (physmem_remove_vnode_hash(vp) == 0) {
719 		ASSERT(vp->v_pages == NULL);
720 		vn_free(vp);
721 		physmem_remove_hash_proc();
722 		mutex_enter(&physmem_mutex);
723 		physmem_vnodecnt--;
724 		mutex_exit(&physmem_mutex);
725 		return;
726 	}
727 
728 	/*
729 	 * At this point in time, no other logic can be adding or removing
730 	 * pages from the vnode, otherwise the v_pages list could be inaccurate.
731 	 */
732 
733 	while ((pp = vp->v_pages) != NULL) {
734 		page_t *rpp;
735 		if (page_tryupgrade(pp)) {
736 			/*
737 			 * set lckcnt for page_destroy to do availrmem
738 			 * accounting
739 			 */
740 			pp->p_lckcnt = 1;
741 			page_destroy(pp, 0);
742 		} else {
743 			/* failure to lock should be transient */
744 			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
745 			if (rpp != pp) {
746 				page_unlock(rpp);
747 				continue;
748 			}
749 			page_unlock(pp);
750 		}
751 	}
752 	vn_free(vp);
753 	physmem_remove_hash_proc();
754 	mutex_enter(&physmem_mutex);
755 	physmem_vnodecnt--;
756 	mutex_exit(&physmem_mutex);
757 }
758 
759 /*ARGSUSED*/
760 static int
761 physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
762     int *rvalp)
763 {
764 	int ret;
765 
766 	switch (cmd) {
767 	case PHYSMEM_SETUP:
768 		{
769 			struct physmem_setup_param psp;
770 			if (ddi_copyin((void *)arg, &psp,
771 			    sizeof (struct physmem_setup_param), 0))
772 				return (EFAULT);
773 			ret = physmem_setup_addrs(&psp);
774 			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
775 				return (EFAULT);
776 		}
777 		break;
778 	case PHYSMEM_MAP:
779 		{
780 			struct physmem_map_param pmp;
781 			if (ddi_copyin((void *)arg, &pmp,
782 			    sizeof (struct physmem_map_param), 0))
783 				return (EFAULT);
784 			ret = physmem_map_addrs(&pmp);
785 			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
786 				return (EFAULT);
787 		}
788 		break;
789 	case PHYSMEM_DESTROY:
790 		{
791 			uint64_t cookie;
792 			if (ddi_copyin((void *)arg, &cookie,
793 			    sizeof (uint64_t), 0))
794 				return (EFAULT);
795 			ret = physmem_destroy_addrs(cookie);
796 		}
797 		break;
798 	default:
799 		return (ENOTSUP);
800 	}
801 	return (ret);
802 }
803 
804 /*ARGSUSED*/
805 static int
806 physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
807 {
808 	int ret;
809 	static int msg_printed = 0;
810 
811 	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
812 		return (EINVAL);
813 	}
814 
815 	/* need to make sure we have the right privileges */
816 	if ((ret = secpolicy_resource(credp)) != 0)
817 		return (ret);
818 	if ((ret = secpolicy_lock_memory(credp)) != 0)
819 		return (ret);
820 
821 	if (msg_printed == 0) {
822 		cmn_err(CE_NOTE, "!driver has been opened. This driver may "
823 		    "take out long term locks on pages which may impact "
824 		    "dynamic reconfiguration events");
825 		msg_printed = 1;
826 	}
827 
828 	return (0);
829 }
830 
831 /*ARGSUSED*/
832 static int
833 physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
834 {
835 	return (0);
836 }
837 
838 /*ARGSUSED*/
839 static int
840 physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
841     void *arg, void **resultp)
842 {
843 	switch (infocmd) {
844 	case DDI_INFO_DEVT2DEVINFO:
845 		*resultp = physmem_dip;
846 		return (DDI_SUCCESS);
847 
848 	case DDI_INFO_DEVT2INSTANCE:
849 		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
850 		return (DDI_SUCCESS);
851 
852 	default:
853 		return (DDI_FAILURE);
854 	}
855 }
856 
857 static int
858 physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
859 {
860 	int i;
861 
862 	if (cmd == DDI_RESUME) {
863 		return (DDI_SUCCESS);
864 	}
865 
866 	if (cmd != DDI_ATTACH)
867 		return (DDI_FAILURE);
868 
869 	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
870 	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
871 		return (DDI_FAILURE);
872 
873 	physmem_dip = dip;
874 
875 	/* Initialize driver specific data */
876 	if (physmem_setup_vnops()) {
877 		ddi_remove_minor_node(dip, ddi_get_name(dip));
878 		return (DDI_FAILURE);
879 	}
880 
881 	for (i = 0; i < PPH_SIZE; i++)
882 		pph[i] = NULL;
883 
884 	page_capture_register_callback(PC_PHYSMEM, 10000,
885 	    map_page_proc);
886 
887 	return (DDI_SUCCESS);
888 }
889 
890 static int
891 physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
892 {
893 	int ret = DDI_SUCCESS;
894 
895 	if (cmd == DDI_SUSPEND) {
896 		return (DDI_SUCCESS);
897 	}
898 
899 	if (cmd != DDI_DETACH)
900 		return (DDI_FAILURE);
901 
902 	ASSERT(physmem_dip == dip);
903 
904 	mutex_enter(&physmem_mutex);
905 	if (physmem_vnodecnt == 0) {
906 		if (physmem_vnodeops != NULL) {
907 			vn_freevnodeops(physmem_vnodeops);
908 			physmem_vnodeops = NULL;
909 			page_capture_unregister_callback(PC_PHYSMEM);
910 		}
911 	} else {
912 		ret = EBUSY;
913 	}
914 	mutex_exit(&physmem_mutex);
915 	if (ret == DDI_SUCCESS)
916 		ddi_remove_minor_node(dip, ddi_get_name(dip));
917 	return (ret);
918 }
919 
920 static struct cb_ops physmem_cb_ops = {
921 	physmem_open,	/* open */
922 	physmem_close,	/* close */
923 	nodev,		/* strategy */
924 	nodev,		/* print */
925 	nodev,		/* dump */
926 	nodev,		/* read */
927 	nodev,		/* write */
928 	physmem_ioctl,	/* ioctl */
929 	nodev,		/* devmap */
930 	nodev,		/* mmap */
931 	nodev,		/* segmap */
932 	nochpoll,	/* chpoll */
933 	ddi_prop_op,	/* prop_op */
934 	NULL,		/* cb_str */
935 	D_NEW | D_MP | D_DEVMAP,
936 	CB_REV,
937 	NULL,
938 	NULL
939 };
940 
941 static struct dev_ops physmem_ops = {
942 	DEVO_REV,
943 	0,
944 	physmem_getinfo,
945 	nulldev,
946 	nulldev,
947 	physmem_attach,
948 	physmem_detach,
949 	nodev,
950 	&physmem_cb_ops,
951 	NULL,
952 	NULL
953 };
954 
955 static struct modldrv modldrv = {
956 	&mod_driverops,
957 	"physmem driver %I%",
958 	&physmem_ops
959 };
960 
961 static struct modlinkage modlinkage = {
962 	MODREV_1,
963 	&modldrv,
964 	NULL
965 };
966 
967 int
968 _init(void)
969 {
970 	return (mod_install(&modlinkage));
971 }
972 
973 int
974 _info(struct modinfo *modinfop)
975 {
976 	return (mod_info(&modlinkage, modinfop));
977 }
978 
979 int
980 _fini(void)
981 {
982 	return (mod_remove(&modlinkage));
983 }
984