xref: /illumos-gate/usr/src/uts/common/io/mem.c (revision f3041bfa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright (c) 2016 by Delphix. All rights reserved.
25  */
26 
27 /*
28  * Copyright 2017 Joyent, Inc.
29  * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
30  */
31 
32 /*
33  * Memory special file
34  */
35 
36 #include <sys/types.h>
37 #include <sys/param.h>
38 #include <sys/user.h>
39 #include <sys/buf.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/vm.h>
43 #include <sys/uio.h>
44 #include <sys/mman.h>
45 #include <sys/kmem.h>
46 #include <vm/seg.h>
47 #include <vm/page.h>
48 #include <sys/stat.h>
49 #include <sys/vmem.h>
50 #include <sys/memlist.h>
51 #include <sys/bootconf.h>
52 
53 #include <vm/seg_vn.h>
54 #include <vm/seg_dev.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/seg_kp.h>
57 #include <vm/seg_kpm.h>
58 #include <vm/hat.h>
59 
60 #include <sys/conf.h>
61 #include <sys/mem.h>
62 #include <sys/types.h>
63 #include <sys/conf.h>
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/errno.h>
67 #include <sys/modctl.h>
68 #include <sys/memlist.h>
69 #include <sys/ddi.h>
70 #include <sys/sunddi.h>
71 #include <sys/debug.h>
72 #include <sys/fm/protocol.h>
73 
74 #if defined(__sparc)
75 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
76 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
77     uint64_t *, int *, int *, int *);
78 extern size_t cpu_get_name_bufsize(void);
79 extern int cpu_get_mem_sid(char *, char *, int, int *);
80 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
81 #elif defined(__x86)
82 #include <sys/cpu_module.h>
83 #endif	/* __sparc */
84 
85 /*
86  * Turn a byte length into a pagecount.  The DDI btop takes a
87  * 32-bit size on 32-bit machines, this handles 64-bit sizes for
88  * large physical-memory 32-bit machines.
89  */
90 #define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
91 
92 static kmutex_t mm_lock;
93 static caddr_t mm_map;
94 
95 static dev_info_t *mm_dip;	/* private copy of devinfo pointer */
96 
97 static int mm_kmem_io_access;
98 
99 static int mm_kstat_update(kstat_t *ksp, int rw);
100 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
101 
102 static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
103 
104 #define	MM_KMEMLOG_NENTRIES	64
105 
106 static int mm_kmemlogent;
107 static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];
108 
109 /*
110  * On kmem/allmem writes, we log information that might be useful in the event
111  * that a write is errant (that is, due to operator error) and induces a later
112  * problem.  Note that (in particular) in the event of such operator-induced
113  * corruption, a search over the kernel address space for the corrupted
114  * address will yield the ring buffer entry that recorded the write.  And
115  * should it seem baroque or otherwise unnecessary, yes, we need this kind of
116  * auditing facility and yes, we learned that the hard way: disturbingly,
117  * there exist recommendations for "tuning" the system that involve writing to
118  * kernel memory addresses via the kernel debugger, and -- as we discovered --
119  * these can easily be applied incorrectly or unsafely, yielding an entirely
120  * undebuggable "can't happen" kind of panic.
121  */
/*
 * Record an impending kmem/allkmem write in the mm_kmemlog ring buffer:
 * target kernel VA, length, wall-clock and high-resolution timestamps, and
 * the writing process's pid and psargs, so an errant write can later be
 * attributed.  The buffer wraps after MM_KMEMLOG_NENTRIES entries.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	/* Claim the next slot; wrap the index when the buffer fills. */
	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;	/* target kernel VA */
	ent->mle_len = uio->uio_resid;	/* bytes remaining to be written */
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	/*
	 * NOTE(review): strncpy() does not NUL-terminate on truncation;
	 * presumably mle_psargs is sized to hold u_psargs (PSARGSZ) so the
	 * copy always fits -- confirm against mm_logentry_t's definition.
	 */
	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}
146 
/*
 * attach(9E): create the minor nodes for the memory special files,
 * allocate the scratch mapping page, install the "phys_installed" kstat,
 * and read the "kmem_io_access" property.  Returns DDI_SUCCESS or
 * DDI_FAILURE (after removing any minor nodes already created).
 */
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	/* Table of minor nodes to create, with their access policies. */
	struct mem_minor {
		char *name;		/* node name */
		minor_t minor;		/* minor number */
		int privonly;		/* PRIVONLY_DEV or 0 */
		const char *rdpriv;	/* privilege required to read */
		const char *wrpriv;	/* privilege required to write */
		mode_t priv_mode;	/* default file mode */
	} mm[] = {
		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "full",	M_FULL, PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	/* One page of kernel VA for temporary pfn mappings in mmio(). */
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			/* Undo any nodes created so far. */
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	/* Export the physical memory layout as a raw kstat. */
	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	/* Whether /dev/kmem may be used for device (non-memory) I/O. */
	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}
198 
199 /*ARGSUSED*/
200 static int
201 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
202 {
203 	register int error;
204 
205 	switch (infocmd) {
206 	case DDI_INFO_DEVT2DEVINFO:
207 		*result = (void *)mm_dip;
208 		error = DDI_SUCCESS;
209 		break;
210 	case DDI_INFO_DEVT2INSTANCE:
211 		*result = (void *)0;
212 		error = DDI_SUCCESS;
213 		break;
214 	default:
215 		error = DDI_FAILURE;
216 	}
217 	return (error);
218 }
219 
220 /*ARGSUSED1*/
221 static int
222 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
223 {
224 	switch (getminor(*devp)) {
225 	case M_NULL:
226 	case M_ZERO:
227 	case M_FULL:
228 	case M_MEM:
229 	case M_KMEM:
230 	case M_ALLKMEM:
231 		/* standard devices */
232 		break;
233 
234 	default:
235 		/* Unsupported or unknown type */
236 		return (EINVAL);
237 	}
238 	/* must be character device */
239 	if (typ != OTYP_CHR)
240 		return (EINVAL);
241 	return (0);
242 }
243 
244 struct pollhead	mm_pollhd;
245 
/*
 * chpoll(9E): the memory devices are always "ready", so report every
 * requested r/w event as pending.  Unknown minors do not support polling.
 */
/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non NULL pollhead pointer should be returned in case
		 * user polls for 0 events or is doing an edge-triggered poll.
		 */
		if ((!*reventsp && !anyyet) || (events & POLLET)) {
			*phpp = &mm_pollhd;
		}
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}
273 
274 static int
275 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
276     char *name, caddr_t valuep, int *lengthp)
277 {
278 	/*
279 	 * implement zero size to reduce overhead (avoid two failing
280 	 * property lookups per stat).
281 	 */
282 	return (ddi_prop_op_size(dev, dip, prop_op,
283 	    flags, name, valuep, lengthp, 0));
284 }
285 
/*
 * Transfer at most one page of data between the user buffer described by
 * `uio' and physical page `pfn', starting `pageoff' bytes into the page.
 * `pp' is the caller's locked page_t for the pfn, or NULL.  For pfns that
 * are not system memory (device registers), the transfer is attempted only
 * when `allowio' is set, via ddi_peekpokeio() so bus faults are caught.
 *
 * Returns 0 on success, EFAULT on a faulting transfer, or EIO when device
 * I/O is not permitted.
 */
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;	/* set when we fell back to hat_devload() */
	int is_memory = pf_is_memory(pfn);
	/* Clamp the transfer to the end of the page and of the iovec. */
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	/* mm_lock serializes use of the shared mm_map scratch mapping. */
	mutex_enter(&mm_lock);

	/* Prefer a kpm mapping when the pfn is memory and kpm is enabled. */
	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	/*
	 * Otherwise load a temporary translation at mm_map.  Writable only
	 * for writes; NOCONSIST because the pfn may have no page_t.
	 */
	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			/* Device memory: peek/poke catches bus errors. */
			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	/* Tear down whichever mapping was established above. */
	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}
337 
338 static int
339 mmpagelock(struct as *as, caddr_t va)
340 {
341 	struct seg *seg;
342 	int i;
343 
344 	AS_LOCK_ENTER(as, RW_READER);
345 	seg = as_segat(as, va);
346 	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
347 	AS_LOCK_EXIT(as);
348 
349 	return (i);
350 }
351 
#ifdef	__sparc

/*
 * On sparc, a kernel VA may need to be locked down (via as_pagelock())
 * before mmio() can safely access it; ask the owning segment.
 */
#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

/* On x86 no such page locking is required. */
#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */
361 
/*
 * Common read/write engine for all memory minors, iterating over the
 * uio's iovecs:
 *
 *	M_MEM		physical memory, validated against phys_install
 *	M_KMEM/M_ALLKMEM kernel virtual memory (allkmem also allows
 *			device, i.e. non-memory, pfns)
 *	M_FULL		reads as zeros, writes fail with ENOSPC
 *	M_ZERO		reads as zeros, writes are discarded
 *	M_NULL		reads return EOF, writes are discarded
 *
 * Returns 0 if any data was transferred, otherwise the first error.
 */
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;	/* to detect partial transfer */
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			/* Skip exhausted iovecs. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			/* Only allow offsets backed by installed memory. */
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
			{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			/* Give the platform a chance to handle the I/O. */
			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			/* Audit all kmem writes (see mm_logkmem()). */
			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure. Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context so we avoid it
			 * using the try_lock set check above.  Some day when
			 * the kernel page locking gets redesigned all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			/* Translate the kernel VA to a pfn for mmio(). */
			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			/* allkmem (or the property) permits device pfns. */
			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
			}

			break;

		case M_FULL:
			if (rw == UIO_WRITE) {
				/* /dev/full: writes always fail. */
				error = ENOSPC;
				break;
			}
			/* else it's a read, fall through to zero case */
			/*FALLTHROUGH*/

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				/* Catch faults while zeroing user memory. */
				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);	/* reads hit EOF at once */
			/* Discard the write: consume the whole iovec. */
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	/* Report success if anything transferred before the error. */
	return (uio->uio_resid == oresid ? error : 0);
}
487 
/* read(9E): all transfer logic is shared with writes in mmrw(). */
static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}
493 
/* write(9E): all transfer logic is shared with reads in mmrw(). */
static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}
499 
500 /*
501  * Private ioctl for libkvm to support kvm_physaddr().
502  * Given an address space and a VA, compute the PA.
503  */
504 static int
505 mmioctl_vtop(intptr_t data)
506 {
507 #ifdef _SYSCALL32
508 	mem_vtop32_t vtop32;
509 #endif
510 	mem_vtop_t mem_vtop;
511 	proc_t *p;
512 	pfn_t pfn = (pfn_t)PFN_INVALID;
513 	pid_t pid = 0;
514 	struct as *as;
515 	struct seg *seg;
516 
517 	if (get_udatamodel() == DATAMODEL_NATIVE) {
518 		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
519 			return (EFAULT);
520 	}
521 #ifdef _SYSCALL32
522 	else {
523 		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
524 			return (EFAULT);
525 		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
526 		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;
527 
528 		if (mem_vtop.m_as != NULL)
529 			return (EINVAL);
530 	}
531 #endif
532 
533 	if (mem_vtop.m_as == &kas) {
534 		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
535 	} else {
536 		if (mem_vtop.m_as == NULL) {
537 			/*
538 			 * Assume the calling process's address space if the
539 			 * caller didn't specify one.
540 			 */
541 			p = curthread->t_procp;
542 			if (p == NULL)
543 				return (EIO);
544 			mem_vtop.m_as = p->p_as;
545 		}
546 
547 		mutex_enter(&pidlock);
548 		for (p = practive; p != NULL; p = p->p_next) {
549 			if (p->p_as == mem_vtop.m_as) {
550 				pid = p->p_pid;
551 				break;
552 			}
553 		}
554 		mutex_exit(&pidlock);
555 		if (p == NULL)
556 			return (EIO);
557 		p = sprlock(pid);
558 		if (p == NULL)
559 			return (EIO);
560 		as = p->p_as;
561 		if (as == mem_vtop.m_as) {
562 			mutex_exit(&p->p_lock);
563 			AS_LOCK_ENTER(as, RW_READER);
564 			for (seg = AS_SEGFIRST(as); seg != NULL;
565 			    seg = AS_SEGNEXT(as, seg))
566 				if ((uintptr_t)mem_vtop.m_va -
567 				    (uintptr_t)seg->s_base < seg->s_size)
568 					break;
569 			if (seg != NULL)
570 				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
571 			AS_LOCK_EXIT(as);
572 			mutex_enter(&p->p_lock);
573 		}
574 		sprunlock(p);
575 	}
576 	mem_vtop.m_pfn = pfn;
577 	if (pfn == PFN_INVALID)
578 		return (EIO);
579 
580 	if (get_udatamodel() == DATAMODEL_NATIVE) {
581 		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
582 			return (EFAULT);
583 	}
584 #ifdef _SYSCALL32
585 	else {
586 		vtop32.m_pfn = mem_vtop.m_pfn;
587 		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
588 			return (EFAULT);
589 	}
590 #endif
591 
592 	return (0);
593 }
594 
595 /*
596  * Given a PA, execute the given page retire command on it.
597  */
598 static int
599 mmioctl_page_retire(int cmd, intptr_t data)
600 {
601 	extern int page_retire_test(void);
602 	uint64_t pa;
603 
604 	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
605 		return (EFAULT);
606 	}
607 
608 	switch (cmd) {
609 	case MEM_PAGE_ISRETIRED:
610 		return (page_retire_check(pa, NULL));
611 
612 	case MEM_PAGE_UNRETIRE:
613 		return (page_unretire(pa));
614 
615 	case MEM_PAGE_RETIRE:
616 		return (page_retire(pa, PR_FMA));
617 
618 	case MEM_PAGE_RETIRE_MCE:
619 		return (page_retire(pa, PR_MCE));
620 
621 	case MEM_PAGE_RETIRE_UE:
622 		return (page_retire(pa, PR_UE));
623 
624 	case MEM_PAGE_GETERRORS:
625 		{
626 			uint64_t page_errors;
627 			int rc = page_retire_check(pa, &page_errors);
628 			if (copyout(&page_errors, (void *)data,
629 			    sizeof (uint64_t))) {
630 				return (EFAULT);
631 			}
632 			return (rc);
633 		}
634 
635 	case MEM_PAGE_RETIRE_TEST:
636 		return (page_retire_test());
637 
638 	}
639 
640 	return (EINVAL);
641 }
642 
643 #ifdef __sparc
644 /*
645  * Given a syndrome, syndrome type, and address return the
646  * associated memory name in the provided data buffer.
647  */
648 static int
649 mmioctl_get_mem_name(intptr_t data)
650 {
651 	mem_name_t mem_name;
652 	void *buf;
653 	size_t bufsize;
654 	int len, err;
655 
656 	if ((bufsize = cpu_get_name_bufsize()) == 0)
657 		return (ENOTSUP);
658 
659 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
660 		return (err);
661 
662 	buf = kmem_alloc(bufsize, KM_SLEEP);
663 
664 	/*
665 	 * Call into cpu specific code to do the lookup.
666 	 */
667 	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
668 	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
669 		kmem_free(buf, bufsize);
670 		return (err);
671 	}
672 
673 	if (len >= mem_name.m_namelen) {
674 		kmem_free(buf, bufsize);
675 		return (ENOSPC);
676 	}
677 
678 	if (copyoutstr(buf, (char *)mem_name.m_name,
679 	    mem_name.m_namelen, NULL) != 0) {
680 		kmem_free(buf, bufsize);
681 		return (EFAULT);
682 	}
683 
684 	kmem_free(buf, bufsize);
685 	return (0);
686 }
687 
688 /*
689  * Given a syndrome and address return information about the associated memory.
690  */
691 static int
692 mmioctl_get_mem_info(intptr_t data)
693 {
694 	mem_info_t mem_info;
695 	int err;
696 
697 	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
698 		return (EFAULT);
699 
700 	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
701 	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
702 	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
703 		return (err);
704 
705 	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
706 		return (EFAULT);
707 
708 	return (0);
709 }
710 
711 /*
712  * Given a memory name, return its associated serial id
713  */
714 static int
715 mmioctl_get_mem_sid(intptr_t data)
716 {
717 	mem_name_t mem_name;
718 	void *buf;
719 	void *name;
720 	size_t	name_len;
721 	size_t bufsize;
722 	int len, err;
723 
724 	if ((bufsize = cpu_get_name_bufsize()) == 0)
725 		return (ENOTSUP);
726 
727 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
728 		return (err);
729 
730 	buf = kmem_alloc(bufsize, KM_SLEEP);
731 
732 	if (mem_name.m_namelen > 1024)
733 		mem_name.m_namelen = 1024; /* cap at 1024 bytes */
734 
735 	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
736 
737 	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
738 	    mem_name.m_namelen, &name_len)) != 0) {
739 		kmem_free(buf, bufsize);
740 		kmem_free(name, mem_name.m_namelen);
741 		return (err);
742 	}
743 
744 	/*
745 	 * Call into cpu specific code to do the lookup.
746 	 */
747 	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
748 		kmem_free(buf, bufsize);
749 		kmem_free(name, mem_name.m_namelen);
750 		return (err);
751 	}
752 
753 	if (len > mem_name.m_sidlen) {
754 		kmem_free(buf, bufsize);
755 		kmem_free(name, mem_name.m_namelen);
756 		return (ENAMETOOLONG);
757 	}
758 
759 	if (copyoutstr(buf, (char *)mem_name.m_sid,
760 	    mem_name.m_sidlen, NULL) != 0) {
761 		kmem_free(buf, bufsize);
762 		kmem_free(name, mem_name.m_namelen);
763 		return (EFAULT);
764 	}
765 
766 	kmem_free(buf, bufsize);
767 	kmem_free(name, mem_name.m_namelen);
768 	return (0);
769 }
770 #endif	/* __sparc */
771 
772 /*
773  * Private ioctls for
774  *	libkvm to support kvm_physaddr().
775  *	FMA support for page_retire() and memory attribute information.
776  */
777 /*ARGSUSED*/
778 static int
779 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
780 {
781 	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
782 	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
783 		return (ENXIO);
784 
785 	switch (cmd) {
786 	case MEM_VTOP:
787 		return (mmioctl_vtop(data));
788 
789 	case MEM_PAGE_RETIRE:
790 	case MEM_PAGE_ISRETIRED:
791 	case MEM_PAGE_UNRETIRE:
792 	case MEM_PAGE_RETIRE_MCE:
793 	case MEM_PAGE_RETIRE_UE:
794 	case MEM_PAGE_GETERRORS:
795 	case MEM_PAGE_RETIRE_TEST:
796 		return (mmioctl_page_retire(cmd, data));
797 
798 #ifdef __sparc
799 	case MEM_NAME:
800 		return (mmioctl_get_mem_name(data));
801 
802 	case MEM_INFO:
803 		return (mmioctl_get_mem_info(data));
804 
805 	case MEM_SID:
806 		return (mmioctl_get_mem_sid(data));
807 #else
808 	case MEM_NAME:
809 	case MEM_INFO:
810 	case MEM_SID:
811 		return (ENOTSUP);
812 #endif	/* __sparc */
813 	}
814 	return (ENXIO);
815 }
816 
/*
 * mmap(9E): translate an offset on /dev/mem to a page frame number,
 * validating it against the installed physical memory list.  All other
 * minors (and out-of-range offsets) return -1; /dev/zero and /dev/full
 * mappings are expected to have been converted to anonymous memory by
 * mmsegmap() before this is reached.
 */
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		/* Accept only pfns inside some phys_install range. */
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_FULL:
	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}
857 
858 /*
859  * This function is called when a memory device is mmap'ed.
860  * Set up the mapping to the correct device driver.
861  */
862 static int
863 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
864     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
865 {
866 	struct segvn_crargs vn_a;
867 	struct segdev_crargs dev_a;
868 	int error;
869 	minor_t minor;
870 	off_t i;
871 
872 	minor = getminor(dev);
873 
874 	as_rangelock(as);
875 	/*
876 	 * No need to worry about vac alignment on /dev/zero
877 	 * since this is a "clone" object that doesn't yet exist.
878 	 */
879 	error = choose_addr(as, addrp, len, off,
880 	    (minor == M_MEM) || (minor == M_KMEM), flags);
881 	if (error != 0) {
882 		as_rangeunlock(as);
883 		return (error);
884 	}
885 
886 	switch (minor) {
887 	case M_MEM:
888 		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
889 		if ((flags & MAP_TYPE) != MAP_SHARED) {
890 			as_rangeunlock(as);
891 			return (EINVAL);
892 		}
893 
894 		/*
895 		 * Check to ensure that the entire range is
896 		 * legal and we are not trying to map in
897 		 * more than the device will let us.
898 		 */
899 		for (i = 0; i < len; i += PAGESIZE) {
900 			if (mmmmap(dev, off + i, maxprot) == -1) {
901 				as_rangeunlock(as);
902 				return (ENXIO);
903 			}
904 		}
905 
906 		/*
907 		 * Use seg_dev segment driver for /dev/mem mapping.
908 		 */
909 		dev_a.mapfunc = mmmmap;
910 		dev_a.dev = dev;
911 		dev_a.offset = off;
912 		dev_a.type = (flags & MAP_TYPE);
913 		dev_a.prot = (uchar_t)prot;
914 		dev_a.maxprot = (uchar_t)maxprot;
915 		dev_a.hat_attr = 0;
916 
917 		/*
918 		 * Make /dev/mem mappings non-consistent since we can't
919 		 * alias pages that don't have page structs behind them,
920 		 * such as kernel stack pages. If someone mmap()s a kernel
921 		 * stack page and if we give them a tte with cv, a line from
922 		 * that page can get into both pages of the spitfire d$.
923 		 * But snoop from another processor will only invalidate
924 		 * the first page. This later caused kernel (xc_attention)
925 		 * to go into an infinite loop at pil 13 and no interrupts
926 		 * could come in. See 1203630.
927 		 *
928 		 */
929 		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
930 		dev_a.devmap_data = NULL;
931 
932 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
933 		break;
934 
935 	case M_ZERO:
936 		/*
937 		 * Use seg_vn segment driver for /dev/zero mapping.
938 		 * Passing in a NULL amp gives us the "cloning" effect.
939 		 */
940 		vn_a.vp = NULL;
941 		vn_a.offset = 0;
942 		vn_a.type = (flags & MAP_TYPE);
943 		vn_a.prot = prot;
944 		vn_a.maxprot = maxprot;
945 		vn_a.flags = flags & ~MAP_TYPE;
946 		vn_a.cred = cred;
947 		vn_a.amp = NULL;
948 		vn_a.szc = 0;
949 		vn_a.lgrp_mem_policy_flags = 0;
950 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
951 		break;
952 
953 	case M_KMEM:
954 	case M_ALLKMEM:
955 		/* No longer supported with KPR. */
956 		error = ENXIO;
957 		break;
958 
959 	case M_NULL:
960 		/*
961 		 * Use seg_dev segment driver for /dev/null mapping.
962 		 */
963 		dev_a.mapfunc = mmmmap;
964 		dev_a.dev = dev;
965 		dev_a.offset = off;
966 		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
967 		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
968 		dev_a.hat_attr = 0;
969 		dev_a.hat_flags = 0;
970 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
971 		break;
972 
973 	default:
974 		error = ENXIO;
975 	}
976 
977 	as_rangeunlock(as);
978 	return (error);
979 }
980 
/* Character/block entry points for the memory driver. */
static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

/* Driver operations; single instance, no detach support. */
static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,		/* quiesce */
};

/* Loadable module linkage. */
static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
1021 
/* Loadable-module entry points: standard mod_install/mod_info/mod_remove. */
int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}
1039 
1040 static int
1041 mm_kstat_update(kstat_t *ksp, int rw)
1042 {
1043 	struct memlist *pmem;
1044 	uint_t count;
1045 
1046 	if (rw == KSTAT_WRITE)
1047 		return (EACCES);
1048 
1049 	count = 0;
1050 	memlist_read_lock();
1051 	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
1052 		count++;
1053 	}
1054 	memlist_read_unlock();
1055 
1056 	ksp->ks_ndata = count;
1057 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1058 
1059 	return (0);
1060 }
1061 
/*
 * kstat snapshot callback for "phys_installed": copy (address, size)
 * pairs from the phys_install memlist into the caller's buffer, stopping
 * at ks_data_size if the list grew since mm_kstat_update() sized it.
 */
static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	/* Output record layout: one pair per memlist entry. */
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		/* Don't run past the buffer sized by mm_kstat_update(). */
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}
1089 
1090 /*
1091  * Read a mem_name_t from user-space and store it in the mem_name_t
1092  * pointed to by the mem_name argument.
1093  */
1094 static int
1095 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1096 {
1097 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1098 		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1099 			return (EFAULT);
1100 	}
1101 #ifdef	_SYSCALL32
1102 	else {
1103 		mem_name32_t mem_name32;
1104 
1105 		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1106 			return (EFAULT);
1107 		mem_name->m_addr = mem_name32.m_addr;
1108 		mem_name->m_synd = mem_name32.m_synd;
1109 		mem_name->m_type[0] = mem_name32.m_type[0];
1110 		mem_name->m_type[1] = mem_name32.m_type[1];
1111 		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
1112 		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1113 		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
1114 		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1115 	}
1116 #endif	/* _SYSCALL32 */
1117 
1118 	return (0);
1119 }
1120