1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * KVM backend for hypervisor domain dumps.  We don't use libkvm for such
28  * dumps, since they do not have a namelist file or the typical dump structures
29  * we expect to aid bootstrapping.  Instead, we bootstrap based upon a
30  * debug_info structure at a known VA, using the guest's own page tables to
31  * resolve to physical addresses, and construct the namelist in a manner
32  * similar to ksyms_snapshot().
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <strings.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <stddef.h>
41 #include <stdarg.h>
42 #include <unistd.h>
43 #include <fcntl.h>
44 #include <gelf.h>
45 #include <errno.h>
46 
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <sys/debug_info.h>
50 #include <sys/xen_mmu.h>
51 #include <sys/elf.h>
52 #include <sys/machelf.h>
53 #include <sys/modctl.h>
54 #include <sys/kobj.h>
55 #include <sys/kobj_impl.h>
56 #include <sys/sysmacros.h>
57 #include <sys/privmregs.h>
58 #include <vm/as.h>
59 
60 #include <mdb/mdb_io.h>
61 #include <mdb/mdb_kb.h>
62 #include <mdb/mdb_target_impl.h>
63 
64 #include <xen/public/xen.h>
65 
/*
 * Virtual address from which xkb_open() reads the guest's debug_info
 * structure.  The 32-bit build also defines a second address, which
 * xkb_open() currently selects unconditionally on __i386.
 */
#if defined(__i386)
#define	DEF_DEBUG_INFO_VA 0xfb3ff000
#define	PAE_DEBUG_INFO_VA 0xf4bff000
#elif defined(__amd64)
#define	DEF_DEBUG_INFO_VA 0xfffffffffb7ff000
#endif

/*
 * Indices of the section headers in the fabricated namelist
 * (xkb_namelist_t.kh_shdr[]).
 */
#define	XKB_SHDR_NULL 0
#define	XKB_SHDR_SYMTAB 1
#define	XKB_SHDR_STRTAB 2
#define	XKB_SHDR_SHSTRTAB 3
#define	XKB_SHDR_NUM 4

/*
 * Selector bits passed to xkb_walk_syms() to choose what is emitted.  The
 * same values are used as indices into the sizes[] array that accumulates
 * the byte count of each category.
 */
#define	XKB_WALK_LOCAL 0x1
#define	XKB_WALK_GLOBAL 0x2
#define	XKB_WALK_STR 0x4
#define	XKB_WALK_ALL (XKB_WALK_LOCAL | XKB_WALK_GLOBAL | XKB_WALK_STR)

/*
 * Page geometry used throughout this file.  PT_PADDR masks the frame
 * address out of a page table entry and PT_VALID is the bit tested to
 * decide whether an entry is usable (see xkb_pte_to_mfn()).
 */
#define	PAGE_SIZE 0x1000
#define	PAGE_SHIFT 12
#define	PAGE_OFFSET(a) ((a) & (PAGE_SIZE - 1))
#define	PAGE_MASK(a) ((a) & ~(PAGE_SIZE - 1))
#define	PT_PADDR 0x000ffffffffff000ull
#define	PT_VALID 0x1

/*
 * Once the headers are available easily from within ON, we can use those, but
 * until then these definitions are duplicates.
 */

/* Values of xc_core_header_t.xch_magic recognised by xkb_open(). */
#define	XC_CORE_MAGIC 0xF00FEBED
#define	XC_CORE_MAGIC_HVM 0xF00FEBEE

/* Bit tested in a VCPU context's flags to reject HVM guests. */
#define	VGCF_HVM_GUEST (1<<1)
100 
/*
 * Header read from offset 0 of the dump file.  The *_offset members are
 * used in this file as byte offsets into the dump.
 */
typedef struct xc_core_header {
	unsigned int xch_magic;		/* compared against XC_CORE_MAGIC* */
	unsigned int xch_nr_vcpus;	/* number of VCPU contexts */
	unsigned int xch_nr_pages;	/* number of p2m entries / pages */
	unsigned int xch_ctxt_offset;	/* where the VCPU contexts start */
	unsigned int xch_index_offset;	/* where the p2m table starts */
	unsigned int xch_pages_offset;	/* where the page data starts */
} xc_core_header_t;
109 
/*
 * One-entry cache used by xkb_map_mfn(): the MFN most recently requested
 * through this slot and the address at which its page can be read.
 */
typedef struct mfn_map {
	mfn_t mm_mfn;
	char *mm_map;
} mfn_map_t;
114 
/*
 * Page table geometry consumed by xkb_va_to_mfn() and xkb_pte_to_mfn();
 * filled in by xkb_open().
 */
typedef struct mmu_info {
	size_t mi_max;		/* index of the top page table level */
	size_t mi_shift[4];	/* VA shift for each level's index */
	size_t mi_ptes;		/* entries per page table */
	size_t mi_ptesize;	/* size of one entry, in bytes */
} mmu_info_t;
121 
/*
 * Per-dump state created by xkb_open() and released by xkb_close().
 */
typedef struct xkb {
	char *xkb_path;			/* strdup()ed dump file path */
	int xkb_fd;			/* descriptor for the dump file */
	xc_core_header_t xkb_hdr;	/* header read from offset 0 */
	char *xkb_namelist;		/* fabricated ELF namelist image */
	size_t xkb_namesize;		/* size of xkb_namelist in bytes */
	struct vcpu_guest_context *xkb_ctxts;	/* xch_nr_vcpus contexts */
	mfn_t xkb_max_mfn;		/* largest valid MFN seen in the p2m */
	mmu_info_t xkb_mmu;		/* page table geometry */
	char *xkb_pages;		/* all pages mapped, or NULL (windowed) */
	mfn_t *xkb_p2m;			/* PFN-indexed MFNs, inside xkb_p2m_buf */
	void *xkb_p2m_buf;		/* page-aligned mmap holding the p2m */
	xen_pfn_t *xkb_m2p;		/* MFN-indexed PFNs, xkb_max_mfn + 1 long */
	debug_info_t xkb_info;		/* debug_info read from the guest */
	mfn_map_t xkb_pt_map[4];	/* cached page table page per level */
	mfn_map_t xkb_map;		/* cached data page for reads */
} xkb_t;
139 
/*
 * Section name string table for the fabricated namelist.  The sh_name
 * values assigned in xkb_build_ksyms() are byte offsets into this string.
 */
static const char xkb_shstrtab[] = "\0.symtab\0.strtab\0.shstrtab\0";

/*
 * Fixed-size front of the fabricated namelist.  xkb_build_ksyms() places
 * the symbol table and then the string table immediately after it.
 */
typedef struct xkb_namelist {
	Ehdr	kh_elf_hdr;
	Phdr	kh_text_phdr;
	Phdr	kh_data_phdr;
	Shdr	kh_shdr[XKB_SHDR_NUM];
	char	shstrings[sizeof (xkb_shstrtab)];
} xkb_namelist_t;
149 
/*
 * Forward declarations for routines that are referenced before their
 * definitions appear below.
 */
static int xkb_build_ksyms(xkb_t *);
static offset_t xkb_mfn_to_offset(xkb_t *, mfn_t);
static mfn_t xkb_va_to_mfn(xkb_t *, uintptr_t, mfn_t);
static ssize_t xkb_read(xkb_t *, uintptr_t, void *, size_t);
static int xkb_read_word(xkb_t *, uintptr_t, uintptr_t *);
static char *xkb_map_mfn(xkb_t *, mfn_t, mfn_map_t *);
static int xkb_close(xkb_t *);
157 
158 int
159 xkb_identify(const char *file, int *longmode)
160 {
161 	xc_core_header_t header;
162 	size_t sz;
163 	int fd;
164 
165 	if ((fd = open64(file, O_RDONLY)) == -1)
166 		return (-1);
167 
168 	if (pread64(fd, &header, sizeof (header), 0) != sizeof (header)) {
169 		(void) close(fd);
170 		return (0);
171 	}
172 
173 	(void) close(fd);
174 
175 	if (header.xch_magic != XC_CORE_MAGIC)
176 		return (0);
177 
178 	*longmode = 0;
179 
180 	/*
181 	 * Indeed.
182 	 */
183 	sz = header.xch_index_offset - header.xch_ctxt_offset;
184 #ifdef _LP64
185 	if (sizeof (struct vcpu_guest_context) * header.xch_nr_vcpus == sz)
186 		*longmode = 1;
187 #else
188 	if (sizeof (struct vcpu_guest_context) * header.xch_nr_vcpus != sz)
189 		*longmode = 1;
190 #endif /* _LP64 */
191 
192 	return (1);
193 }
194 
195 static void *
196 xkb_fail(xkb_t *xkb, const char *msg, ...)
197 {
198 	va_list args;
199 
200 	va_start(args, msg);
201 	if (xkb != NULL)
202 		(void) fprintf(stderr, "%s: ", xkb->xkb_path);
203 	(void) vfprintf(stderr, msg, args);
204 	(void) fprintf(stderr, "\n");
205 	va_end(args);
206 	if (xkb != NULL)
207 		(void) xkb_close(xkb);
208 	return (NULL);
209 }
210 
211 static int
212 xkb_build_m2p(xkb_t *xkb)
213 {
214 	size_t i;
215 
216 	for (i = 0; i < xkb->xkb_hdr.xch_nr_pages; i++) {
217 		if (xkb->xkb_p2m[i] != MFN_INVALID &&
218 		    xkb->xkb_p2m[i] > xkb->xkb_max_mfn)
219 			xkb->xkb_max_mfn = xkb->xkb_p2m[i];
220 	}
221 
222 	xkb->xkb_m2p = mdb_alloc((xkb->xkb_max_mfn + 1) * sizeof (xen_pfn_t),
223 	    UM_SLEEP);
224 
225 	for (i = 0; i <= xkb->xkb_max_mfn; i++)
226 		xkb->xkb_m2p[i] = PFN_INVALID;
227 
228 	for (i = 0; i < xkb->xkb_hdr.xch_nr_pages; i++) {
229 		if (xkb->xkb_p2m[i] != MFN_INVALID)
230 			xkb->xkb_m2p[xkb->xkb_p2m[i]] = i;
231 	}
232 
233 	return (1);
234 }
235 
236 /*
237  * Just to make things jolly fun, they've not page-aligned the p2m table.
238  */
239 static int
240 xkb_map_p2m(xkb_t *xkb)
241 {
242 	offset_t off;
243 	size_t size;
244 	size_t count = xkb->xkb_hdr.xch_nr_pages;
245 	size_t boff = xkb->xkb_hdr.xch_index_offset;
246 
247 	size = sizeof (mfn_t) * count + (PAGE_SIZE) * 2;
248 	size = PAGE_MASK(size);
249 	off = PAGE_MASK(boff);
250 
251 	/* LINTED - alignment */
252 	xkb->xkb_p2m_buf = (mfn_t *)mmap(NULL, size, PROT_READ,
253 	    MAP_SHARED, xkb->xkb_fd, off);
254 
255 	if (xkb->xkb_p2m_buf == (xen_pfn_t *)MAP_FAILED) {
256 		(void) xkb_fail(xkb, "cannot map p2m table");
257 		return (0);
258 	}
259 
260 	/* LINTED - alignment */
261 	xkb->xkb_p2m = (mfn_t *)((char *)xkb->xkb_p2m_buf +
262 	    PAGE_OFFSET(boff));
263 
264 	return (1);
265 }
266 
267 /*
268  * Return the MFN of the top-level page table for the given as.
269  */
270 static mfn_t
271 xkb_as_to_mfn(xkb_t *xkb, struct as *as)
272 {
273 	uintptr_t asp = (uintptr_t)as;
274 	uintptr_t hatp;
275 	uintptr_t htablep;
276 	uintptr_t pfn;
277 
278 	if (!xkb_read_word(xkb, asp + offsetof(struct as, a_hat), &hatp))
279 		return (MFN_INVALID);
280 	if (!xkb_read_word(xkb, hatp + xkb->xkb_info.di_hat_htable_off,
281 	    &htablep))
282 		return (MFN_INVALID);
283 	if (!xkb_read_word(xkb, htablep + xkb->xkb_info.di_ht_pfn_off,
284 	    &pfn))
285 		return (MFN_INVALID);
286 
287 	if (pfn >= xkb->xkb_hdr.xch_nr_pages)
288 		return (MFN_INVALID);
289 
290 	return (xkb->xkb_p2m[pfn]);
291 }
292 
293 static ssize_t
294 xkb_read_helper(xkb_t *xkb, struct as *as, int phys, uint64_t addr,
295     void *buf, size_t size)
296 {
297 	size_t left = size;
298 	int windowed = xkb->xkb_pages == NULL;
299 	mfn_t tlmfn = xen_cr3_to_pfn(xkb->xkb_ctxts[0].ctrlreg[3]);
300 
301 	if (as != NULL && (tlmfn = xkb_as_to_mfn(xkb, as)) == MFN_INVALID)
302 		return (-1);
303 
304 	while (left) {
305 		uint64_t pos = addr + (size - left);
306 		char *outpos = (char *)buf + (size - left);
307 		size_t pageoff = PAGE_OFFSET(pos);
308 		size_t sz = MIN(left, PAGE_SIZE - pageoff);
309 		mfn_t mfn;
310 
311 		if (!phys) {
312 			mfn = xkb_va_to_mfn(xkb, pos, tlmfn);
313 			if (mfn == MFN_INVALID)
314 				return (-1);
315 		} else {
316 			xen_pfn_t pfn = pos >> PAGE_SHIFT;
317 			if (pfn >= xkb->xkb_hdr.xch_nr_pages)
318 				return (-1);
319 			mfn = xkb->xkb_p2m[pfn];
320 			if (mfn == MFN_INVALID)
321 				return (-1);
322 		}
323 
324 		/*
325 		 * If we're windowed then pread() is much faster.
326 		 */
327 		if (windowed) {
328 			offset_t off = xkb_mfn_to_offset(xkb, mfn);
329 			int ret;
330 
331 			if (off == ~1ULL)
332 				return (-1);
333 
334 			off += pageoff;
335 
336 			ret = pread64(xkb->xkb_fd, outpos, sz, off);
337 			if (ret == -1)
338 				return (-1);
339 			if (ret != sz)
340 				return ((size - left) + ret);
341 
342 			left -= ret;
343 		} else {
344 			if (xkb_map_mfn(xkb, mfn, &xkb->xkb_map) == NULL)
345 				return (-1);
346 
347 			bcopy(xkb->xkb_map.mm_map + pageoff, outpos, sz);
348 
349 			left -= sz;
350 		}
351 	}
352 
353 	return (size);
354 }
355 
356 static ssize_t
357 xkb_pread(xkb_t *xkb, uint64_t addr, void *buf, size_t size)
358 {
359 	return (xkb_read_helper(xkb, NULL, 1, addr, buf, size));
360 }
361 
362 static ssize_t
363 xkb_aread(xkb_t *xkb, uintptr_t addr, void *buf, size_t size, struct as *as)
364 {
365 	return (xkb_read_helper(xkb, as, 0, addr, buf, size));
366 }
367 
368 static ssize_t
369 xkb_read(xkb_t *xkb, uintptr_t addr, void *buf, size_t size)
370 {
371 	return (xkb_aread(xkb, addr, buf, size, NULL));
372 }
373 
374 static int
375 xkb_read_word(xkb_t *xkb, uintptr_t addr, uintptr_t *buf)
376 {
377 	if (xkb_read(xkb, addr, buf, sizeof (uintptr_t)) !=
378 	    sizeof (uintptr_t))
379 		return (0);
380 	return (1);
381 }
382 
383 static char *
384 xkb_readstr(xkb_t *xkb, uintptr_t addr)
385 {
386 	char *str = mdb_alloc(1024, UM_SLEEP);
387 	size_t i;
388 
389 	for (i = 0; i < 1024; i++) {
390 		if (xkb_read(xkb, addr + i, &str[i], 1) != 1) {
391 			mdb_free(str, 1024);
392 			return (NULL);
393 		}
394 
395 		if (str[i] == '\0')
396 			break;
397 	}
398 
399 	if (i == 1024) {
400 		mdb_free(str, 1024);
401 		return (NULL);
402 	}
403 
404 	return (str);
405 }
406 
407 static offset_t
408 xkb_mfn_to_offset(xkb_t *xkb, mfn_t mfn)
409 {
410 	xen_pfn_t pfn;
411 
412 	if (mfn > xkb->xkb_max_mfn)
413 		return (-1ULL);
414 
415 	pfn = xkb->xkb_m2p[mfn];
416 
417 	if (pfn == PFN_INVALID)
418 		return (-1ULL);
419 
420 	return (xkb->xkb_hdr.xch_pages_offset + (PAGE_SIZE * pfn));
421 }
422 
423 static char *
424 xkb_map_mfn(xkb_t *xkb, mfn_t mfn, mfn_map_t *mm)
425 {
426 	int windowed = xkb->xkb_pages == NULL;
427 	offset_t off;
428 
429 	if (mm->mm_mfn == mfn)
430 		return (mm->mm_map);
431 
432 	mm->mm_mfn = mfn;
433 
434 	if (windowed) {
435 		if (mm->mm_map != (char *)MAP_FAILED) {
436 			(void) munmap(mm->mm_map, PAGE_SIZE);
437 			mm->mm_map = (void *)MAP_FAILED;
438 		}
439 
440 		if ((off = xkb_mfn_to_offset(xkb, mfn)) == (-1ULL))
441 			return (NULL);
442 
443 		mm->mm_map = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED,
444 		    xkb->xkb_fd, off);
445 
446 		if (mm->mm_map == (char *)MAP_FAILED)
447 			return (NULL);
448 	} else {
449 		xen_pfn_t pfn;
450 
451 		mm->mm_map = NULL;
452 
453 		if (mfn > xkb->xkb_max_mfn)
454 			return (NULL);
455 
456 		pfn = xkb->xkb_m2p[mfn];
457 
458 		if (pfn == PFN_INVALID)
459 			return (NULL);
460 
461 		mm->mm_map = xkb->xkb_pages + (PAGE_SIZE * pfn);
462 	}
463 
464 	return (mm->mm_map);
465 }
466 
467 static mfn_t
468 xkb_pte_to_mfn(mmu_info_t *mmu, char *ptep)
469 {
470 	/* LINTED - alignment */
471 	uint64_t pte = *((uint64_t *)ptep);
472 
473 	if (mmu->mi_ptesize == 4) {
474 		/* LINTED - alignment */
475 		pte = *((uint32_t *)ptep);
476 	}
477 
478 	if (!(pte & PT_VALID))
479 		return (MFN_INVALID);
480 
481 	/* XXX: doesn't do large pages */
482 	pte &= PT_PADDR;
483 
484 	return (pte >> PAGE_SHIFT);
485 }
486 
487 /*
488  * Resolve the given VA into an MFN, using the provided mfn as a top-level page
489  * table.
490  */
491 static mfn_t
492 xkb_va_to_mfn(xkb_t *xkb, uintptr_t va, mfn_t mfn)
493 {
494 	mmu_info_t *mmu = &xkb->xkb_mmu;
495 	size_t level;
496 
497 	for (level = mmu->mi_max; ; --level) {
498 		size_t entry;
499 		char *tmp;
500 
501 		if (xkb_map_mfn(xkb, mfn, &xkb->xkb_pt_map[level]) == NULL)
502 			return (MFN_INVALID);
503 
504 		entry = (va >> mmu->mi_shift[level]) & (mmu->mi_ptes - 1);
505 
506 		tmp = (char *)xkb->xkb_pt_map[level].mm_map +
507 		    entry * mmu->mi_ptesize;
508 
509 		if ((mfn = xkb_pte_to_mfn(mmu, tmp)) == MFN_INVALID)
510 			return (MFN_INVALID);
511 
512 		if (level == 0)
513 			break;
514 	}
515 
516 	return (mfn);
517 }
518 
519 static int
520 xkb_read_module(xkb_t *xkb, uintptr_t modulep, struct module *module,
521     uintptr_t *sym_addr, uintptr_t *sym_count, uintptr_t *str_addr)
522 {
523 	if (xkb_read(xkb, modulep, module, sizeof (struct module)) !=
524 	    sizeof (struct module))
525 		return (0);
526 
527 	if (!xkb_read_word(xkb, (uintptr_t)module->symhdr +
528 	    offsetof(Shdr, sh_addr), sym_addr))
529 		return (0);
530 
531 	if (!xkb_read_word(xkb, (uintptr_t)module->strhdr +
532 	    offsetof(Shdr, sh_addr), str_addr))
533 		return (0);
534 
535 	if (!xkb_read_word(xkb, (uintptr_t)module->symhdr +
536 	    offsetof(Shdr, sh_size), sym_count))
537 		return (0);
538 	*sym_count /= sizeof (Sym);
539 
540 	return (1);
541 }
542 
543 static int
544 xkb_read_modsyms(xkb_t *xkb, char **buf, size_t *sizes, int types,
545     uintptr_t sym_addr, uintptr_t str_addr, uintptr_t sym_count)
546 {
547 	size_t i;
548 
549 	for (i = 0; i < sym_count; i++) {
550 		Sym sym;
551 		char *name;
552 		size_t sz;
553 		int type = XKB_WALK_GLOBAL;
554 
555 		if (xkb_read(xkb, sym_addr + i * sizeof (sym), &sym,
556 		    sizeof (sym)) != sizeof (sym))
557 			return (0);
558 
559 		if (GELF_ST_BIND(sym.st_info) == STB_LOCAL)
560 			type = XKB_WALK_LOCAL;
561 
562 		name = xkb_readstr(xkb, str_addr + sym.st_name);
563 
564 		sym.st_shndx = SHN_ABS;
565 		sym.st_name = sizes[XKB_WALK_STR];
566 
567 		sizes[type] += sizeof (sym);
568 		sz = strlen(name) + 1;
569 		sizes[XKB_WALK_STR] += sz;
570 
571 		if (buf != NULL) {
572 			if (types & type) {
573 				bcopy(&sym, *buf, sizeof (sym));
574 				*buf += sizeof (sym);
575 			}
576 			if (types & XKB_WALK_STR) {
577 				bcopy(name, *buf, sz);
578 				*buf += sz;
579 			}
580 		}
581 
582 		mdb_free(name, 1024);
583 	}
584 
585 	return (1);
586 }
587 
588 static int
589 xkb_walk_syms(xkb_t *xkb, uintptr_t modhead, char **buf,
590     size_t *sizes, int types)
591 {
592 	uintptr_t modctl = modhead;
593 	uintptr_t modulep;
594 	struct module module;
595 	uintptr_t sym_count;
596 	uintptr_t sym_addr;
597 	uintptr_t str_addr;
598 	size_t max_iter = 500;
599 
600 	bzero(sizes, sizeof (*sizes) * (XKB_WALK_STR + 1));
601 
602 	/*
603 	 * empty first symbol
604 	 */
605 	sizes[XKB_WALK_LOCAL] += sizeof (Sym);
606 	sizes[XKB_WALK_STR] += 1;
607 
608 	if (buf != NULL) {
609 		if (types & XKB_WALK_LOCAL) {
610 			Sym tmp;
611 			bzero(&tmp, sizeof (tmp));
612 			bcopy(&tmp, *buf, sizeof (tmp));
613 			*buf += sizeof (tmp);
614 		}
615 		if (types & XKB_WALK_STR) {
616 			**buf = '\0';
617 			(*buf)++;
618 		}
619 	}
620 
621 	for (;;) {
622 		if (!xkb_read_word(xkb,
623 		    modctl + offsetof(struct modctl, mod_mp), &modulep))
624 			return (0);
625 
626 		if (modulep == NULL)
627 			goto next;
628 
629 		if (!xkb_read_module(xkb, modulep, &module, &sym_addr,
630 		    &sym_count, &str_addr))
631 			return (0);
632 
633 		if ((module.flags & KOBJ_NOKSYMS))
634 			goto next;
635 
636 		if (!xkb_read_modsyms(xkb, buf, sizes, types, sym_addr,
637 		    str_addr, sym_count))
638 			return (0);
639 
640 next:
641 		if (!xkb_read_word(xkb,
642 		    modctl + offsetof(struct modctl, mod_next), &modctl))
643 			return (0);
644 
645 		if (modctl == modhead)
646 			break;
647 		/*
648 		 * Try and prevent us looping forever if we have a broken list.
649 		 */
650 		if (--max_iter == 0)
651 			break;
652 	}
653 
654 	return (1);
655 }
656 
657 /*
658  * Userspace equivalent of ksyms_snapshot().  Since we don't have a namelist
659  * file for hypervisor images, we fabricate one here using code similar
660  * to that of /dev/ksyms.
661  */
662 static int
663 xkb_build_ksyms(xkb_t *xkb)
664 {
665 	debug_info_t *info = &xkb->xkb_info;
666 	size_t sizes[XKB_WALK_STR + 1];
667 	xkb_namelist_t *hdr;
668 	char *buf;
669 	struct modctl modules;
670 	uintptr_t module;
671 	Shdr *shp;
672 
673 	if (xkb_read(xkb, info->di_modules, &modules,
674 	    sizeof (struct modctl)) != sizeof (struct modctl))
675 		return (0);
676 
677 	module = (uintptr_t)modules.mod_mp;
678 
679 	if (!xkb_walk_syms(xkb, info->di_modules, NULL, sizes,
680 	    XKB_WALK_LOCAL | XKB_WALK_GLOBAL | XKB_WALK_STR))
681 		return (0);
682 
683 	xkb->xkb_namesize = sizeof (xkb_namelist_t);
684 	xkb->xkb_namesize += sizes[XKB_WALK_LOCAL];
685 	xkb->xkb_namesize += sizes[XKB_WALK_GLOBAL];
686 	xkb->xkb_namesize += sizes[XKB_WALK_STR];
687 
688 	if ((xkb->xkb_namelist = mdb_zalloc(xkb->xkb_namesize, UM_SLEEP))
689 	    == NULL)
690 		return (0);
691 
692 	/* LINTED - alignment */
693 	hdr = (xkb_namelist_t *)xkb->xkb_namelist;
694 
695 	if (xkb_read(xkb, module + offsetof(struct module, hdr),
696 	    &hdr->kh_elf_hdr, sizeof (Ehdr)) != sizeof (Ehdr))
697 		return (0);
698 
699 	hdr->kh_elf_hdr.e_phoff = offsetof(xkb_namelist_t, kh_text_phdr);
700 	hdr->kh_elf_hdr.e_shoff = offsetof(xkb_namelist_t, kh_shdr);
701 	hdr->kh_elf_hdr.e_phnum = 2;
702 	hdr->kh_elf_hdr.e_shnum = XKB_SHDR_NUM;
703 	hdr->kh_elf_hdr.e_shstrndx = XKB_SHDR_SHSTRTAB;
704 
705 	hdr->kh_text_phdr.p_type = PT_LOAD;
706 	hdr->kh_text_phdr.p_vaddr = (Addr)info->di_s_text;
707 	hdr->kh_text_phdr.p_memsz = (Word)(info->di_e_text - info->di_s_text);
708 	hdr->kh_text_phdr.p_flags = PF_R | PF_X;
709 
710 	hdr->kh_data_phdr.p_type = PT_LOAD;
711 	hdr->kh_data_phdr.p_vaddr = (Addr)info->di_s_data;
712 	hdr->kh_data_phdr.p_memsz = (Word)(info->di_e_data - info->di_s_data);
713 	hdr->kh_data_phdr.p_flags = PF_R | PF_W | PF_X;
714 
715 	shp = &hdr->kh_shdr[XKB_SHDR_SYMTAB];
716 	shp->sh_name = 1;	/* xkb_shstrtab[1] = ".symtab" */
717 	shp->sh_type = SHT_SYMTAB;
718 	shp->sh_offset = sizeof (xkb_namelist_t);
719 	shp->sh_size = sizes[XKB_WALK_LOCAL] + sizes[XKB_WALK_GLOBAL];
720 	shp->sh_link = XKB_SHDR_STRTAB;
721 	shp->sh_info = sizes[XKB_WALK_LOCAL] / sizeof (Sym);
722 	shp->sh_addralign = sizeof (Addr);
723 	shp->sh_entsize = sizeof (Sym);
724 	shp->sh_addr = (Addr)(xkb->xkb_namelist + shp->sh_offset);
725 
726 
727 	shp = &hdr->kh_shdr[XKB_SHDR_STRTAB];
728 	shp->sh_name = 9;	/* xkb_shstrtab[9] = ".strtab" */
729 	shp->sh_type = SHT_STRTAB;
730 	shp->sh_offset = sizeof (xkb_namelist_t) +
731 	    sizes[XKB_WALK_LOCAL] + sizes[XKB_WALK_GLOBAL];
732 	shp->sh_size = sizes[XKB_WALK_STR];
733 	shp->sh_addralign = 1;
734 	shp->sh_addr = (Addr)(xkb->xkb_namelist + shp->sh_offset);
735 
736 
737 	shp = &hdr->kh_shdr[XKB_SHDR_SHSTRTAB];
738 	shp->sh_name = 17;	/* xkb_shstrtab[17] = ".shstrtab" */
739 	shp->sh_type = SHT_STRTAB;
740 	shp->sh_offset = offsetof(xkb_namelist_t, shstrings);
741 	shp->sh_size = sizeof (xkb_shstrtab);
742 	shp->sh_addralign = 1;
743 	shp->sh_addr = (Addr)(xkb->xkb_namelist + shp->sh_offset);
744 
745 	bcopy(xkb_shstrtab, hdr->shstrings, sizeof (xkb_shstrtab));
746 
747 	buf = xkb->xkb_namelist + sizeof (xkb_namelist_t);
748 
749 	if (!xkb_walk_syms(xkb, info->di_modules, &buf, sizes,
750 	    XKB_WALK_LOCAL))
751 		return (0);
752 	if (!xkb_walk_syms(xkb, info->di_modules, &buf, sizes,
753 	    XKB_WALK_GLOBAL))
754 		return (0);
755 	if (!xkb_walk_syms(xkb, info->di_modules, &buf, sizes,
756 	    XKB_WALK_STR))
757 		return (0);
758 
759 	return (1);
760 }
761 
762 /*ARGSUSED*/
763 xkb_t *
764 xkb_open(const char *namelist, const char *corefile, const char *swapfile,
765     int flag, const char *err)
766 {
767 	struct stat64 corestat;
768 	uintptr_t debug_va = DEF_DEBUG_INFO_VA;
769 	size_t sz;
770 	size_t i;
771 	xkb_t *xkb = NULL;
772 
773 	if (stat64(corefile, &corestat) == -1)
774 		return (xkb_fail(xkb, "cannot stat %s", corefile));
775 
776 	if (flag != O_RDONLY)
777 		return (xkb_fail(xkb, "invalid open flags"));
778 
779 	xkb = mdb_zalloc(sizeof (*xkb), UM_SLEEP);
780 
781 	for (i = 0; i < 4; i++)
782 		xkb->xkb_pt_map[i].mm_map = (char *)MAP_FAILED;
783 
784 	xkb->xkb_map.mm_map = (char *)MAP_FAILED;
785 	xkb->xkb_p2m_buf = (char *)MAP_FAILED;
786 
787 	xkb->xkb_path = strdup(corefile);
788 
789 	if ((xkb->xkb_fd = open64(corefile, O_RDONLY)) == -1)
790 		return (xkb_fail(xkb, "cannot open %s", corefile));
791 
792 	if (pread64(xkb->xkb_fd, &xkb->xkb_hdr, sizeof (xkb->xkb_hdr), 0) !=
793 	    sizeof (xkb->xkb_hdr))
794 		return (xkb_fail(xkb, "invalid dump file"));
795 
796 	if (xkb->xkb_hdr.xch_magic == XC_CORE_MAGIC_HVM)
797 		return (xkb_fail(xkb, "cannot process HVM images"));
798 
799 	if (xkb->xkb_hdr.xch_magic != XC_CORE_MAGIC) {
800 		return (xkb_fail(xkb, "invalid magic %d",
801 		    xkb->xkb_hdr.xch_magic));
802 	}
803 
804 	sz = xkb->xkb_hdr.xch_nr_vcpus * sizeof (*xkb->xkb_ctxts);
805 
806 	xkb->xkb_ctxts = mdb_alloc(sz, UM_SLEEP);
807 
808 	if (pread64(xkb->xkb_fd, xkb->xkb_ctxts, sz,
809 	    xkb->xkb_hdr.xch_ctxt_offset) != sz)
810 		return (xkb_fail(xkb, "cannot read VCPU contexts"));
811 
812 	if (xkb->xkb_ctxts[0].flags & VGCF_HVM_GUEST)
813 		return (xkb_fail(xkb, "cannot process HVM images"));
814 
815 	/*
816 	 * Try to map all the data pages. If we can't, fall back to the
817 	 * window/pread() approach, which is significantly slower.
818 	 */
819 	xkb->xkb_pages = mmap(NULL, PAGE_SIZE * xkb->xkb_hdr.xch_nr_pages,
820 	    PROT_READ, MAP_SHARED, xkb->xkb_fd,
821 	    xkb->xkb_hdr.xch_pages_offset);
822 
823 	if (xkb->xkb_pages == (char *)MAP_FAILED)
824 		xkb->xkb_pages = NULL;
825 
826 #if defined(__amd64)
827 	xkb->xkb_mmu.mi_max = 3;
828 	xkb->xkb_mmu.mi_shift[0] = 12;
829 	xkb->xkb_mmu.mi_shift[1] = 21;
830 	xkb->xkb_mmu.mi_shift[2] = 30;
831 	xkb->xkb_mmu.mi_shift[3] = 39;
832 	xkb->xkb_mmu.mi_ptes = 512;
833 	xkb->xkb_mmu.mi_ptesize = 8;
834 #elif defined(__i386)
835 	/*
836 	 * We'd like to adapt for correctness' sake, but we have no way of
837 	 * detecting a PAE guest, since cr4 writes are disallowed.
838 	 */
839 	debug_va = PAE_DEBUG_INFO_VA;
840 	xkb->xkb_mmu.mi_max = 2;
841 	xkb->xkb_mmu.mi_shift[0] = 12;
842 	xkb->xkb_mmu.mi_shift[1] = 21;
843 	xkb->xkb_mmu.mi_shift[2] = 30;
844 	xkb->xkb_mmu.mi_ptes = 512;
845 	xkb->xkb_mmu.mi_ptesize = 8;
846 #endif
847 
848 	if (!xkb_map_p2m(xkb))
849 		return (NULL);
850 
851 	if (!xkb_build_m2p(xkb))
852 		return (NULL);
853 
854 	if (xkb_read(xkb, debug_va, &xkb->xkb_info,
855 	    sizeof (xkb->xkb_info)) != sizeof (xkb->xkb_info))
856 		return (xkb_fail(xkb, "cannot read debug_info"));
857 
858 	if (xkb->xkb_info.di_magic != DEBUG_INFO_MAGIC) {
859 		return (xkb_fail(xkb, "invalid debug info magic %d",
860 		    xkb->xkb_info.di_magic));
861 	}
862 
863 	if (xkb->xkb_info.di_version != DEBUG_INFO_VERSION) {
864 		return (xkb_fail(xkb, "unknown debug info version %d",
865 		    xkb->xkb_info.di_version));
866 	}
867 
868 	if (!xkb_build_ksyms(xkb))
869 		return (xkb_fail(xkb, "cannot construct namelist"));
870 
871 	return (xkb);
872 }
873 
874 int
875 xkb_close(xkb_t *xkb)
876 {
877 	size_t sz;
878 	size_t i;
879 
880 	if (xkb == NULL)
881 		return (0);
882 
883 	if (xkb->xkb_m2p != NULL) {
884 		mdb_free(xkb->xkb_m2p,
885 		    (xkb->xkb_max_mfn + 1) * sizeof (xen_pfn_t));
886 	}
887 
888 	sz = sizeof (xen_pfn_t) * xkb->xkb_hdr.xch_nr_pages;
889 
890 	if (xkb->xkb_p2m_buf != (xen_pfn_t *)MAP_FAILED)
891 		(void) munmap(xkb->xkb_p2m_buf, sz);
892 
893 	if (xkb->xkb_pages != NULL) {
894 		(void) munmap((void *)xkb->xkb_pages,
895 		    PAGE_SIZE * xkb->xkb_hdr.xch_nr_pages);
896 	} else {
897 		for (i = 0; i < 4; i++) {
898 			char *addr = xkb->xkb_pt_map[i].mm_map;
899 			if (addr != (char *)MAP_FAILED)
900 				(void) munmap((void *)addr, PAGE_SIZE);
901 		}
902 		if (xkb->xkb_map.mm_map != (char *)MAP_FAILED) {
903 			(void) munmap((void *)xkb->xkb_map.mm_map,
904 			    PAGE_SIZE);
905 		}
906 	}
907 
908 	if (xkb->xkb_ctxts != NULL) {
909 		mdb_free(xkb->xkb_ctxts, sizeof (struct vcpu_guest_context) *
910 		    xkb->xkb_hdr.xch_nr_vcpus);
911 	}
912 
913 	if (xkb->xkb_namelist != NULL)
914 		mdb_free(xkb->xkb_namelist, xkb->xkb_namesize);
915 
916 	if (xkb->xkb_fd != -1)
917 		(void) close(xkb->xkb_fd);
918 
919 	free(xkb->xkb_path);
920 
921 	mdb_free(xkb, sizeof (*xkb));
922 	return (0);
923 }
924 
925 /*ARGSUSED*/
926 static mdb_io_t *
927 xkb_sym_io(xkb_t *xkb, const char *symfile)
928 {
929 	mdb_io_t *io = mdb_memio_create(xkb->xkb_namelist, xkb->xkb_namesize);
930 
931 	if (io == NULL)
932 		mdb_warn("failed to create namelist from %s", xkb->xkb_path);
933 
934 	return (io);
935 }
936 
937 uint64_t
938 xkb_vtop(xkb_t *xkb, struct as *as, uintptr_t addr)
939 {
940 	mfn_t tlmfn = xen_cr3_to_pfn(xkb->xkb_ctxts[0].ctrlreg[3]);
941 	mfn_t mfn;
942 
943 	if (as != NULL && (tlmfn = xkb_as_to_mfn(xkb, as)) == MFN_INVALID)
944 		return (-1ULL);
945 
946 	mfn = xkb_va_to_mfn(xkb, addr, tlmfn);
947 
948 	if (mfn == MFN_INVALID || mfn > xkb->xkb_max_mfn)
949 		return (-1ULL);
950 
951 	return (((uint64_t)xkb->xkb_m2p[mfn] << PAGE_SHIFT)
952 	    | PAGE_OFFSET(addr));
953 }
954 
955 static int
956 xkb_getmregs(xkb_t *xkb, uint_t cpu, struct privmregs *mregs)
957 {
958 	struct vcpu_guest_context *vcpu;
959 	struct cpu_user_regs *ur;
960 	struct regs *regs;
961 
962 	if (cpu >= xkb->xkb_hdr.xch_nr_vcpus) {
963 		errno = EINVAL;
964 		return (-1);
965 	}
966 
967 	bzero(mregs, sizeof (*mregs));
968 
969 	vcpu = &xkb->xkb_ctxts[cpu];
970 	ur = &vcpu->user_regs;
971 	regs = &mregs->pm_gregs;
972 
973 	regs->r_ss = ur->ss;
974 	regs->r_cs = ur->cs;
975 	regs->r_ds = ur->ds;
976 	regs->r_es = ur->es;
977 	regs->r_fs = ur->fs;
978 	regs->r_gs = ur->gs;
979 	regs->r_trapno = ur->entry_vector;
980 	regs->r_err = ur->error_code;
981 #ifdef __amd64
982 	regs->r_savfp = ur->rbp;
983 	regs->r_savpc = ur->rip;
984 	regs->r_rdi = ur->rdi;
985 	regs->r_rsi = ur->rsi;
986 	regs->r_rdx = ur->rdx;
987 	regs->r_rcx = ur->rcx;
988 	regs->r_r8 = ur->r8;
989 	regs->r_r9 = ur->r9;
990 	regs->r_rax = ur->rax;
991 	regs->r_rbx = ur->rbx;
992 	regs->r_rbp = ur->rbp;
993 	regs->r_r10 = ur->r10;
994 	regs->r_r11 = ur->r11;
995 	regs->r_r12 = ur->r12;
996 	regs->r_r13 = ur->r13;
997 	regs->r_r14 = ur->r14;
998 	regs->r_r15 = ur->r15;
999 	regs->r_rip = ur->rip;
1000 	regs->r_rfl = ur->rflags;
1001 	regs->r_rsp = ur->rsp;
1002 #else
1003 	regs->r_savfp = ur->ebp;
1004 	regs->r_savpc = ur->eip;
1005 	regs->r_edi = ur->edi;
1006 	regs->r_esi = ur->esi;
1007 	regs->r_ebp = ur->ebp;
1008 	regs->r_esp = ur->esp;
1009 	regs->r_ebx = ur->ebx;
1010 	regs->r_edx = ur->edx;
1011 	regs->r_ecx = ur->ecx;
1012 	regs->r_eax = ur->eax;
1013 	regs->r_eip = ur->eip;
1014 	regs->r_efl = ur->eflags;
1015 	regs->r_uesp = 0;
1016 #endif
1017 
1018 	bcopy(&vcpu->ctrlreg, &mregs->pm_cr, 8 * sizeof (ulong_t));
1019 	bcopy(&vcpu->debugreg, &mregs->pm_dr, 8 * sizeof (ulong_t));
1020 
1021 	mregs->pm_flags = PM_GREGS | PM_CRREGS | PM_DRREGS;
1022 
1023 	return (0);
1024 }
1025 
/*
 * Operations vector handed out by mdb_kb_ops().  Reads are served by the
 * xkb_* routines above; all three write entry points are wired to
 * mdb_tgt_notsup.
 */
static mdb_kb_ops_t xpv_kb_ops = {
	.kb_open = (void *(*)())xkb_open,
	.kb_close = (int (*)())xkb_close,
	.kb_sym_io = (mdb_io_t *(*)())xkb_sym_io,
	.kb_kread = (ssize_t (*)())xkb_read,
	.kb_kwrite = (ssize_t (*)())mdb_tgt_notsup,
	.kb_aread = (ssize_t (*)())xkb_aread,
	.kb_awrite = (ssize_t (*)())mdb_tgt_notsup,
	.kb_pread = (ssize_t (*)())xkb_pread,
	.kb_pwrite = (ssize_t (*)())mdb_tgt_notsup,
	.kb_vtop = (uint64_t (*)())xkb_vtop,
	.kb_getmregs = (int (*)())xkb_getmregs
};
1039 
1040 mdb_kb_ops_t *
1041 mdb_kb_ops(void)
1042 {
1043 	return (&xpv_kb_ops);
1044 }
1045 
/*
 * Module linkage.  The dcmd and walker tables contain only their
 * terminating entry.
 */
static const mdb_dcmd_t dcmds[] = { NULL, };
static const mdb_walker_t walkers[] = { NULL, };
static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, walkers };
1049 
1050 const mdb_modinfo_t *
1051 _mdb_init(void)
1052 {
1053 	return (&modinfo);
1054 }
1055 
/*
 * Module teardown entry point.  This module has nothing to release.
 */
void
_mdb_fini(void)
{
	/* Nothing to do. */
}
1060