xref: /illumos-gate/usr/src/uts/i86xpv/os/xen_machdep.c (revision dd4eeefd)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/* derived from netbsd's xen_machdep.c 1.1.2.1 */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. This section intentionally left blank.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Section 3 of the above license was updated in response to bug 6379571.
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/trap.h>
#include <sys/segments.h>
#include <sys/sunddi.h>		/* for ddi_strtoul */
#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/machsystm.h>
#include <sys/promif.h>
#include <sys/bootconf.h>
#include <sys/bootinfo.h>
#include <sys/cpr.h>
#include <sys/taskq.h>
#include <sys/uadmin.h>
#include <sys/evtchn_impl.h>
#include <sys/archsystm.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/mach_mmu.h>
#include <vm/hat_i86.h>
#include <sys/gnttab.h>
#include <sys/reboot.h>
#include <sys/stack.h>
#include <sys/clock.h>
#include <sys/bitmap.h>
#include <sys/processor.h>
#include <sys/xen_errno.h>
#include <sys/xpv_panic.h>
#include <sys/smp_impldefs.h>
#include <sys/cpu.h>
#include <sys/balloon_impl.h>
#include <sys/ddi.h>

/*
 * Hypervisor-specific utility routines - these can be invoked from the
 * normal control flow.  It might be useful to partition these into
 * different files, but let's see how it looks before we get too
 * carried away with that idea.
 */

/*
 * In the current absence of any useful way to debug domains that are hung
 * whilst suspending, we have a more clumsy approach...
 */
#ifdef DEBUG
#define	SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
#else
#define	SUSPEND_DEBUG(...)
#endif
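
/*
 * Illustrative expansion (sketch, not original commentary): on a DEBUG
 * kernel a call such as
 *
 *	SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
 *
 * becomes "if (xen_suspend_debug) xen_printf(...)", so output is gated
 * on the xen_suspend_debug tunable below; on a non-DEBUG kernel the
 * statement expands to nothing.
 */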

int cpr_debug;
cpuset_t cpu_suspend_set;
cpuset_t cpu_suspend_lost_set;
volatile int xen_suspending_cpus;
static int xen_suspend_debug;

void
xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
{
	struct callback_register cb;

	bzero(&cb, sizeof (cb));
#if defined(__amd64)
	cb.address = (ulong_t)func;
#elif defined(__i386)
	cb.address.cs = KCS_SEL;
	cb.address.eip = (ulong_t)func;
#endif
	cb.type = type;
	cb.flags = flags;

	/*
	 * XXPV always ignore return value for NMI
	 */
	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
	    type != CALLBACKTYPE_nmi)
		panic("HYPERVISOR_callback_op failed");
}

void
xen_init_callbacks(void)
{
	/*
	 * register event (interrupt) handler.
	 */
	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);

	/*
	 * failsafe handler.
	 */
	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
	    CALLBACKF_mask_events);

	/*
	 * NMI handler.
	 */
	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);

	/*
	 * system call handler
	 * XXPV move to init_cpu_syscall?
	 */
#if defined(__amd64)
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);
#endif	/* __amd64 */
}


/*
 * cmn_err() followed by a 1/4 second delay; this gives the
 * logging service a chance to flush messages and helps avoid
 * intermixing output from prom_printf().
 * XXPV: doesn't exactly help us on UP though.
 */
/*PRINTFLIKE2*/
void
cpr_err(int ce, const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vcmn_err(ce, fmt, adx);
	va_end(adx);
	drv_usecwait(MICROSEC >> 2);
}

void
xen_suspend_devices(void)
{
	int rc;

	SUSPEND_DEBUG("xen_suspend_devices\n");

	if ((rc = cpr_suspend_devices(ddi_root_node())) != 0)
		panic("failed to suspend devices: %d", rc);
}

void
xen_resume_devices(void)
{
	int rc;

	SUSPEND_DEBUG("xen_resume_devices\n");

	if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0)
		panic("failed to resume devices: %d", rc);
}

/*
 * The list of mfn pages is out of date.  Recompute it.
 * XXPV: can we race against another suspend call? Think not.
 */
static void
rebuild_mfn_list(void)
{
	int i = 0;
	size_t sz;
	size_t off;
	pfn_t pfn;

	SUSPEND_DEBUG("rebuild_mfn_list\n");

	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;

	for (off = 0; off < sz; off += MMU_PAGESIZE) {
		size_t j = mmu_btop(off);
		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
			pfn = hat_getpfnum(kas.a_hat,
			    (caddr_t)&mfn_list_pages[j]);
			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
		}

		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
		mfn_list_pages[j] = pfn_to_mfn(pfn);
	}

	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
	    = pfn_to_mfn(pfn);
}
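
/*
 * Sketch of the indirection rebuilt above (for reference): the hypervisor
 * locates the PFN-to-MFN table through a chain of machine frame numbers,
 *
 *	arch.pfn_to_mfn_frame_list_list
 *	    -> mfn_list_pages_page	(MFNs of the mfn_list_pages pages)
 *	    -> mfn_list_pages		(MFNs of the mfn_list pages)
 *	    -> mfn_list			(the PFN-to-MFN table itself)
 *
 * Every level holds MFNs, all of which change across a save/restore or
 * migration, which is why each page is re-translated via hat_getpfnum()
 * and pfn_to_mfn() after resume.
 */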

static void
suspend_cpus(void)
{
	int i;

	SUSPEND_DEBUG("suspend_cpus\n");

	xen_suspending_cpus = 1;

	pause_cpus(NULL);

	SUSPEND_DEBUG("waiting for offline CPUs\n");

	/*
	 * For us to proceed safely, all CPUs except the current one must be
	 * present in cpu_suspend_set.  Running CPUs will participate in
	 * pause_cpus(), and eventually reach mach_cpu_pause().  Powered-off
	 * VCPUs will already be in the set, again in mach_cpu_pause().
	 * Finally, offline CPUs will be sitting in mach_cpu_idle().
	 */
	while (!CPUSET_ISEQUAL(mp_cpus, cpu_suspend_set))
		SMT_PAUSE();

	for (i = 1; i < ncpus; i++) {
		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
			(void) xen_vcpu_down(i);
		}

		mach_cpucontext_reset(cpu[i]);
	}
}

static void
resume_cpus(void)
{
	int i;

	xen_suspending_cpus = 0;

	for (i = 1; i < ncpus; i++) {
		if (cpu[i] == NULL)
			continue;

		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
			mach_cpucontext_restore(cpu[i]);
			(void) xen_vcpu_up(i);
		}
	}

	start_cpus();
}

/*
 * Top level routine to direct suspend/resume of a domain.
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern hrtime_t hres_last_tick;
	mfn_t start_info_mfn;
	ulong_t flags;
	pfn_t pfn;
	int i;

	/*
	 * XXPV - Are we definitely OK to suspend by the time we've connected
	 * the handler?
	 */

	cpr_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * suspend interrupts and devices
	 * XXPV - we use suspend/resume for both save/restore domains (like sun
	 * cpr) and for migration.  Would be nice to know the difference if
	 * possible.  For save/restore where down time may be a long time, we
	 * may want to do more of the things that cpr does.  (i.e. notify user
	 * processes, shrink memory footprint for faster restore, etc.)
	 */
	xen_suspend_devices();
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
	start_info_mfn = pfn_to_mfn(pfn);

	/*
	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
	 * wrt xenbus being suspended here?
	 */
	mutex_enter(&cpu_lock);

	/*
	 * Suspend must be done on vcpu 0, as no context for other CPUs is
	 * saved.
	 *
	 * XXPV - add to taskq API ?
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	SUSPEND_DEBUG("xen_start_migrate\n");
	xen_start_migrate();
	if (ncpus > 1)
		suspend_cpus();

	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
	 * any holder would have dropped it to get through suspend_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */
	SUSPEND_DEBUG("ec_suspend\n");
	ec_suspend();
	SUSPEND_DEBUG("gnttab_suspend\n");
	gnttab_suspend();

	flags = intr_clear();

	xpv_time_suspend();

	/*
	 * Currently, the hypervisor incorrectly fails to bring back
	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
	 * to prevent any attempts to operate on them.  But we have to do this
	 * *after* the very first time we do ec_suspend().
	 */
	for (i = 1; i < ncpus; i++) {
		if (cpu[i] == NULL)
			continue;

		if (cpu_get_state(cpu[i]) == P_POWEROFF)
			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
	}

	/*
	 * The dom0 save/migrate code doesn't automatically translate
	 * these into PFNs, but expects them to be, so we do it here.
	 * We don't use mfn_to_pfn() because so many OS services have
	 * been disabled at this point.
	 */
	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
	xen_info->console.domU.mfn =
	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];

	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
		prom_printf("xen_suspend_domain(): "
		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    0, UVMF_INVLPG)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_update_va_mapping() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");

	/*
	 * At this point we suspend and sometime later resume.
	 */
	if (HYPERVISOR_suspend(start_info_mfn)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_suspend() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/*
	 * Point HYPERVISOR_shared_info to its new value.
	 */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
	    UVMF_INVLPG))
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);

	if (xen_info->nr_pages != mfn_count) {
		prom_printf("xen_suspend_domain(): number of pages"
		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
		    xen_info->nr_pages);
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	xpv_time_resume();

	cached_max_mfn = 0;

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	/* XXPV: add a note that this must be lockless. */
	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		resume_cpus();

	mutex_exit(&ec_lock);
	xen_end_migrate();
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	/*
	 * Force the tick value used for tv_nsec in hres_tick() to be up to
	 * date. rtcsync() will reset the hrestime value appropriately.
	 */
	hres_last_tick = xpv_gethrtime();

	/*
	 * XXPV: we need to have resumed the CPUs since this takes locks, but
	 * can remote CPUs see bad state? Presumably yes. Should probably nest
	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
	 * and re-calibrate if we migrated to a different speed cpu.  Also need
	 * to make a (re)init_cpu_info call to update processor info structs
	 * and device tree info.  That remains to be written at the moment.
	 */
	rtcsync();

	rebuild_mfn_list();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xenbus_resume_devices\n");
	xen_resume_devices();

	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");
	cmn_err(CE_NOTE, "domain restore/migrate completed");
}
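
/*
 * Teardown order above, summarized for reference: devices, xenbus,
 * cpu_lock, the other CPUs, ec_lock, event channels, grant tables, then
 * interrupts and time, with HYPERVISOR_suspend() at the centre; resume
 * unwinds broadly in the opposite order, finishing with xenbus and the
 * devices.
 */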

/*ARGSUSED*/
int
xen_debug_handler(void *arg)
{
	debug_enter("External debug event received");

	/*
	 * If we've not got KMDB loaded, output some stuff difficult to capture
	 * from a domain core.
	 */
	if (!(boothowto & RB_DEBUG)) {
		shared_info_t *si = HYPERVISOR_shared_info;
		int i;

		prom_printf("evtchn_pending [ ");
		for (i = 0; i < 8; i++)
			prom_printf("%lx ", si->evtchn_pending[i]);
		prom_printf("]\nevtchn_mask [ ");
		for (i = 0; i < 8; i++)
			prom_printf("%lx ", si->evtchn_mask[i]);
		prom_printf("]\n");

		for (i = 0; i < ncpus; i++) {
			vcpu_info_t *vcpu = &si->vcpu_info[i];
			if (cpu[i] == NULL)
				continue;
			prom_printf("CPU%d pending %d mask %d sel %lx\n",
			    i, vcpu->evtchn_upcall_pending,
			    vcpu->evtchn_upcall_mask,
			    vcpu->evtchn_pending_sel);
		}
	}

	return (0);
}

/*ARGSUSED*/
static void
xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
    unsigned int len)
{
	xenbus_transaction_t xbt;
	char key = '\0';
	int ret;

retry:
	if (xenbus_transaction_start(&xbt)) {
		cmn_err(CE_WARN, "failed to start sysrq transaction");
		return;
	}

	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
		/*
		 * ENOENT happens in response to our own xenbus_rm.
		 * XXPV - this happens spuriously on boot?
		 */
		if (ret != ENOENT)
			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
		goto out;
	}

	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
		goto out;
	}

	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
		goto retry;

	/*
	 * Somewhat arbitrary - on Linux this means 'reboot'. We could just
	 * accept any key, but this might increase the risk of sending a
	 * harmless sysrq to the wrong domain...
	 */
	if (key == 'b')
		(void) xen_debug_handler(NULL);
	else
		cmn_err(CE_WARN, "Ignored sysrq %c", key);
	return;

out:
	(void) xenbus_transaction_end(xbt, 1);
}

taskq_t *xen_shutdown_tq;
volatile int shutdown_req_active;

#define	SHUTDOWN_INVALID	-1
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

#define	SHUTDOWN_TIMEOUT_SECS (60 * 5)

static const char *cmd_strings[SHUTDOWN_MAX] = {
	"poweroff",
	"reboot",
	"suspend",
	"halt"
};
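
/*
 * The strings are indexed by the SHUTDOWN_* codes above, so, e.g.,
 * cmd_strings[SHUTDOWN_REBOOT] is "reboot"; the two lists must be kept
 * in sync.
 */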

static void
xen_dirty_shutdown(void *arg)
{
	int cmd = (uintptr_t)arg;

	cmn_err(CE_WARN, "Externally requested shutdown failed or "
	    "timed out.\nShutting down.\n");

	switch (cmd) {
	case SHUTDOWN_HALT:
	case SHUTDOWN_POWEROFF:
		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
		break;
	case SHUTDOWN_REBOOT:
		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
		break;
	}
}

static void
xen_shutdown(void *arg)
{
	nvlist_t *attr_list = NULL;
	sysevent_t *event = NULL;
	sysevent_id_t eid;
	int cmd = (uintptr_t)arg;
	int err;

	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);

	if (cmd == SHUTDOWN_SUSPEND) {
		xen_suspend_domain();
		shutdown_req_active = 0;
		return;
	}

	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP);
	if (err != DDI_SUCCESS)
		goto failure;

	err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]);
	if (err != DDI_SUCCESS)
		goto failure;

	if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv",
	    SE_SLEEP)) == NULL)
		goto failure;
	(void) sysevent_attach_attributes(event,
	    (sysevent_attr_list_t *)attr_list);

	err = log_sysevent(event, SE_SLEEP, &eid);

	sysevent_detach_attributes(event);
	sysevent_free(event);

	if (err != 0)
		goto failure;

	(void) timeout(xen_dirty_shutdown, arg,
	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));

	nvlist_free(attr_list);
	return;

failure:
	if (attr_list != NULL)
		nvlist_free(attr_list);
	xen_dirty_shutdown(arg);
}

/*ARGSUSED*/
static void
xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
	unsigned int len)
{
	char *str;
	xenbus_transaction_t xbt;
	int err, shutdown_code = SHUTDOWN_INVALID;
	unsigned int slen;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return;
	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
		(void) xenbus_transaction_end(xbt, 1);
		return;
	}

	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);

	/*
	 * If this is a watch fired from our write below, check out early to
	 * avoid an infinite loop.
	 */
	if (strcmp(str, "") == 0) {
		(void) xenbus_transaction_end(xbt, 0);
		kmem_free(str, slen);
		return;
	} else if (strcmp(str, "poweroff") == 0) {
		shutdown_code = SHUTDOWN_POWEROFF;
	} else if (strcmp(str, "reboot") == 0) {
		shutdown_code = SHUTDOWN_REBOOT;
	} else if (strcmp(str, "suspend") == 0) {
		shutdown_code = SHUTDOWN_SUSPEND;
	} else if (strcmp(str, "halt") == 0) {
		shutdown_code = SHUTDOWN_HALT;
	} else {
		printf("Ignoring shutdown request: %s\n", str);
	}

	/*
	 * XXPV	Should we check the value of xenbus_write() too, or are all
	 *	errors automatically folded into xenbus_transaction_end() ??
	 */
	(void) xenbus_write(xbt, "control", "shutdown", "");
	err = xenbus_transaction_end(xbt, 0);
	if (err == EAGAIN) {
		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
		kmem_free(str, slen);
		goto again;
	}

	kmem_free(str, slen);
	if (shutdown_code != SHUTDOWN_INVALID) {
		if (shutdown_code == SHUTDOWN_SUSPEND) {
			while (shutdown_req_active)
				SMT_PAUSE();
		}

		shutdown_req_active = 1;
		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
		    (void *)(intptr_t)shutdown_code, 0);
	}
}

static struct xenbus_watch shutdown_watch;
static struct xenbus_watch sysrq_watch;

void
xen_late_startup(void)
{
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
		shutdown_watch.node = "control/shutdown";
		shutdown_watch.callback = xen_shutdown_handler;
		if (register_xenbus_watch(&shutdown_watch))
			cmn_err(CE_WARN, "Failed to set shutdown watcher");

		sysrq_watch.node = "control/sysrq";
		sysrq_watch.callback = xen_sysrq_handler;
		if (register_xenbus_watch(&sysrq_watch))
			cmn_err(CE_WARN, "Failed to set sysrq watcher");
	}
	balloon_init(xen_info->nr_pages);
}

#ifdef DEBUG
#define	XEN_PRINTF_BUFSIZE	1024

char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * Printf function that calls the hypervisor directly.  For DomU it only
 * works when running on a xen hypervisor built with debug on; for Dom0
 * it always works.  No I/O ring interaction is needed in either case.
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	va_list	ap;

	va_start(ap, fmt);
	(void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap);
	va_end(ap);

	(void) HYPERVISOR_console_io(CONSOLEIO_write,
	    strlen(xen_printf_buffer), xen_printf_buffer);
}
#else
void
xen_printf(const char *fmt, ...)
{
}
#endif	/* DEBUG */
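
/*
 * Illustrative use: because no console I/O ring is involved,
 * SUSPEND_DEBUG can safely route here even while event channels and
 * devices are suspended, e.g.
 *
 *	xen_printf("suspend: pfn %lx -> mfn %lx\n", pfn, pfn_to_mfn(pfn));
 */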

/*
 * Determine helpful version information.
 *
 * (And leave a copy around in the data segment so we can look
 * at them later with e.g. kmdb.)
 */
struct xenver {
	char *xv_ver;
	char *xv_chgset;
	char *xv_compiler;
	char *xv_compile_date;
	char *xv_compile_by;
	char *xv_compile_domain;
	char *xv_caps;
} xenver;

static char *
sprintf_alloc(const char *fmt, ...)
{
	va_list ap;
	size_t len;
	char *p;

	/*
	 * Size the buffer with a first vsnprintf() pass, then format into
	 * it with a second.  The va_list must be re-initialized between
	 * the two passes; reusing it after the first vsnprintf() has
	 * consumed it is undefined behaviour.
	 */
	va_start(ap, fmt);
	len = 1 + vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);

	p = kmem_alloc(len, KM_SLEEP);

	va_start(ap, fmt);
	(void) vsnprintf(p, len, fmt, ap);
	va_end(ap);

	return (p);
}
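
/*
 * Usage sketch (hypothetical caller): the sizing pass makes the
 * allocation exactly strlen(p) + 1 bytes, so a caller that ever needed
 * to release one of these strings could do
 *
 *	char *p = sprintf_alloc("cpu%d", id);
 *	...
 *	kmem_free(p, strlen(p) + 1);
 *
 * The xenver strings below, though, are deliberately kept for the life
 * of the kernel.
 */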

void
xen_version(void)
{
	static const char strfmt[] = "%s";
	static const char xenver_sun[] = "3.0.4-1-xvm";  /* XXPV */
	union {
		xen_extraversion_t xver;
		xen_changeset_info_t chgset;
		xen_compile_info_t build;
		xen_capabilities_info_t caps;
	} data, *src = &data;

	ulong_t ver = HYPERVISOR_xen_version(XENVER_version, 0);

	if (HYPERVISOR_xen_version(XENVER_extraversion, src) == 0)
		((char *)(src->xver))[sizeof (src->xver) - 1] = '\0';
	else
		((char *)(src->xver))[0] = '\0';

	xenver.xv_ver = sprintf_alloc("%lu.%lu%s",
	    BITX(ver, 31, 16), BITX(ver, 15, 0), src->xver);

	if (HYPERVISOR_xen_version(XENVER_changeset, src) == 0) {
		((char *)(src->chgset))[sizeof (src->chgset) - 1] = '\0';
		xenver.xv_chgset = sprintf_alloc(strfmt, src->chgset);
	}

	cmn_err(CE_CONT, "?xen v%s chgset '%s'\n",
	    xenver.xv_ver, xenver.xv_chgset);

	/*
	 * XXPV - Solaris guests currently require a special version of
	 * the hypervisor from Sun, called "3.0.4-1-xvm", to function
	 * properly.  This version is based on "3.0.4-1" plus changes from
	 * Sun that are a work-in-progress.
	 *
	 * This version check will disappear after appropriate fixes
	 * are accepted upstream.
	 */
	if (strcmp(xenver.xv_ver, xenver_sun) != 0) {
		cmn_err(CE_WARN, "Found xen v%s but need xen v%s",
		    xenver.xv_ver, xenver_sun);
		cmn_err(CE_WARN, "The kernel may not function correctly");
	}

	if (HYPERVISOR_xen_version(XENVER_compile_info, src) == 0) {
		xenver.xv_compiler = sprintf_alloc(strfmt,
		    data.build.compiler);
		xenver.xv_compile_date = sprintf_alloc(strfmt,
		    data.build.compile_date);
		xenver.xv_compile_by = sprintf_alloc(strfmt,
		    data.build.compile_by);
		xenver.xv_compile_domain = sprintf_alloc(strfmt,
		    data.build.compile_domain);
	}

	/*
	 * Capabilities are a set of space-separated ASCII strings,
	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'.
	 */
	if (HYPERVISOR_xen_version(XENVER_capabilities, src) == 0) {
		((char *)(src->caps))[sizeof (src->caps) - 1] = '\0';
		xenver.xv_caps = sprintf_alloc(strfmt, src->caps);
	}
}

/*
 * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
 */

void
xen_set_gdt(ulong_t *frame_list, int entries)
{
	int err;

	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
		/*
		 * X_EINVAL:	reserved entry or bad frames
		 * X_EFAULT:	bad address
		 */
		panic("xen_set_gdt(%p, %d): error %d",
		    (void *)frame_list, entries, -(int)err);
	}
}

void
xen_set_ldt(user_desc_t *ldt, uint_t nsels)
{
	struct mmuext_op	op;
	long			err;

	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = (uintptr_t)ldt;
	op.arg2.nr_ents = nsels;

	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
		panic("xen_set_ldt(%p, %d): error %d",
		    (void *)ldt, nsels, -(int)err);
	}
}

void
xen_stack_switch(ulong_t ss, ulong_t esp)
{
	long err;

	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
		/*
		 * X_EPERM:	bad selector
		 */
		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
		    -(int)err);
	}
}

long
xen_set_trap_table(trap_info_t *table)
{
	long err;

	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
		/*
		 * X_EFAULT:	bad address
		 * X_EPERM:	bad selector
		 */
		panic("xen_set_trap_table(%p): error %d", (void *)table,
		    -(int)err);
	}
	return (err);
}

#if defined(__amd64)
void
xen_set_segment_base(int reg, ulong_t value)
{
	long err;

	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
		/*
		 * X_EFAULT:	bad address
		 * X_EINVAL:	bad type
		 */
		panic("xen_set_segment_base(%d, %lx): error %d",
		    reg, value, -(int)err);
	}
}
#endif	/* __amd64 */

/*
 * Translate a hypervisor errcode to a Solaris error code.
 */
int
xen_xlate_errcode(int error)
{
	switch (-error) {

	/*
	 * Translate hypervisor errno's into native errno's
	 */

#define	CASE(num)	case X_##num: error = num; break

	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
	CASE(ENODATA);

#undef CASE

	default:
		panic("xen_xlate_errcode: unknown error %d", error);
	}

	return (error);
}
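
/*
 * Usage sketch (illustrative): hypercalls fail with negated Xen errno
 * values, so a wrapper that must hand back a native errno can write
 *
 *	if ((err = HYPERVISOR_grant_table_op(cmd, args, count)) < 0)
 *		return (xen_xlate_errcode(err));
 */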

/*
 * Raise PS_IOPL on current vcpu to user level.
 * Caller responsible for preventing kernel preemption.
 */
void
xen_enable_user_iopl(void)
{
	physdev_set_iopl_t set_iopl;
	set_iopl.iopl = 3;		/* user ring 3 */
	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

/*
 * Drop PS_IOPL on current vcpu to kernel level.
 */
void
xen_disable_user_iopl(void)
{
	physdev_set_iopl_t set_iopl;
	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}
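
/*
 * Illustrative caller sequence (assumed, not taken from this file):
 *
 *	kpreempt_disable();
 *	xen_enable_user_iopl();
 *	... code that needs user-level I/O port access ...
 *	xen_disable_user_iopl();
 *	kpreempt_enable();
 */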

int
xen_gdt_setprot(cpu_t *cp, uint_t prot)
{
	int err;
#if defined(__amd64)
	int pt_bits = PT_VALID;
	if (prot & PROT_WRITE)
		pt_bits |= PT_WRITABLE;
#endif

	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
	    MMU_PAGESIZE, prot)) != 0)
		goto done;

#if defined(__amd64)
	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
#endif

done:
	if (err) {
		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
		    err);
	}

	return (err);
}

int
xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
{
	int err;
	caddr_t	lva = (caddr_t)ldt;
#if defined(__amd64)
	int pt_bits = PT_VALID;
	pgcnt_t npgs;
	if (prot & PROT_WRITE)
		pt_bits |= PT_WRITABLE;
#endif	/* __amd64 */

	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
		goto done;

#if defined(__amd64)

	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
	npgs = mmu_btop(lsize);
	while (npgs--) {
		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
		    pt_bits)) != 0)
			break;
		lva += PAGESIZE;
	}
#endif	/* __amd64 */

done:
	if (err) {
		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
		    (void *)lva,
		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
	}

	return (err);
}
1079