/*	$NetBSD: hypervisor_machdep.c,v 1.46 2023/03/01 08:13:44 riastradh Exp $	*/

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/******************************************************************************
 * hypervisor.c
 *
 * Communication to/from hypervisor.
 *
 * Copyright (c) 2002-2004, K A Fraser
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.46 2023/03/01 08:13:44 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/cpu.h>
#include <sys/ksyms.h>

#include <uvm/uvm_extern.h>

#include <machine/vmparam.h>
#include <machine/pmap.h>
#include <machine/pmap_private.h>

#include <x86/machdep.h>
#include <x86/cpuvar.h>

#include <xen/xen.h>
#include <xen/intr.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/xenpmap.h>

#include "opt_xen.h"
#include "opt_modular.h"
#include "opt_ddb.h"
#include "isa.h"
#include "pci.h"
#include "ksyms.h"

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
#include <ddb/db_output.h>
#include <ddb/db_interface.h>
#endif

#ifdef XENPV
/*
 * arch-dependent p2m frame lists list (L3 and L2)
 * used by Xen for save/restore mappings
 */
static unsigned long * l3_p2m_page;
static unsigned long * l2_p2m_page;
static int l2_p2m_page_size; /* size of L2 page, in pages */

static void build_p2m_frame_list_list(void);
static void update_p2m_frame_list_list(void);

#endif

// #define PORT_DEBUG 4
// #define EARLY_DEBUG_EVENT

/* callback function type */
typedef void (*iterate_func_t)(unsigned int, unsigned int,
			       unsigned int, void *);

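/*
 * evt_iterate_bits()
 *
 *	Scan the two-level event bitmap: atomically fetch and clear the
 *	L1 selector word, then for each set L1 bit walk the corresponding
 *	L2 word (restricted to this CPU's ci_evtmask and, if @mask is
 *	given, to channels not already masked), masking and clearing the
 *	bits and invoking @iterate_pending for every pending port found.
 */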
static inline void
evt_iterate_bits(volatile unsigned long *pendingl1,
		 volatile unsigned long *pendingl2,
		 volatile unsigned long *mask,
		 iterate_func_t iterate_pending, void *iterate_args)
{

	KASSERT(pendingl1 != NULL);
	KASSERT(pendingl2 != NULL);

	unsigned long l1, l2;
	unsigned int l1i, l2i, port;

	l1 = xen_atomic_xchg(pendingl1, 0);
	while ((l1i = xen_ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);

		l2 = pendingl2[l1i] & (mask != NULL ? ~mask[l1i] : -1UL);
		l2 &= curcpu()->ci_evtmask[l1i];

		if (mask != NULL) xen_atomic_setbits_l(&mask[l1i], l2);
		xen_atomic_clearbits_l(&pendingl2[l1i], l2);

		while ((l2i = xen_ffs(l2)) != 0) {
			l2i--;
			l2 &= ~(1UL << l2i);

			port = (l1i << LONG_SHIFT) + l2i;

			iterate_pending(port, l1i, l2i, iterate_args);
		}
	}
}

/*
 * Set per-cpu "pending" information for outstanding events that
 * cannot be processed now.
 */

static inline void
evt_set_pending(unsigned int port, unsigned int l1i,
		unsigned int l2i, void *args)
{

	KASSERT(args != NULL);

	int *ret = args;
	struct intrhand *ih;

	if (evtsource[port]) {
		hypervisor_set_ipending(evtsource[port]->ev_imask, l1i, l2i);
		evtsource[port]->ev_evcnt.ev_count++;
		ih = evtsource[port]->ev_handlers;
		while (ih != NULL) {
			ih->ih_pending++;
			ih = ih->ih_evt_next;
		}

		if (*ret == 0 && curcpu()->ci_ilevel <
		    evtsource[port]->ev_maxlevel)
			*ret = 1;
	}
#ifdef DOM0OPS
	else {
		/* set pending event */
		xenevt_setipending(l1i, l2i);
	}
#endif
}

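/*
 * stipending()
 *
 *	Scan for events that became pending while event delivery was off
 *	and record them with evt_set_pending().  Returns nonzero when at
 *	least one recorded event must be serviced at an IPL above the
 *	current one, i.e. when the caller should process pending interrupts.
 */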
int stipending(void);
int
stipending(void)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	volatile struct vcpu_info *vci;
	int ret;

	kpreempt_disable();

	ret = 0;
	ci = curcpu();
	vci = ci->ci_vcpu;

#if 0
	if (HYPERVISOR_shared_info->events)
		printf("stipending events %08lx mask %08lx ilevel %d\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
#endif

#ifdef EARLY_DEBUG_EVENT
	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
		xen_debug_handler(NULL);
		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
	}
#endif

	/*
	 * we're only called after STIC, so we know that we'll have to
	 * STI at the end
	 */

	while (vci->evtchn_upcall_pending) {
		x86_disable_intr();

		vci->evtchn_upcall_pending = 0;

		evt_iterate_bits(&vci->evtchn_pending_sel,
		    s->evtchn_pending, s->evtchn_mask,
		    evt_set_pending, &ret);

		x86_enable_intr();
	}

	kpreempt_enable();

	return (ret);
}

/* Iterate through pending events and call the event handler */

static inline void
evt_do_hypervisor_callback(unsigned int port, unsigned int l1i,
			   unsigned int l2i, void *args)
{
	KASSERT(args != NULL);

#ifdef DOM0OPS
	struct cpu_info *ci = curcpu();
#endif
	struct intrframe *regs = args;

#ifdef PORT_DEBUG
	if (port == PORT_DEBUG)
		printf("do_hypervisor_callback event %d\n", port);
#endif
	if (evtsource[port]) {
		KASSERT(cpu_intr_p());
		evtchn_do_event(port, regs);
	}
#ifdef DOM0OPS
	else {
		if (ci->ci_ilevel < IPL_HIGH) {
			/* fast path */
			int oipl = ci->ci_ilevel;
			ci->ci_ilevel = IPL_HIGH;
			KASSERT(cpu_intr_p());
			xenevt_event(port);
			ci->ci_ilevel = oipl;
		} else {
			/* set pending event */
			xenevt_setipending(l1i, l2i);
		}
	}
#endif
}

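/*
 * do_hypervisor_callback()
 *
 *	Event-channel upcall handler.  Saves the clock frame for the clock
 *	handler, then dispatches every pending event through
 *	evt_do_hypervisor_callback() until no upcall remains pending.
 */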
void
do_hypervisor_callback(struct intrframe *regs)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	volatile struct vcpu_info *vci;
	uint64_t level __diagused;

	ci = curcpu();
	vci = ci->ci_vcpu;
	level = ci->ci_ilevel;

	/* Save trapframe for clock handler */
	KASSERT(regs != NULL);
	ci->ci_xen_clockf_usermode = USERMODE(regs->_INTRFRAME_CS);
	ci->ci_xen_clockf_pc = regs->_INTRFRAME_IP;

	// DDD printf("do_hypervisor_callback\n");

#ifdef EARLY_DEBUG_EVENT
	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
		xen_debug_handler(NULL);
		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
	}
#endif

	while (vci->evtchn_upcall_pending) {
		vci->evtchn_upcall_pending = 0;

		evt_iterate_bits(&vci->evtchn_pending_sel,
		    s->evtchn_pending, s->evtchn_mask,
		    evt_do_hypervisor_callback, regs);
	}

#ifdef DIAGNOSTIC
	if (level != ci->ci_ilevel)
		printf("hypervisor done %08x level %" PRIu64 "/%" PRIu64 " ipending %0" PRIx64 "\n",
		    (uint)vci->evtchn_pending_sel,
		    level, (uint64_t)ci->ci_ilevel, (uint64_t)ci->ci_ipending);
#endif
}

#if 0
void
hypervisor_send_event(struct cpu_info *ci, unsigned int ev)
{
	KASSERT(ci != NULL);

	volatile shared_info_t *s = HYPERVISOR_shared_info;
	volatile struct vcpu_info *vci = ci->ci_vcpu;

#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_send_event %d\n", ev);
#endif

	xen_atomic_set_bit(&s->evtchn_pending[0], ev);

	if (__predict_false(ci == curcpu())) {
		xen_atomic_set_bit(&vci->evtchn_pending_sel,
		    ev >> LONG_SHIFT);
		xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0);
	}

	xen_atomic_clear_bit(&s->evtchn_mask[0], ev);

	if (__predict_true(ci == curcpu())) {
		hypervisor_force_callback();
	} else {
		if (__predict_false(xen_send_ipi(ci, XEN_IPI_HVCB))) {
			panic("xen_send_ipi(cpu%d id %d, XEN_IPI_HVCB) failed\n",
			    (int) ci->ci_cpuid, ci->ci_vcpuid);
		}
	}
}
#endif

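/*
 * hypervisor_unmask_event()
 *
 *	Unmask an event channel via the EVTCHNOP_unmask hypercall, letting
 *	Xen clear the mask bit and re-check the channel's pending state.
 */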
void
hypervisor_unmask_event(unsigned int ev)
{

	KASSERT(ev > 0 && ev < NR_EVENT_CHANNELS);

#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_unmask_event %d\n", ev);
#endif

	/* Xen unmasks the evtchn_mask[0]:ev bit for us. */
	evtchn_op_t op;
	op.cmd = EVTCHNOP_unmask;
	op.u.unmask.port = ev;
	if (HYPERVISOR_event_channel_op(&op) != 0)
		panic("Failed to unmask event %d\n", ev);

	return;
}

void
hypervisor_mask_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_mask_event %d\n", ev);
#endif

	xen_atomic_set_bit(&s->evtchn_mask[0], ev);
}

void
hypervisor_clear_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_clear_event %d\n", ev);
#endif

	xen_atomic_clear_bit(&s->evtchn_pending[0], ev);
}

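/*
 * evt_enable_event()
 *
 *	Callback for hypervisor_enable_sir(): re-enable one event channel
 *	by unmasking it (and, for PV pirq events, acking it).
 */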
static inline void
evt_enable_event(unsigned int port, unsigned int l1i,
		 unsigned int l2i, void *args)
{
	KASSERT(args == NULL);
	hypervisor_unmask_event(port);
#if defined(XENPV) && (NPCI > 0 || NISA > 0)
	hypervisor_ack_pirq_event(port);
#endif /* NPCI > 0 || NISA > 0 */
}

void
hypervisor_enable_sir(unsigned int sir)
{
	struct cpu_info *ci = curcpu();

	/*
	 * Enable all events for this ipl. As we only set an event in
	 * ipl_evt_mask for its lowest IPL, and pending IPLs are processed
	 * high to low, we know that all callbacks for this event have
	 * been processed.
	 */

	evt_iterate_bits(&ci->ci_isources[sir]->ipl_evt_mask1,
	    ci->ci_isources[sir]->ipl_evt_mask2, NULL,
	    evt_enable_event, NULL);

}

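/*
 * hypervisor_set_ipending()
 *
 *	Record a deferred event: mark the IPLs in @imask as pending on this
 *	CPU and remember the event's (l1, l2) bit position in the lowest
 *	IPL's ipl_evt_mask so hypervisor_enable_sir() can re-enable the
 *	channel once that IPL has been processed.
 */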
void
hypervisor_set_ipending(uint64_t imask, int l1, int l2)
{

	/* This function is not re-entrant */
	KASSERT(x86_read_psl() != 0);

	int sir;
	struct cpu_info *ci = curcpu();

	/* set pending bit for the appropriate IPLs */
	ci->ci_ipending |= imask;

	/*
	 * And set the event pending bit for the lowest IPL. As IPLs are
	 * handled from high to low, this ensures that all callbacks will
	 * have been called when we ack the event.
	 */
	sir = ffs(imask);
	KASSERT(sir > SIR_XENIPL_VM);
	sir--;
	KASSERT(sir <= SIR_XENIPL_HIGH);
	KASSERT(ci->ci_isources[sir] != NULL);
	ci->ci_isources[sir]->ipl_evt_mask1 |= 1UL << l1;
	ci->ci_isources[sir]->ipl_evt_mask2[l1] |= 1UL << l2;
	KASSERT(ci == curcpu());
#if 0
	if (__predict_false(ci != curcpu())) {
		if (xen_send_ipi(ci, XEN_IPI_HVCB)) {
			panic("hypervisor_set_ipending: "
			    "xen_send_ipi(cpu%d id %d, XEN_IPI_HVCB) failed\n",
			    (int) ci->ci_cpuid, ci->ci_vcpuid);
		}
	}
#endif
}

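/*
 * Machine-dependent hypervisor attach/resume hooks.  For a PV domU they
 * (re)build the p2m frame list list that Xen consults during save/restore.
 */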
void
hypervisor_machdep_attach(void)
{
#ifdef XENPV
	/* dom0 does not require the arch-dependent P2M translation table */
	if (!xendomain_is_dom0()) {
		build_p2m_frame_list_list();
		sysctl_xen_suspend_setup();
	}
#endif
}

void
hypervisor_machdep_resume(void)
{
#ifdef XENPV
	/* dom0 does not require the arch-dependent P2M translation table */
	if (!xendomain_is_dom0())
		update_p2m_frame_list_list();
#endif
}

/*
 * idle_block()
 *
 *	Called from the idle loop when we have nothing to do but wait
 *	for an interrupt.
 */
static void
idle_block(void)
{
	KASSERT(curcpu()->ci_ipending == 0);
	HYPERVISOR_block();
	KASSERT(curcpu()->ci_ipending == 0);
}

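/*
 * x86_cpu_idle_xen()
 *
 *	Idle entry point under Xen: with interrupts disabled, either block
 *	in the hypervisor until an event arrives or, if a reschedule is
 *	already requested, simply re-enable interrupts and return.
 */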
void
x86_cpu_idle_xen(void)
{
	struct cpu_info *ci = curcpu();

	KASSERT(ci->ci_ilevel == IPL_NONE);

	x86_disable_intr();
	if (__predict_false(!ci->ci_want_resched)) {
		idle_block();
	} else {
		x86_enable_intr();
	}
}

#ifdef XENPV
/*
 * Generate the p2m_frame_list_list table,
 * needed for guest save/restore
 */
static void
build_p2m_frame_list_list(void)
{
        int fpp; /* number of page (frame) pointers per page */
        unsigned long max_pfn;
        /*
         * The p2m list is composed of three levels of indirection,
         * each layer containing MFNs pointing to lower-level pages.
         * The indirection is used to convert a given PFN to its MFN.
         * Each level-N page can point to @fpp level-(N-1) pages.
         * For example, for 32-bit x86 we have:
         * - PAGE_SIZE: 4096 bytes
         * - fpp: 1024 (one L3 page can address 1024 L2 pages)
         * An L1 page contains the list of MFNs we are looking for.
         */
        max_pfn = xen_start_info.nr_pages;
        fpp = PAGE_SIZE / sizeof(xen_pfn_t);

        /* we only need one L3 page */
        l3_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE,
	    PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
        if (l3_p2m_page == NULL)
                panic("could not allocate memory for l3_p2m_page");

        /*
         * Determine how many L2 pages we need for the mapping
         * Each L2 can map a total of @fpp L1 pages
         */
        l2_p2m_page_size = howmany(max_pfn, fpp);

        l2_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map,
	    l2_p2m_page_size * PAGE_SIZE,
	    PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
        if (l2_p2m_page == NULL)
                panic("could not allocate memory for l2_p2m_page");

        /* We now have L3 and L2 pages ready, update L1 mapping */
        update_p2m_frame_list_list();

}

/*
 * Update the L1 p2m_frame_list_list mapping (during guest boot or resume)
 */
static void
update_p2m_frame_list_list(void)
{
        int i;
        int fpp; /* number of page (frame) pointers per page */
        unsigned long max_pfn;

        max_pfn = xen_start_info.nr_pages;
        fpp = PAGE_SIZE / sizeof(xen_pfn_t);

        for (i = 0; i < l2_p2m_page_size; i++) {
                /*
                 * Each time we start a new L2 page,
                 * store its MFN in the L3 page
                 */
                if ((i % fpp) == 0) {
                        l3_p2m_page[i/fpp] = vtomfn(
                                (vaddr_t)&l2_p2m_page[i]);
                }
                /*
                 * We use a shortcut: since the
                 * @xpmap_phys_to_machine_mapping array already contains
                 * the PFN to MFN mapping, we just set each l2_p2m_page
                 * entry to the MFN of the corresponding frame of
                 * @xpmap_phys_to_machine_mapping.
                 */
                l2_p2m_page[i] = vtomfn((vaddr_t)
                        &xpmap_phys_to_machine_mapping[i*fpp]);
        }

        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
                                        vtomfn((vaddr_t)l3_p2m_page);
        HYPERVISOR_shared_info->arch.max_pfn = max_pfn;

}
#endif /* XENPV */

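/*
 * xen_init_ksyms()
 *
 *	Register the kernel symbol table with ksyms(9).  For PV the end of
 *	the symbol area comes from xen_start_info; for PVH, esym is set up
 *	in locore.S.
 */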
void
xen_init_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
	extern int end;
	extern int *esym;
#ifdef DDB
	db_machine_init();
#endif

#ifdef XENPV
	esym = xen_start_info.mod_start ?
	    (void *)xen_start_info.mod_start :
	    (void *)xen_start_info.mfn_list;
#endif /* XENPV */
	/* for PVH, esym is set in locore.S */
	ksyms_addsyms_elf(*(int *)(void *)&end,
	    ((int *)(void *)&end) + 1, esym);
#endif
}