1 /*	$OpenBSD: virtio.c,v 1.110 2023/11/03 11:16:43 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE */
20 #include <sys/socket.h>
21 #include <sys/wait.h>
22 
23 #include <machine/vmmvar.h>
24 #include <dev/pci/pcireg.h>
25 #include <dev/pci/pcidevs.h>
26 #include <dev/pv/virtioreg.h>
27 #include <dev/pci/virtio_pcireg.h>
28 #include <dev/pv/vioblkreg.h>
29 #include <dev/pv/vioscsireg.h>
30 
31 #include <net/if.h>
32 #include <netinet/in.h>
33 #include <netinet/if_ether.h>
34 #include <netinet/ip.h>
35 
36 #include <errno.h>
37 #include <event.h>
38 #include <fcntl.h>
39 #include <poll.h>
40 #include <stddef.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <unistd.h>
44 
45 #include "atomicio.h"
46 #include "pci.h"
47 #include "vioscsi.h"
48 #include "virtio.h"
49 #include "vmd.h"
50 #include "vmm.h"
51 
52 extern struct vmd *env;
53 extern char *__progname;
54 
55 struct viornd_dev viornd;
56 struct vioscsi_dev *vioscsi;
57 struct vmmci_dev vmmci;
58 
59 /* Devices emulated in subprocesses are inserted into this list. */
60 SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;
61 
62 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
63 
64 #define VIRTIO_NET_F_MAC	(1<<5)
65 
66 #define VMMCI_F_TIMESYNC	(1<<0)
67 #define VMMCI_F_ACK		(1<<1)
68 #define VMMCI_F_SYNCRTC		(1<<2)
69 
70 #define RXQ	0
71 #define TXQ	1
72 
73 static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
74 static void virtio_dispatch_dev(int, short, void *);
75 static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
76 
77 const char *
78 virtio_reg_name(uint8_t reg)
79 {
80 	switch (reg) {
81 	case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
82 	case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
83 	case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
84 	case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
85 	case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
86 	case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
87 	case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
88 	case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
89 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI...VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
90 		return "device config 0";
91 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
92 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
93 		return "device config 1";
94 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
95 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
96 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
97 	default: return "unknown";
98 	}
99 }
100 
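/*
 * Compute the number of bytes needed to back a virtqueue of vq_size
 * entries in guest memory: the descriptor table plus the avail ring
 * (padded to the ring alignment), followed by the used ring (also
 * padded).
 *
 * Rough worked example, assuming a 64-entry queue, 16-byte descriptors,
 * 8-byte used elements and the legacy 4096-byte VIRTQUEUE_ALIGN:
 *	desc + avail: 16 * 64 + 2 * (2 + 64) = 1156 -> 4096
 *	used:         2 * 2  + 8 * 64        =  516 -> 4096
 *	total:        8192 bytes
 */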
101 uint32_t
102 vring_size(uint32_t vq_size)
103 {
104 	uint32_t allocsize1, allocsize2;
105 
106 	/* allocsize1: descriptor table + avail ring + pad */
107 	allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
108 	    + sizeof(uint16_t) * (2 + vq_size));
109 	/* allocsize2: used ring + pad */
110 	allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
111 	    + sizeof(struct vring_used_elem) * vq_size);
112 
113 	return allocsize1 + allocsize2;
114 }
115 
116 /* Update queue select */
117 void
118 viornd_update_qs(void)
119 {
120 	struct virtio_vq_info *vq_info;
121 
122 	/* Invalid queue? */
123 	if (viornd.cfg.queue_select > 0) {
124 		viornd.cfg.queue_size = 0;
125 		return;
126 	}
127 
128 	vq_info = &viornd.vq[viornd.cfg.queue_select];
129 
130 	/* Update queue pfn/size based on queue select */
131 	viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
132 	viornd.cfg.queue_size = vq_info->qs;
133 }
134 
135 /* Update queue address */
136 void
137 viornd_update_qa(void)
138 {
139 	struct virtio_vq_info *vq_info;
140 	void *hva = NULL;
141 
142 	/* Invalid queue? */
143 	if (viornd.cfg.queue_select > 0)
144 		return;
145 
146 	vq_info = &viornd.vq[viornd.cfg.queue_select];
147 	vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;
148 
149 	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
150 	if (hva == NULL)
151 		fatalx("viornd_update_qa");
152 	vq_info->q_hva = hva;
153 }
154 
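/*
 * Process a notification on the entropy queue: take the descriptor the
 * guest posted on the avail ring, fill it with random data and place it
 * on the used ring.  Returns 1 if the caller should inject an interrupt,
 * 0 otherwise.
 */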
155 int
156 viornd_notifyq(void)
157 {
158 	size_t sz;
159 	int dxx, ret;
160 	uint16_t aidx, uidx;
161 	char *vr, *rnd_data;
162 	struct vring_desc *desc;
163 	struct vring_avail *avail;
164 	struct vring_used *used;
165 	struct virtio_vq_info *vq_info;
166 
167 	ret = 0;
168 
169 	/* Invalid queue? */
170 	if (viornd.cfg.queue_notify > 0)
171 		return (0);
172 
173 	vq_info = &viornd.vq[viornd.cfg.queue_notify];
174 	vr = vq_info->q_hva;
175 	if (vr == NULL)
176 		fatalx("%s: null vring", __func__);
177 
178 	desc = (struct vring_desc *)(vr);
179 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
180 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
181 
182 	aidx = avail->idx & VIORND_QUEUE_MASK;
183 	uidx = used->idx & VIORND_QUEUE_MASK;
184 
185 	dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;
186 
187 	sz = desc[dxx].len;
188 	if (sz > MAXPHYS)
189 		fatalx("viornd descriptor size too large (%zu)", sz);
190 
191 	rnd_data = malloc(sz);
192 
193 	if (rnd_data != NULL) {
194 		arc4random_buf(rnd_data, sz);
195 		if (write_mem(desc[dxx].addr, rnd_data, sz)) {
196 			log_warnx("viornd: can't write random data @ "
197 			    "0x%llx",
198 			    desc[dxx].addr);
199 		} else {
200 			/* ret == 1 -> interrupt needed */
201 			/* XXX check VIRTIO_F_NO_INTR */
202 			ret = 1;
203 			viornd.cfg.isr_status = 1;
204 			used->ring[uidx].id = dxx;
205 			used->ring[uidx].len = sz;
206 			__sync_synchronize();
207 			used->idx++;
208 		}
209 		free(rnd_data);
210 	} else
211 		fatal("memory allocation error for viornd data");
212 
213 	return (ret);
214 }
215 
216 int
217 virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
218     void *unused, uint8_t sz)
219 {
220 	*intr = 0xFF;
221 
222 	if (dir == 0) {
223 		switch (reg) {
224 		case VIRTIO_CONFIG_DEVICE_FEATURES:
225 		case VIRTIO_CONFIG_QUEUE_SIZE:
226 		case VIRTIO_CONFIG_ISR_STATUS:
227 			log_warnx("%s: illegal write %x to %s",
228 			    __progname, *data, virtio_reg_name(reg));
229 			break;
230 		case VIRTIO_CONFIG_GUEST_FEATURES:
231 			viornd.cfg.guest_feature = *data;
232 			break;
233 		case VIRTIO_CONFIG_QUEUE_PFN:
234 			viornd.cfg.queue_pfn = *data;
235 			viornd_update_qa();
236 			break;
237 		case VIRTIO_CONFIG_QUEUE_SELECT:
238 			viornd.cfg.queue_select = *data;
239 			viornd_update_qs();
240 			break;
241 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
242 			viornd.cfg.queue_notify = *data;
243 			if (viornd_notifyq())
244 				*intr = 1;
245 			break;
246 		case VIRTIO_CONFIG_DEVICE_STATUS:
247 			viornd.cfg.device_status = *data;
248 			break;
249 		}
250 	} else {
251 		switch (reg) {
252 		case VIRTIO_CONFIG_DEVICE_FEATURES:
253 			*data = viornd.cfg.device_feature;
254 			break;
255 		case VIRTIO_CONFIG_GUEST_FEATURES:
256 			*data = viornd.cfg.guest_feature;
257 			break;
258 		case VIRTIO_CONFIG_QUEUE_PFN:
259 			*data = viornd.cfg.queue_pfn;
260 			break;
261 		case VIRTIO_CONFIG_QUEUE_SIZE:
262 			*data = viornd.cfg.queue_size;
263 			break;
264 		case VIRTIO_CONFIG_QUEUE_SELECT:
265 			*data = viornd.cfg.queue_select;
266 			break;
267 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
268 			*data = viornd.cfg.queue_notify;
269 			break;
270 		case VIRTIO_CONFIG_DEVICE_STATUS:
271 			*data = viornd.cfg.device_status;
272 			break;
273 		case VIRTIO_CONFIG_ISR_STATUS:
274 			*data = viornd.cfg.isr_status;
275 			viornd.cfg.isr_status = 0;
276 			vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq);
277 			break;
278 		}
279 	}
280 	return (0);
281 }
282 
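/*
 * Send a control command (shutdown, reboot or RTC sync) to the guest by
 * latching it in vmmci.cmd and raising a config-change interrupt.  The
 * guest acknowledges by writing the command back to the config register
 * (see vmmci_ack()).  For shutdown and reboot an ACK timeout is armed;
 * if the guest never acknowledges, vmmci_timeout() forces the request
 * through.  Returns -1 if the guest driver is not yet ready.
 */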
283 int
284 vmmci_ctl(unsigned int cmd)
285 {
286 	struct timeval tv = { 0, 0 };
287 
288 	if ((vmmci.cfg.device_status &
289 	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
290 		return (-1);
291 
292 	if (cmd == vmmci.cmd)
293 		return (0);
294 
295 	switch (cmd) {
296 	case VMMCI_NONE:
297 		break;
298 	case VMMCI_SHUTDOWN:
299 	case VMMCI_REBOOT:
300 		/* Update command */
301 		vmmci.cmd = cmd;
302 
303 		/*
304 		 * vmm VMs do not support powerdown, send a reboot request
305 		 * instead and turn it off after the triple fault.
306 		 */
307 		if (cmd == VMMCI_SHUTDOWN)
308 			cmd = VMMCI_REBOOT;
309 
310 		/* Trigger interrupt */
311 		vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
312 		vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
313 
314 		/* Add ACK timeout */
315 		tv.tv_sec = VMMCI_TIMEOUT;
316 		evtimer_add(&vmmci.timeout, &tv);
317 		break;
318 	case VMMCI_SYNCRTC:
319 		if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
320 			/* RTC updated, request guest VM resync of its RTC */
321 			vmmci.cmd = cmd;
322 
323 			vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
324 			vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
325 		} else {
326 			log_debug("%s: RTC sync skipped (guest does not "
327 			    "support RTC sync)", __func__);
328 		}
329 		break;
330 	default:
331 		fatalx("invalid vmmci command: %d", cmd);
332 	}
333 
334 	return (0);
335 }
336 
337 void
338 vmmci_ack(unsigned int cmd)
339 {
340 	struct timeval	 tv = { 0, 0 };
341 
342 	switch (cmd) {
343 	case VMMCI_NONE:
344 		break;
345 	case VMMCI_SHUTDOWN:
346 		/*
347 		 * If we don't have a pending shutdown request of our own,
348 		 * the shutdown was initiated by the VM itself.  In this
349 		 * case, add a short timeout to give the VM a chance to
350 		 * reboot before the timer expires.
351 		 */
352 		if (vmmci.cmd == 0) {
353 			log_debug("%s: vm %u requested shutdown", __func__,
354 			    vmmci.vm_id);
355 			tv.tv_sec = VMMCI_TIMEOUT;
356 			evtimer_add(&vmmci.timeout, &tv);
357 			return;
358 		}
359 		/* FALLTHROUGH */
360 	case VMMCI_REBOOT:
361 		/*
362 		 * If the VM acknowledged our shutdown request, give it
363 		 * enough time to shutdown or reboot gracefully.  This
364 		 * might take a considerable amount of time (running
365 		 * rc.shutdown on the VM), so increase the timeout before
366 		 * killing it forcefully.
367 		 */
368 		if (cmd == vmmci.cmd &&
369 		    evtimer_pending(&vmmci.timeout, NULL)) {
370 			log_debug("%s: vm %u acknowledged shutdown request",
371 			    __func__, vmmci.vm_id);
372 			tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
373 			evtimer_add(&vmmci.timeout, &tv);
374 		}
375 		break;
376 	case VMMCI_SYNCRTC:
377 		log_debug("%s: vm %u acknowledged RTC sync request",
378 		    __func__, vmmci.vm_id);
379 		vmmci.cmd = VMMCI_NONE;
380 		break;
381 	default:
382 		log_warnx("%s: illegal request %u", __func__, cmd);
383 		break;
384 	}
385 }
386 
387 void
388 vmmci_timeout(int fd, short type, void *arg)
389 {
390 	log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
391 	vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
392 }
393 
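/*
 * I/O handler for the vmmci config space.  Device-specific registers
 * (legacy, non-MSI layout), as implemented below:
 *	+0	current command (VMMCI_*); the guest writes it back to ACK
 *	+4	tv_sec, low 32 bits (reading this refreshes vmmci.time)
 *	+8	tv_sec, high 32 bits
 *	+12	tv_usec, low 32 bits
 *	+16	tv_usec, high 32 bits
 */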
394 int
395 vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
396     void *unused, uint8_t sz)
397 {
398 	*intr = 0xFF;
399 
400 	if (dir == 0) {
401 		switch (reg) {
402 		case VIRTIO_CONFIG_DEVICE_FEATURES:
403 		case VIRTIO_CONFIG_QUEUE_SIZE:
404 		case VIRTIO_CONFIG_ISR_STATUS:
405 			log_warnx("%s: illegal write %x to %s",
406 			    __progname, *data, virtio_reg_name(reg));
407 			break;
408 		case VIRTIO_CONFIG_GUEST_FEATURES:
409 			vmmci.cfg.guest_feature = *data;
410 			break;
411 		case VIRTIO_CONFIG_QUEUE_PFN:
412 			vmmci.cfg.queue_pfn = *data;
413 			break;
414 		case VIRTIO_CONFIG_QUEUE_SELECT:
415 			vmmci.cfg.queue_select = *data;
416 			break;
417 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
418 			vmmci.cfg.queue_notify = *data;
419 			break;
420 		case VIRTIO_CONFIG_DEVICE_STATUS:
421 			vmmci.cfg.device_status = *data;
422 			break;
423 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
424 			vmmci_ack(*data);
425 			break;
426 		}
427 	} else {
428 		switch (reg) {
429 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
430 			*data = vmmci.cmd;
431 			break;
432 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
433 			/* Update time once when reading the first register */
434 			gettimeofday(&vmmci.time, NULL);
435 			*data = (uint64_t)vmmci.time.tv_sec;
436 			break;
437 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
438 			*data = (uint64_t)vmmci.time.tv_sec >> 32;
439 			break;
440 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
441 			*data = (uint64_t)vmmci.time.tv_usec;
442 			break;
443 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
444 			*data = (uint64_t)vmmci.time.tv_usec >> 32;
445 			break;
446 		case VIRTIO_CONFIG_DEVICE_FEATURES:
447 			*data = vmmci.cfg.device_feature;
448 			break;
449 		case VIRTIO_CONFIG_GUEST_FEATURES:
450 			*data = vmmci.cfg.guest_feature;
451 			break;
452 		case VIRTIO_CONFIG_QUEUE_PFN:
453 			*data = vmmci.cfg.queue_pfn;
454 			break;
455 		case VIRTIO_CONFIG_QUEUE_SIZE:
456 			*data = vmmci.cfg.queue_size;
457 			break;
458 		case VIRTIO_CONFIG_QUEUE_SELECT:
459 			*data = vmmci.cfg.queue_select;
460 			break;
461 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
462 			*data = vmmci.cfg.queue_notify;
463 			break;
464 		case VIRTIO_CONFIG_DEVICE_STATUS:
465 			*data = vmmci.cfg.device_status;
466 			break;
467 		case VIRTIO_CONFIG_ISR_STATUS:
468 			*data = vmmci.cfg.isr_status;
469 			vmmci.cfg.isr_status = 0;
470 			vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
471 			break;
472 		}
473 	}
474 	return (0);
475 }
476 
477 int
478 virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
479 {
480 	switch (type) {
481 	case VMDF_RAW:
482 		return 0;
483 	case VMDF_QCOW2:
484 		return virtio_qcow2_get_base(fd, path, npath, dpath);
485 	}
486 	log_warnx("%s: invalid disk format", __func__);
487 	return -1;
488 }
489 
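/*
 * Create and wire up the virtio PCI devices for a new VM: the entropy
 * device, the vioscsi cdrom and vmmci are emulated in this process,
 * while vionet and vioblk devices are placed on the virtio_devs list and
 * launched as separate device processes via virtio_dev_launch().
 */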
490 void
491 virtio_init(struct vmd_vm *vm, int child_cdrom,
492     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
493 {
494 	struct vmop_create_params *vmc = &vm->vm_params;
495 	struct vm_create_params *vcp = &vmc->vmc_params;
496 	struct virtio_dev *dev;
497 	uint8_t id;
498 	uint8_t i, j;
499 
500 	/* Virtio entropy device */
501 	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
502 	    PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
503 	    PCI_SUBCLASS_SYSTEM_MISC,
504 	    PCI_VENDOR_OPENBSD,
505 	    PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
506 		log_warnx("%s: can't add PCI virtio rng device",
507 		    __progname);
508 		return;
509 	}
510 
511 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
512 		log_warnx("%s: can't add bar for virtio rng device",
513 		    __progname);
514 		return;
515 	}
516 
517 	memset(&viornd, 0, sizeof(viornd));
518 	viornd.vq[0].qs = VIORND_QUEUE_SIZE;
519 	viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
520 	    VIORND_QUEUE_SIZE;
521 	viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
522 	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
523 	    + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
524 	viornd.pci_id = id;
525 	viornd.irq = pci_get_dev_irq(id);
526 	viornd.vm_id = vcp->vcp_id;
527 
528 	SLIST_INIT(&virtio_devs);
529 
530 	if (vmc->vmc_nnics > 0) {
531 		for (i = 0; i < vmc->vmc_nnics; i++) {
532 			dev = calloc(1, sizeof(struct virtio_dev));
533 			if (dev == NULL) {
534 				log_warn("%s: calloc failure allocating vionet",
535 				    __progname);
536 				return;
537 			}
538 			/* Virtio network */
539 			dev->dev_type = VMD_DEVTYPE_NET;
540 
541 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
542 				PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
543 				PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
544 				PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
545 				log_warnx("%s: can't add PCI virtio net device",
546 				    __progname);
547 				return;
548 			}
549 			dev->pci_id = id;
550 			dev->sync_fd = -1;
551 			dev->async_fd = -1;
552 			dev->vm_id = vcp->vcp_id;
553 			dev->vm_vmid = vm->vm_vmid;
554 			dev->irq = pci_get_dev_irq(id);
555 
556 			/* The vionet pci bar function is called by the vcpu. */
557 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
558 			    dev)) {
559 				log_warnx("%s: can't add bar for virtio net "
560 				    "device", __progname);
561 				return;
562 			}
563 
564 			dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
565 			dev->vionet.vq[RXQ].vq_availoffset =
566 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
567 			dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
568 				sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
569 				+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
570 			dev->vionet.vq[RXQ].last_avail = 0;
571 			dev->vionet.vq[RXQ].notified_avail = 0;
572 
573 			dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
574 			dev->vionet.vq[TXQ].vq_availoffset =
575 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
576 			dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
577 				sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
578 				+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
579 			dev->vionet.vq[TXQ].last_avail = 0;
580 			dev->vionet.vq[TXQ].notified_avail = 0;
581 
582 			dev->vionet.data_fd = child_taps[i];
583 
584 			/* MAC address has been assigned by the parent */
585 			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
586 			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
587 
588 			dev->vionet.lockedmac =
589 			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
590 			dev->vionet.local =
591 			    vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
592 			if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
593 				dev->vionet.pxeboot = 1;
594 			memcpy(&dev->vionet.local_prefix,
595 			    &env->vmd_cfg.cfg_localprefix,
596 			    sizeof(dev->vionet.local_prefix));
597 			log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
598 			    __func__, vcp->vcp_name, i,
599 			    ether_ntoa((void *)dev->vionet.mac),
600 			    dev->vionet.lockedmac ? ", locked" : "",
601 			    dev->vionet.local ? ", local" : "",
602 			    dev->vionet.pxeboot ? ", pxeboot" : "");
603 
604 			/* Add the vionet to our device list. */
605 			dev->vionet.idx = i;
606 			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
607 		}
608 	}
609 
610 	if (vmc->vmc_ndisks > 0) {
611 		for (i = 0; i < vmc->vmc_ndisks; i++) {
612 			dev = calloc(1, sizeof(struct virtio_dev));
613 			if (dev == NULL) {
614 				log_warn("%s: calloc failure allocating vioblk",
615 				    __progname);
616 				return;
617 			}
618 
619 			/* One vioblk device for each disk defined in vcp */
620 			dev->dev_type = VMD_DEVTYPE_DISK;
621 
622 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
623 			    PCI_PRODUCT_QUMRANET_VIO_BLOCK,
624 			    PCI_CLASS_MASS_STORAGE,
625 			    PCI_SUBCLASS_MASS_STORAGE_SCSI,
626 			    PCI_VENDOR_OPENBSD,
627 			    PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
628 				log_warnx("%s: can't add PCI virtio block "
629 				    "device", __progname);
630 				return;
631 			}
632 			dev->pci_id = id;
633 			dev->sync_fd = -1;
634 			dev->async_fd = -1;
635 			dev->vm_id = vcp->vcp_id;
636 			dev->vm_vmid = vm->vm_vmid;
637 			dev->irq = pci_get_dev_irq(id);
638 
639 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
640 			    &dev->vioblk)) {
641 				log_warnx("%s: can't add bar for virtio block "
642 				    "device", __progname);
643 				return;
644 			}
645 			dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
646 			dev->vioblk.vq[0].vq_availoffset =
647 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
648 			dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
649 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
650 			    + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
651 			dev->vioblk.vq[0].last_avail = 0;
652 			dev->vioblk.cfg.device_feature =
653 			    VIRTIO_BLK_F_SEG_MAX;
654 			dev->vioblk.seg_max = VIOBLK_SEG_MAX;
655 
656 			/*
657 			 * Initialize disk fds to an invalid fd (-1), then
658 			 * set any child disk fds.
659 			 */
660 			memset(&dev->vioblk.disk_fd, -1,
661 			    sizeof(dev->vioblk.disk_fd));
662 			dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
663 			for (j = 0; j < dev->vioblk.ndisk_fd; j++)
664 				dev->vioblk.disk_fd[j] = child_disks[i][j];
665 
666 			dev->vioblk.idx = i;
667 			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
668 		}
669 	}
670 
671 	/*
672 	 * Launch virtio devices that support subprocess execution.
673 	 */
674 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
675 		if (virtio_dev_launch(vm, dev) != 0)
676 			fatalx("failed to launch virtio device");
677 	}
678 
679 	/* vioscsi cdrom */
680 	if (strlen(vmc->vmc_cdrom)) {
681 		vioscsi = calloc(1, sizeof(struct vioscsi_dev));
682 		if (vioscsi == NULL) {
683 			log_warn("%s: calloc failure allocating vioscsi",
684 			    __progname);
685 			return;
686 		}
687 
688 		if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
689 		    PCI_PRODUCT_QUMRANET_VIO_SCSI,
690 		    PCI_CLASS_MASS_STORAGE,
691 		    PCI_SUBCLASS_MASS_STORAGE_SCSI,
692 		    PCI_VENDOR_OPENBSD,
693 		    PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
694 			log_warnx("%s: can't add PCI vioscsi device",
695 			    __progname);
696 			return;
697 		}
698 
699 		if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
700 			log_warnx("%s: can't add bar for vioscsi device",
701 			    __progname);
702 			return;
703 		}
704 
705 		for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
706 			vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
707 			vioscsi->vq[i].vq_availoffset =
708 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
709 			vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
710 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
711 			    + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
712 			vioscsi->vq[i].last_avail = 0;
713 		}
714 		if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
715 		    1) == -1) {
716 			log_warnx("%s: unable to determine iso format",
717 			    __func__);
718 			return;
719 		}
720 		vioscsi->locked = 0;
721 		vioscsi->lba = 0;
722 		vioscsi->n_blocks = vioscsi->sz / VIOSCSI_BLOCK_SIZE_CDROM;
723 		vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
724 		vioscsi->pci_id = id;
725 		vioscsi->vm_id = vcp->vcp_id;
726 		vioscsi->irq = pci_get_dev_irq(id);
727 	}
728 
729 	/* virtio control device */
730 	if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
731 	    PCI_PRODUCT_OPENBSD_CONTROL,
732 	    PCI_CLASS_COMMUNICATIONS,
733 	    PCI_SUBCLASS_COMMUNICATIONS_MISC,
734 	    PCI_VENDOR_OPENBSD,
735 	    PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
736 		log_warnx("%s: can't add PCI vmm control device",
737 		    __progname);
738 		return;
739 	}
740 
741 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
742 		log_warnx("%s: can't add bar for vmm control device",
743 		    __progname);
744 		return;
745 	}
746 
747 	memset(&vmmci, 0, sizeof(vmmci));
748 	vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
749 	    VMMCI_F_SYNCRTC;
750 	vmmci.vm_id = vcp->vcp_id;
751 	vmmci.irq = pci_get_dev_irq(id);
752 	vmmci.pci_id = id;
753 
754 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
755 }
756 
757 /*
758  * vionet_set_hostmac
759  *
760  * Sets the hardware address for the host-side tap(4) on a vionet_dev.
761  *
762  * This should only be called from the event-loop thread.
763  *
764  * vm: pointer to the current vmd_vm instance
765  * idx: index into the array of vionet_dev's for the target vionet_dev
766  * addr: ethernet address to set
767  */
768 void
769 vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
770 {
771 	struct vmop_create_params	*vmc = &vm->vm_params;
772 	struct virtio_dev		*dev;
773 	struct vionet_dev		*vionet = NULL;
774 	int ret;
775 
776 	if (idx >= vmc->vmc_nnics)
777 		fatalx("%s: invalid vionet index: %u", __func__, idx);
778 
779 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
780 		if (dev->dev_type == VMD_DEVTYPE_NET
781 		    && dev->vionet.idx == idx) {
782 			vionet = &dev->vionet;
783 			break;
784 		}
785 	}
786 	if (vionet == NULL)
787 		fatalx("%s: dev == NULL, idx = %u", __func__, idx);
788 
789 	/* Set the local vm process copy. */
790 	memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));
791 
792 	/* Send the information to the device process. */
793 	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
794 	    vionet->hostmac, sizeof(vionet->hostmac));
795 	if (ret == -1) {
796 		log_warnx("%s: failed to queue hostmac to vionet dev %u",
797 		    __func__, idx);
798 		return;
799 	}
800 }
801 
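/*
 * Flush any in-process disk state, synchronously tell every device
 * subprocess to shut down and then reap the children.
 */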
802 void
803 virtio_shutdown(struct vmd_vm *vm)
804 {
805 	int ret, status;
806 	pid_t pid = 0;
807 	struct virtio_dev *dev, *tmp;
808 	struct viodev_msg msg;
809 	struct imsgbuf *ibuf;
810 
811 	/* Ensure that our disks are synced. */
812 	if (vioscsi != NULL)
813 		vioscsi->file.close(vioscsi->file.p, 0);
814 
815 	/*
816 	 * Broadcast shutdown to child devices. We need to do this
817 	 * synchronously as we have already stopped the async event thread.
818 	 */
819 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
820 		memset(&msg, 0, sizeof(msg));
821 		msg.type = VIODEV_MSG_SHUTDOWN;
822 		ibuf = &dev->sync_iev.ibuf;
823 		ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
824 		    &msg, sizeof(msg));
825 		if (ret == -1)
826 			fatalx("%s: failed to send shutdown to device",
827 			    __func__);
828 		if (imsg_flush(ibuf) == -1)
829 			fatalx("%s: imsg_flush", __func__);
830 	}
831 
832 	/*
833 	 * Wait for all children to shutdown using a simple approach of
834 	 * iterating over known child devices and waiting for them to die.
835 	 */
836 	SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
837 		log_debug("%s: waiting on device pid %d", __func__,
838 		    dev->dev_pid);
839 		do {
840 			pid = waitpid(dev->dev_pid, &status, WNOHANG);
841 		} while (pid == 0 || (pid == -1 && errno == EINTR));
842 		if (pid == dev->dev_pid)
843 			log_debug("%s: device for pid %d is stopped",
844 			    __func__, pid);
845 		else
846 			log_warnx("%s: unexpected pid %d", __func__, pid);
847 		free(dev);
848 	}
849 }
850 
851 int
852 vmmci_restore(int fd, uint32_t vm_id)
853 {
854 	log_debug("%s: receiving vmmci", __func__);
855 	if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
856 		log_warnx("%s: error reading vmmci from fd", __func__);
857 		return (-1);
858 	}
859 
860 	if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
861 		log_warnx("%s: can't set bar fn for vmm control device",
862 		    __progname);
863 		return (-1);
864 	}
865 	vmmci.vm_id = vm_id;
866 	vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
867 	memset(&vmmci.timeout, 0, sizeof(struct event));
868 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
869 	return (0);
870 }
871 
872 int
873 viornd_restore(int fd, struct vmd_vm *vm)
874 {
875 	void *hva = NULL;
876 
877 	log_debug("%s: receiving viornd", __func__);
878 	if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
879 		log_warnx("%s: error reading viornd from fd", __func__);
880 		return (-1);
881 	}
882 	if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
883 		log_warnx("%s: can't set bar fn for virtio rng device",
884 		    __progname);
885 		return (-1);
886 	}
887 	viornd.vm_id = vm->vm_params.vmc_params.vcp_id;
888 	viornd.irq = pci_get_dev_irq(viornd.pci_id);
889 
890 	hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE));
891 	if (hva == NULL)
892 		fatal("failed to restore viornd virtqueue");
893 	viornd.vq[0].q_hva = hva;
894 
895 	return (0);
896 }
897 
898 int
899 vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
900 {
901 	struct vmop_create_params *vmc = &vm->vm_params;
902 	struct vm_create_params *vcp = &vmc->vmc_params;
903 	struct virtio_dev *dev;
904 	uint8_t i;
905 
906 	if (vmc->vmc_nnics == 0)
907 		return (0);
908 
909 	for (i = 0; i < vmc->vmc_nnics; i++) {
910 		dev = calloc(1, sizeof(struct virtio_dev));
911 		if (dev == NULL) {
912 			log_warn("%s: calloc failure allocating vionet",
913 			    __progname);
914 			return (-1);
915 		}
916 
917 		log_debug("%s: receiving virtio network device", __func__);
918 		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
919 		    != sizeof(struct virtio_dev)) {
920 			log_warnx("%s: error reading vionet from fd",
921 			    __func__);
922 			return (-1);
923 		}
924 
925 		/* Virtio network */
926 		if (dev->dev_type != VMD_DEVTYPE_NET) {
927 			log_warnx("%s: invalid device type", __func__);
928 			return (-1);
929 		}
930 
931 		dev->sync_fd = -1;
932 		dev->async_fd = -1;
933 		dev->vm_id = vcp->vcp_id;
934 		dev->vm_vmid = vm->vm_vmid;
935 		dev->irq = pci_get_dev_irq(dev->pci_id);
936 
937 		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
938 			log_warnx("%s: can't set bar fn for virtio net "
939 			    "device", __progname);
940 			return (-1);
941 		}
942 
943 		dev->vionet.data_fd = child_taps[i];
944 		dev->vionet.idx = i;
945 
946 		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
947 	}
948 
949 	return (0);
950 }
951 
952 int
953 vioblk_restore(int fd, struct vmd_vm *vm,
954     int child_disks[][VM_MAX_BASE_PER_DISK])
955 {
956 	struct vmop_create_params *vmc = &vm->vm_params;
957 	struct virtio_dev *dev;
958 	uint8_t i, j;
959 
960 	if (vmc->vmc_ndisks == 0)
961 		return (0);
962 
963 	for (i = 0; i < vmc->vmc_ndisks; i++) {
964 		dev = calloc(1, sizeof(struct virtio_dev));
965 		if (dev == NULL) {
966 			log_warn("%s: calloc failure allocating vioblks",
967 			    __progname);
968 			return (-1);
969 		}
970 
971 		log_debug("%s: receiving vioblk", __func__);
972 		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
973 		    != sizeof(struct virtio_dev)) {
974 			log_warnx("%s: error reading vioblk from fd", __func__);
975 			return (-1);
976 		}
977 		if (dev->dev_type != VMD_DEVTYPE_DISK) {
978 			log_warnx("%s: invalid device type", __func__);
979 			return (-1);
980 		}
981 
982 		dev->sync_fd = -1;
983 		dev->async_fd = -1;
984 
985 		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
986 			log_warnx("%s: can't set bar fn for virtio block "
987 			    "device", __progname);
988 			return (-1);
989 		}
990 		dev->vm_id = vmc->vmc_params.vcp_id;
991 		dev->irq = pci_get_dev_irq(dev->pci_id);
992 
993 		memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
994 		dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
995 		for (j = 0; j < dev->vioblk.ndisk_fd; j++)
996 			dev->vioblk.disk_fd[j] = child_disks[i][j];
997 
998 		dev->vioblk.idx = i;
999 		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
1000 	}
1001 	return (0);
1002 }
1003 
1004 int
1005 vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
1006 {
1007 	void *hva = NULL;
1008 	unsigned int i;
1009 
1010 	if (!strlen(vm->vm_params.vmc_cdrom))
1011 		return (0);
1012 
1013 	vioscsi = calloc(1, sizeof(struct vioscsi_dev));
1014 	if (vioscsi == NULL) {
1015 		log_warn("%s: calloc failure allocating vioscsi", __progname);
1016 		return (-1);
1017 	}
1018 
1019 	log_debug("%s: receiving vioscsi", __func__);
1020 
1021 	if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
1022 	    sizeof(struct vioscsi_dev)) {
1023 		log_warnx("%s: error reading vioscsi from fd", __func__);
1024 		return (-1);
1025 	}
1026 
1027 	if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
1028 		log_warnx("%s: can't set bar fn for vmm control device",
1029 		    __progname);
1030 		return (-1);
1031 	}
1032 
1033 	vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
1034 	vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
1035 
1036 	/* vioscsi uses 3 virtqueues. */
1037 	for (i = 0; i < 3; i++) {
1038 		hva = hvaddr_mem(vioscsi->vq[i].q_gpa,
1039 		    vring_size(VIOSCSI_QUEUE_SIZE));
1040 		if (hva == NULL)
1041 			fatal("failed to restore vioscsi virtqueue");
1042 		vioscsi->vq[i].q_hva = hva;
1043 	}
1044 
1045 	return (0);
1046 }
1047 
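/*
 * Read the device state back in the same order virtio_dump() wrote it:
 * viornd, vioblk, vioscsi, vionet and finally vmmci.  Devices emulated
 * in subprocesses are relaunched once their state has been read.
 */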
1048 int
1049 virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
1050     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1051 {
1052 	struct virtio_dev *dev;
1053 	int ret;
1054 
1055 	SLIST_INIT(&virtio_devs);
1056 
1057 	if ((ret = viornd_restore(fd, vm)) == -1)
1058 		return (ret);
1059 
1060 	if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
1061 		return (ret);
1062 
1063 	if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
1064 		return (ret);
1065 
1066 	if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
1067 		return (ret);
1068 
1069 	if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
1070 		return (ret);
1071 
1072 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1073 		if (virtio_dev_launch(vm, dev) != 0)
1074 			fatalx("%s: failed to restore virtio dev", __func__);
1075 	}
1076 
1077 	return (0);
1078 }
1079 
1080 int
1081 viornd_dump(int fd)
1082 {
1083 	log_debug("%s: sending viornd", __func__);
1084 
1085 	viornd.vq[0].q_hva = NULL;
1086 
1087 	if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
1088 		log_warnx("%s: error writing viornd to fd", __func__);
1089 		return (-1);
1090 	}
1091 	return (0);
1092 }
1093 
1094 int
1095 vmmci_dump(int fd)
1096 {
1097 	log_debug("%s: sending vmmci", __func__);
1098 
1099 	if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
1100 		log_warnx("%s: error writing vmmci to fd", __func__);
1101 		return (-1);
1102 	}
1103 	return (0);
1104 }
1105 
1106 int
1107 vionet_dump(int fd)
1108 {
1109 	struct virtio_dev	*dev, temp;
1110 	struct viodev_msg	 msg;
1111 	struct imsg		 imsg;
1112 	struct imsgbuf		*ibuf = NULL;
1113 	size_t			 sz;
1114 	int			 ret;
1115 
1116 	log_debug("%s: dumping vionet", __func__);
1117 
1118 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1119 		if (dev->dev_type != VMD_DEVTYPE_NET)
1120 			continue;
1121 
1122 		memset(&msg, 0, sizeof(msg));
1123 		memset(&imsg, 0, sizeof(imsg));
1124 
1125 		ibuf = &dev->sync_iev.ibuf;
1126 		msg.type = VIODEV_MSG_DUMP;
1127 
1128 		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1129 		    sizeof(msg));
1130 		if (ret == -1) {
1131 			log_warnx("%s: failed requesting dump of vionet[%d]",
1132 			    __func__, dev->vionet.idx);
1133 			return (-1);
1134 		}
1135 		if (imsg_flush(ibuf) == -1) {
1136 			log_warnx("%s: imsg_flush", __func__);
1137 			return (-1);
1138 		}
1139 
1140 		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
1141 		if (sz != sizeof(temp)) {
1142 			log_warnx("%s: failed to dump vionet[%d]", __func__,
1143 			    dev->vionet.idx);
1144 			return (-1);
1145 		}
1146 
1147 		/* Clear volatile state. Will reinitialize on restore. */
1148 		temp.vionet.vq[RXQ].q_hva = NULL;
1149 		temp.vionet.vq[TXQ].q_hva = NULL;
1150 		temp.async_fd = -1;
1151 		temp.sync_fd = -1;
1152 		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
1153 		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
1154 
1155 		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
1156 			log_warnx("%s: error writing vionet to fd", __func__);
1157 			return (-1);
1158 		}
1159 	}
1160 
1161 	return (0);
1162 }
1163 
1164 int
1165 vioblk_dump(int fd)
1166 {
1167 	struct virtio_dev	*dev, temp;
1168 	struct viodev_msg	 msg;
1169 	struct imsg		 imsg;
1170 	struct imsgbuf		*ibuf = NULL;
1171 	size_t			 sz;
1172 	int			 ret;
1173 
1174 	log_debug("%s: dumping vioblk", __func__);
1175 
1176 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1177 		if (dev->dev_type != VMD_DEVTYPE_DISK)
1178 			continue;
1179 
1180 		memset(&msg, 0, sizeof(msg));
1181 		memset(&imsg, 0, sizeof(imsg));
1182 
1183 		ibuf = &dev->sync_iev.ibuf;
1184 		msg.type = VIODEV_MSG_DUMP;
1185 
1186 		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1187 		    sizeof(msg));
1188 		if (ret == -1) {
1189 			log_warnx("%s: failed requesting dump of vioblk[%d]",
1190 			    __func__, dev->vioblk.idx);
1191 			return (-1);
1192 		}
1193 		if (imsg_flush(ibuf) == -1) {
1194 			log_warnx("%s: imsg_flush", __func__);
1195 			return (-1);
1196 		}
1197 
1199 		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
1200 		if (sz != sizeof(temp)) {
1201 			log_warnx("%s: failed to dump vioblk[%d]", __func__,
1202 			    dev->vioblk.idx);
1203 			return (-1);
1204 		}
1205 
1206 		/* Clear volatile state. Will reinitialize on restore. */
1207 		temp.vioblk.vq[0].q_hva = NULL;
1208 		temp.async_fd = -1;
1209 		temp.sync_fd = -1;
1210 		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
1211 		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
1212 
1213 		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
1214 			log_warnx("%s: error writing vioblk to fd", __func__);
1215 			return (-1);
1216 		}
1217 	}
1218 
1219 	return (0);
1220 }
1221 
1222 int
1223 vioscsi_dump(int fd)
1224 {
1225 	unsigned int i;
1226 
1227 	if (vioscsi == NULL)
1228 		return (0);
1229 
1230 	log_debug("%s: sending vioscsi", __func__);
1231 
1232 	for (i = 0; i < 3; i++)
1233 		vioscsi->vq[i].q_hva = NULL;
1234 
1235 	if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
1236 	    sizeof(struct vioscsi_dev)) {
1237 		log_warnx("%s: error writing vioscsi to fd", __func__);
1238 		return (-1);
1239 	}
1240 	return (0);
1241 }
1242 
1243 int
1244 virtio_dump(int fd)
1245 {
1246 	int ret;
1247 
1248 	if ((ret = viornd_dump(fd)) == -1)
1249 		return ret;
1250 
1251 	if ((ret = vioblk_dump(fd)) == -1)
1252 		return ret;
1253 
1254 	if ((ret = vioscsi_dump(fd)) == -1)
1255 		return ret;
1256 
1257 	if ((ret = vionet_dump(fd)) == -1)
1258 		return ret;
1259 
1260 	if ((ret = vmmci_dump(fd)) == -1)
1261 		return ret;
1262 
1263 	return (0);
1264 }
1265 
1266 void virtio_broadcast_imsg(struct vmd_vm *vm, uint16_t type, void *data,
1267     uint16_t datalen)
1268 {
1269 	struct virtio_dev *dev;
1270 	int ret;
1271 
1272 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1273 		ret = imsg_compose_event(&dev->async_iev, type, 0, 0, -1, data,
1274 		    datalen);
1275 		if (ret == -1) {
1276 			log_warnx("%s: failed to broadcast imsg type %u",
1277 			    __func__, type);
1278 		}
1279 	}
1280 
1281 }
1282 
1283 void
1284 virtio_stop(struct vmd_vm *vm)
1285 {
1286 	return virtio_broadcast_imsg(vm, IMSG_VMDOP_PAUSE_VM, NULL, 0);
1287 }
1288 
1289 void
1290 virtio_start(struct vmd_vm *vm)
1291 {
1292 	return virtio_broadcast_imsg(vm, IMSG_VMDOP_UNPAUSE_VM, NULL, 0);
1293 }
1294 
1295 /*
1296  * Fork+exec a child virtio device. Returns 0 on success.
1297  */
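/*
 * The parent and child share two socketpairs: a synchronous channel used
 * for register I/O and a separate async channel for interrupts and other
 * events.  After forking, the parent writes the configured virtio_dev
 * and vmd_vm structs over the sync channel and waits for the child to
 * answer with VIODEV_MSG_READY.  The child re-executes vmd roughly as
 * (flags are interpreted in vmd.c:main()):
 *
 *	vmd -X <sync fd> -t <device type> -i <vmm fd> -p <vm name> -n
 *	    [verbosity flag]
 */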
1298 static int
1299 virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
1300 {
1301 	char *nargv[12], num[32], vmm_fd[32], vm_name[VM_NAME_MAX], t[2];
1302 	pid_t dev_pid;
1303 	int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0;
1304 	size_t i, data_fds_sz, sz = 0;
1305 	struct viodev_msg msg;
1306 	struct imsg imsg;
1307 	struct imsgev *iev = &dev->sync_iev;
1308 
1309 	switch (dev->dev_type) {
1310 	case VMD_DEVTYPE_NET:
1311 		data_fds[0] = dev->vionet.data_fd;
1312 		data_fds_sz = 1;
1313 		log_debug("%s: launching vionet%d",
1314 		    vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
1315 		break;
1316 	case VMD_DEVTYPE_DISK:
1317 		memcpy(&data_fds, dev->vioblk.disk_fd, sizeof(data_fds));
1318 		data_fds_sz = dev->vioblk.ndisk_fd;
1319 		log_debug("%s: launching vioblk%d",
1320 		    vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
1321 		break;
1322 		/* NOTREACHED */
1323 	default:
1324 		log_warnx("%s: invalid device type", __func__);
1325 		return (EINVAL);
1326 	}
1327 
1328 	/* We need two channels: one synchronous (IO reads) and one async. */
1329 	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, sync_fds) == -1) {
1330 		log_warn("failed to create socketpair");
1331 		return (errno);
1332 	}
1333 	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, async_fds) == -1) {
1334 		log_warn("failed to create async socketpair");
1335 		return (errno);
1336 	}
1337 
1338 	/* Keep communication channels open after exec. */
1339 	if (fcntl(sync_fds[1], F_SETFD, 0)) {
1340 		ret = errno;
1341 		log_warn("%s: fcntl", __func__);
1342 		goto err;
1343 	}
1344 	if (fcntl(async_fds[1], F_SETFD, 0)) {
1345 		ret = errno;
1346 		log_warn("%s: fcntl", __func__);
1347 		goto err;
1348 	}
1349 
1350 	/* Fork... */
1351 	dev_pid = fork();
1352 	if (dev_pid == -1) {
1353 		ret = errno;
1354 		log_warn("%s: fork failed", __func__);
1355 		goto err;
1356 	}
1357 
1358 	if (dev_pid > 0) {
1359 		/* Parent */
1360 		close_fd(sync_fds[1]);
1361 		close_fd(async_fds[1]);
1362 
1363 		/* Save the child's pid to help with cleanup. */
1364 		dev->dev_pid = dev_pid;
1365 
1366 		/* Set the channel fds to the child's before sending. */
1367 		dev->sync_fd = sync_fds[1];
1368 		dev->async_fd = async_fds[1];
1369 
1370 		/* Close data fds. Only the child device needs them now. */
1371 		for (i = 0; i < data_fds_sz; i++)
1372 			close_fd(data_fds[i]);
1373 
1374 		/* Set our synchronous channel to non-blocking. */
1375 		if (fcntl(sync_fds[0], F_SETFL, O_NONBLOCK) == -1) {
1376 			ret = errno;
1377 			log_warn("%s: fcntl", __func__);
1378 			goto err;
1379 		}
1380 
1381 		/* 1. Send over our configured device. */
1382 		log_debug("%s: sending '%c' type device struct", __func__,
1383 			dev->dev_type);
1384 		sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
1385 		if (sz != sizeof(*dev)) {
1386 			log_warnx("%s: failed to send device", __func__);
1387 			ret = EIO;
1388 			goto err;
1389 		}
1390 
1391 		/* 2. Send over details on the VM (including memory fds). */
1392 		log_debug("%s: sending vm message for '%s'", __func__,
1393 			vm->vm_params.vmc_params.vcp_name);
1394 		sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
1395 		if (sz != sizeof(*vm)) {
1396 			log_warnx("%s: failed to send vm details", __func__);
1397 			ret = EIO;
1398 			goto err;
1399 		}
1400 
1401 		/*
1402 		 * Initialize our imsg channel to the child device. The initial
1403 		 * communication will be synchronous. We expect the child to
1404 		 * report itself "ready" to confirm the launch was a success.
1405 		 */
1406 		imsg_init(&iev->ibuf, sync_fds[0]);
1407 		do
1408 			ret = imsg_read(&iev->ibuf);
1409 		while (ret == -1 && errno == EAGAIN);
1410 		if (ret == 0 || ret == -1) {
1411 			log_warnx("%s: failed to receive ready message from "
1412 			    "'%c' type device", __func__, dev->dev_type);
1413 			ret = EIO;
1414 			goto err;
1415 		}
1416 		ret = 0;
1417 
1418 		log_debug("%s: receiving reply", __func__);
1419 		if (imsg_get(&iev->ibuf, &imsg) < 1) {
1420 			log_warnx("%s: imsg_get", __func__);
1421 			ret = EIO;
1422 			goto err;
1423 		}
1424 		IMSG_SIZE_CHECK(&imsg, &msg);
1425 		memcpy(&msg, imsg.data, sizeof(msg));
1426 		imsg_free(&imsg);
1427 
1428 		if (msg.type != VIODEV_MSG_READY) {
1429 			log_warnx("%s: expected ready message, got type %d",
1430 			    __func__, msg.type);
1431 			ret = EINVAL;
1432 			goto err;
1433 		}
1434 		log_debug("%s: device reports ready via sync channel",
1435 		    __func__);
1436 
1437 		/*
1438 		 * Wire in the async event handling, but after reverting back
1439 		 * to the parent's fd's.
1440 		 */
1441 		dev->sync_fd = sync_fds[0];
1442 		dev->async_fd = async_fds[0];
1443 		vm_device_pipe(dev, virtio_dispatch_dev);
1444 	} else {
1445 		/* Child */
1446 		close_fd(async_fds[0]);
1447 		close_fd(sync_fds[0]);
1448 
1449 		/* Keep data file descriptors open after exec. */
1450 		for (i = 0; i < data_fds_sz; i++) {
1451 			log_debug("%s: marking fd %d !close-on-exec", __func__,
1452 			    data_fds[i]);
1453 			if (fcntl(data_fds[i], F_SETFD, 0)) {
1454 				ret = errno;
1455 				log_warn("%s: fcntl", __func__);
1456 				goto err;
1457 			}
1458 		}
1459 
1460 		memset(&nargv, 0, sizeof(nargv));
1461 		memset(num, 0, sizeof(num));
1462 		snprintf(num, sizeof(num), "%d", sync_fds[1]);
1463 		memset(vmm_fd, 0, sizeof(vmm_fd));
1464 		snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
1465 		memset(vm_name, 0, sizeof(vm_name));
1466 		snprintf(vm_name, sizeof(vm_name), "%s",
1467 		    vm->vm_params.vmc_params.vcp_name);
1468 
1469 		t[0] = dev->dev_type;
1470 		t[1] = '\0';
1471 
1472 		nargv[0] = env->argv0;
1473 		nargv[1] = "-X";
1474 		nargv[2] = num;
1475 		nargv[3] = "-t";
1476 		nargv[4] = t;
1477 		nargv[5] = "-i";
1478 		nargv[6] = vmm_fd;
1479 		nargv[7] = "-p";
1480 		nargv[8] = vm_name;
1481 		nargv[9] = "-n";
1482 		nargv[10] = NULL;
1483 
1484 		if (env->vmd_verbose == 1) {
1485 			nargv[10] = VMD_VERBOSE_1;
1486 			nargv[11] = NULL;
1487 		} else if (env->vmd_verbose > 1) {
1488 			nargv[10] = VMD_VERBOSE_2;
1489 			nargv[11] = NULL;
1490 		}
1491 
1492 		/* Control resumes in vmd.c:main(). */
1493 		execvp(nargv[0], nargv);
1494 
1495 		ret = errno;
1496 		log_warn("%s: failed to exec device", __func__);
1497 		_exit(ret);
1498 		/* NOTREACHED */
1499 	}
1500 
1501 	return (ret);
1502 
1503 err:
1504 	close_fd(sync_fds[0]);
1505 	close_fd(sync_fds[1]);
1506 	close_fd(async_fds[0]);
1507 	close_fd(async_fds[1]);
1508 	return (ret);
1509 }
1510 
1511 /*
1512  * Initialize an async imsg channel for a virtio device.
1513  */
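/*
 * The device's async_fd is switched to non-blocking mode and registered
 * with the event loop; incoming imsgs are delivered to the supplied
 * callback (e.g. virtio_dispatch_dev on the vm process side).
 */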
1514 int
1515 vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *))
1516 {
1517 	struct imsgev *iev = &dev->async_iev;
1518 	int fd = dev->async_fd;
1519 
1520 	log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__,
1521 	    dev->dev_type, fd);
1522 
1523 	if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
1524 		log_warn("failed to set nonblocking mode on vm device pipe");
1525 		return (-1);
1526 	}
1527 
1528 	imsg_init(&iev->ibuf, fd);
1529 	iev->handler = cb;
1530 	iev->data = dev;
1531 	iev->events = EV_READ;
1532 	imsg_event_add(iev);
1533 
1534 	return (0);
1535 }
1536 
1537 void
1538 virtio_dispatch_dev(int fd, short event, void *arg)
1539 {
1540 	struct virtio_dev	*dev = (struct virtio_dev*)arg;
1541 	struct imsgev		*iev = &dev->async_iev;
1542 	struct imsgbuf		*ibuf = &iev->ibuf;
1543 	struct imsg		 imsg;
1544 	struct viodev_msg	 msg;
1545 	ssize_t			 n = 0;
1546 
1547 	if (event & EV_READ) {
1548 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
1549 			fatal("%s: imsg_read", __func__);
1550 		if (n == 0) {
1551 			/* this pipe is dead, so remove the event handler */
1552 			log_debug("%s: pipe dead (EV_READ)", __func__);
1553 			event_del(&iev->ev);
1554 			event_loopexit(NULL);
1555 			return;
1556 		}
1557 	}
1558 
1559 	if (event & EV_WRITE) {
1560 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
1561 			fatal("%s: msgbuf_write", __func__);
1562 		if (n == 0) {
1563 			/* this pipe is dead, so remove the event handler */
1564 			log_debug("%s: pipe dead (EV_WRITE)", __func__);
1565 			event_del(&iev->ev);
1566 			event_loopexit(NULL);
1567 			return;
1568 		}
1569 	}
1570 
1571 	for (;;) {
1572 		if ((n = imsg_get(ibuf, &imsg)) == -1)
1573 			fatal("%s: imsg_get", __func__);
1574 		if (n == 0)
1575 			break;
1576 
1577 		switch (imsg.hdr.type) {
1578 		case IMSG_DEVOP_MSG:
1579 			IMSG_SIZE_CHECK(&imsg, &msg);
1580 			memcpy(&msg, imsg.data, sizeof(msg));
1581 			handle_dev_msg(&msg, dev);
1582 			break;
1583 		default:
1584 			log_warnx("%s: got non devop imsg %d", __func__,
1585 			    imsg.hdr.type);
1586 			break;
1587 		}
1588 		imsg_free(&imsg);
1589 	}
1590 	imsg_event_add(iev);
1591 }
1593 
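/*
 * Handle a message received on the async channel from a device
 * subprocess.  Only interrupt (kick) requests cause an action here;
 * everything else is logged.
 */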
1594 static int
1595 handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
1596 {
1597 	uint32_t vm_id = gdev->vm_id;
1598 	int irq = gdev->irq;
1599 
1600 	switch (msg->type) {
1601 	case VIODEV_MSG_KICK:
1602 		if (msg->state == INTR_STATE_ASSERT)
1603 			vcpu_assert_pic_irq(vm_id, msg->vcpu, irq);
1604 		else if (msg->state == INTR_STATE_DEASSERT)
1605 			vcpu_deassert_pic_irq(vm_id, msg->vcpu, irq);
1606 		break;
1607 	case VIODEV_MSG_READY:
1608 		log_debug("%s: device reports ready", __func__);
1609 		break;
1610 	case VIODEV_MSG_ERROR:
1611 		log_warnx("%s: device reported error", __func__);
1612 		break;
1613 	case VIODEV_MSG_INVALID:
1614 	case VIODEV_MSG_IO_READ:
1615 	case VIODEV_MSG_IO_WRITE:
1616 		/* FALLTHROUGH */
1617 	default:
1618 		log_warnx("%s: unsupported device message type %d", __func__,
1619 		    msg->type);
1620 		return (1);
1621 	}
1622 
1623 	return (0);
1624 }
1625 
1626 /*
1627  * Called by the VM process while processing IO from the VCPU thread.
1628  *
1629  * N.b. Since the VCPU thread calls this function, we cannot mutate the event
1630  * system. All ipc messages must be sent manually and cannot be queued for
1631  * the event loop to push them. (We need to perform a synchronous read, so
1632  * this isn't really a big deal.)
1633  */
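/*
 * Writes are fire-and-forget: the viodev_msg is flushed to the device
 * and no reply is expected.  Reads block on the sync channel until the
 * device answers with a VIODEV_MSG_IO_READ reply carrying the register
 * value and, possibly, an interrupt state change to apply.
 */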
1634 int
1635 virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
1636     void *cookie, uint8_t sz)
1637 {
1638 	struct virtio_dev *dev = (struct virtio_dev *)cookie;
1639 	struct imsgbuf *ibuf = &dev->sync_iev.ibuf;
1640 	struct imsg imsg;
1641 	struct viodev_msg msg;
1642 	ssize_t n;
1643 	int ret = 0;
1644 
1645 	memset(&msg, 0, sizeof(msg));
1646 	msg.reg = reg;
1647 	msg.io_sz = sz;
1648 
1649 	if (dir == 0) {
1650 		msg.type = VIODEV_MSG_IO_WRITE;
1651 		msg.data = *data;
1652 		msg.data_valid = 1;
1653 	} else
1654 		msg.type = VIODEV_MSG_IO_READ;
1655 
1656 	if (msg.type == VIODEV_MSG_IO_WRITE) {
1657 		/*
1658 		 * Write request. No reply expected.
1659 		 */
1660 		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1661 		    sizeof(msg));
1662 		if (ret == -1) {
1663 			log_warn("%s: failed to send async io event to virtio"
1664 			    " device", __func__);
1665 			return (ret);
1666 		}
1667 		if (imsg_flush(ibuf) == -1) {
1668 			log_warnx("%s: imsg_flush (write)", __func__);
1669 			return (-1);
1670 		}
1671 	} else {
1672 		/*
1673 		 * Read request. Requires waiting for a reply.
1674 		 */
1675 		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1676 		    sizeof(msg));
1677 		if (ret == -1) {
1678 			log_warnx("%s: failed to send sync io event to virtio"
1679 			    " device", __func__);
1680 			return (ret);
1681 		}
1682 		if (imsg_flush(ibuf) == -1) {
1683 			log_warnx("%s: imsg_flush (read)", __func__);
1684 			return (-1);
1685 		}
1686 
1687 		/* Read our reply. */
1688 		do
1689 			n = imsg_read(ibuf);
1690 		while (n == -1 && errno == EAGAIN);
1691 		if (n == 0 || n == -1) {
1692 			log_warn("%s: imsg_read (n=%ld)", __func__, n);
1693 			return (-1);
1694 		}
1695 		if ((n = imsg_get(ibuf, &imsg)) == -1) {
1696 			log_warn("%s: imsg_get (n=%ld)", __func__, n);
1697 			return (-1);
1698 		}
1699 		if (n == 0) {
1700 			log_warnx("%s: invalid imsg", __func__);
1701 			return (-1);
1702 		}
1703 
1704 		IMSG_SIZE_CHECK(&imsg, &msg);
1705 		memcpy(&msg, imsg.data, sizeof(msg));
1706 		imsg_free(&imsg);
1707 
1708 		if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) {
1709 #if DEBUG
1710 			log_debug("%s: got sync read response (reg=%s)",
1711 			    __func__, virtio_reg_name(msg.reg));
1712 #endif /* DEBUG */
1713 			*data = msg.data;
1714 			/*
1715 			 * It's possible we're asked to {de,}assert after the
1716 			 * device performs a register read.
1717 			 */
1718 			if (msg.state == INTR_STATE_ASSERT)
1719 				vcpu_assert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
1720 			else if (msg.state == INTR_STATE_DEASSERT)
1721 				vcpu_deassert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
1722 		} else {
1723 			log_warnx("%s: expected IO_READ, got %d", __func__,
1724 			    msg.type);
1725 			return (-1);
1726 		}
1727 	}
1728 
1729 	return (0);
1730 }
1731 
1732 void
1733 virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu)
1734 {
1735 	struct viodev_msg msg;
1736 	int ret;
1737 
1738 	memset(&msg, 0, sizeof(msg));
1739 	msg.irq = dev->irq;
1740 	msg.vcpu = vcpu;
1741 	msg.type = VIODEV_MSG_KICK;
1742 	msg.state = INTR_STATE_ASSERT;
1743 
1744 	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
1745 	    &msg, sizeof(msg));
1746 	if (ret == -1)
1747 		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
1748 }
1749 
1750 void
1751 virtio_deassert_pic_irq(struct virtio_dev *dev, int vcpu)
1752 {
1753 	struct viodev_msg msg;
1754 	int ret;
1755 
1756 	memset(&msg, 0, sizeof(msg));
1757 	msg.irq = dev->irq;
1758 	msg.vcpu = vcpu;
1759 	msg.type = VIODEV_MSG_KICK;
1760 	msg.state = INTR_STATE_DEASSERT;
1761 
1762 	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
1763 	    &msg, sizeof(msg));
1764 	if (ret == -1)
1765 		log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
1766 }
1767