1 /*	$OpenBSD: virtio.c,v 1.82 2019/12/11 06:45:16 pd Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE */
20 #include <sys/socket.h>
21 
22 #include <machine/vmmvar.h>
23 #include <dev/pci/pcireg.h>
24 #include <dev/pci/pcidevs.h>
25 #include <dev/pv/virtioreg.h>
26 #include <dev/pci/virtio_pcireg.h>
27 #include <dev/pv/vioblkreg.h>
28 #include <dev/pv/vioscsireg.h>
29 
30 #include <net/if.h>
31 #include <netinet/in.h>
32 #include <netinet/if_ether.h>
33 
34 #include <errno.h>
35 #include <event.h>
36 #include <poll.h>
37 #include <stddef.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 
42 #include "pci.h"
43 #include "vmd.h"
44 #include "vmm.h"
45 #include "virtio.h"
46 #include "vioscsi.h"
47 #include "loadfile.h"
48 #include "atomicio.h"
49 
50 extern char *__progname;
51 struct viornd_dev viornd;
52 struct vioblk_dev *vioblk;
53 struct vionet_dev *vionet;
54 struct vioscsi_dev *vioscsi;
55 struct vmmci_dev vmmci;
56 
57 int nr_vionet;
58 int nr_vioblk;
59 
60 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
61 
62 #define VIRTIO_NET_F_MAC	(1<<5)
63 
64 #define VMMCI_F_TIMESYNC	(1<<0)
65 #define VMMCI_F_ACK		(1<<1)
66 #define VMMCI_F_SYNCRTC		(1<<2)
67 
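/* vionet virtqueue indexes: receive and transmit queues, respectively. */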
68 #define RXQ	0
69 #define TXQ	1
70 
71 const char *
72 vioblk_cmd_name(uint32_t type)
73 {
74 	switch (type) {
75 	case VIRTIO_BLK_T_IN: return "read";
76 	case VIRTIO_BLK_T_OUT: return "write";
77 	case VIRTIO_BLK_T_SCSI_CMD: return "scsi read";
78 	case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write";
79 	case VIRTIO_BLK_T_FLUSH: return "flush";
80 	case VIRTIO_BLK_T_FLUSH_OUT: return "flush out";
81 	case VIRTIO_BLK_T_GET_ID: return "get id";
82 	default: return "unknown";
83 	}
84 }
85 
86 static void
87 dump_descriptor_chain(struct vring_desc *desc, int16_t dxx)
88 {
89 	log_debug("descriptor chain @ %d", dxx);
90 	do {
91 		log_debug("desc @%d addr/len/flags/next = 0x%llx / 0x%x "
92 		    "/ 0x%x / 0x%x",
93 		    dxx,
94 		    desc[dxx].addr,
95 		    desc[dxx].len,
96 		    desc[dxx].flags,
97 		    desc[dxx].next);
98 		dxx = desc[dxx].next;
99 	} while (desc[dxx].flags & VRING_DESC_F_NEXT);
100 
101 	log_debug("desc @%d addr/len/flags/next = 0x%llx / 0x%x / 0x%x "
102 	    "/ 0x%x",
103 	    dxx,
104 	    desc[dxx].addr,
105 	    desc[dxx].len,
106 	    desc[dxx].flags,
107 	    desc[dxx].next);
108 }
109 
110 static const char *
111 virtio_reg_name(uint8_t reg)
112 {
113 	switch (reg) {
114 	case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
115 	case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
116 	case VIRTIO_CONFIG_QUEUE_ADDRESS: return "queue address";
117 	case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
118 	case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
119 	case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
120 	case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
121 	case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
122 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: return "device config 0";
123 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: return "device config 1";
124 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
125 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
126 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
127 	default: return "unknown";
128 	}
129 }
130 
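/*
 * vring_size
 *
 * Returns the number of bytes needed for a legacy (split) virtqueue of
 * vq_size entries: the descriptor table plus the avail ring, padded to
 * the ring alignment, followed by the used ring, also padded.
 *
 * A rough worked example, assuming VIRTQUEUE_ALIGN pads to 4096 bytes
 * and 16-byte descriptors: with vq_size = 256, the descriptor table is
 * 256 * 16 = 4096 bytes and the avail ring is 2 * (2 + 256) = 516
 * bytes, so the first part rounds up to 8192; the used ring is
 * 4 + 8 * 256 = 2052 bytes, rounding up to 4096, for 12288 in total.
 */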
131 uint32_t
132 vring_size(uint32_t vq_size)
133 {
134 	uint32_t allocsize1, allocsize2;
135 
136 	/* allocsize1: descriptor table + avail ring + pad */
137 	allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
138 	    + sizeof(uint16_t) * (2 + vq_size));
139 	/* allocsize2: used ring + pad */
140 	allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
141 	    + sizeof(struct vring_used_elem) * vq_size);
142 
143 	return allocsize1 + allocsize2;
144 }
145 
146 /* Update queue select */
147 void
148 viornd_update_qs(void)
149 {
150 	/* Invalid queue? */
151 	if (viornd.cfg.queue_select > 0) {
152 		viornd.cfg.queue_size = 0;
153 		return;
154 	}
155 
156 	/* Update queue address/size based on queue select */
157 	viornd.cfg.queue_address = viornd.vq[viornd.cfg.queue_select].qa;
158 	viornd.cfg.queue_size = viornd.vq[viornd.cfg.queue_select].qs;
159 }
160 
161 /* Update queue address */
162 void
163 viornd_update_qa(void)
164 {
165 	/* Invalid queue? */
166 	if (viornd.cfg.queue_select > 0)
167 		return;
168 
169 	viornd.vq[viornd.cfg.queue_select].qa = viornd.cfg.queue_address;
170 }
171 
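/*
 * viornd_notifyq
 *
 * Processes a queue notification for the entropy device: copies the
 * guest's ring into a local buffer, fills the buffer referenced by the
 * available ring with random data, and updates the used ring.
 * Returns 1 if the guest should be interrupted, 0 otherwise.
 */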
172 int
173 viornd_notifyq(void)
174 {
175 	uint64_t q_gpa;
176 	uint32_t vr_sz;
177 	size_t sz;
178 	int ret;
179 	uint16_t aidx, uidx;
180 	char *buf, *rnd_data;
181 	struct vring_desc *desc;
182 	struct vring_avail *avail;
183 	struct vring_used *used;
184 
185 	ret = 0;
186 
187 	/* Invalid queue? */
188 	if (viornd.cfg.queue_notify > 0)
189 		return (0);
190 
191 	vr_sz = vring_size(VIORND_QUEUE_SIZE);
192 	q_gpa = viornd.vq[viornd.cfg.queue_notify].qa;
193 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
194 
195 	buf = calloc(1, vr_sz);
196 	if (buf == NULL) {
197 		log_warn("calloc error getting viornd ring");
198 		return (0);
199 	}
200 
201 	if (read_mem(q_gpa, buf, vr_sz)) {
202 		free(buf);
203 		return (0);
204 	}
205 
206 	desc = (struct vring_desc *)(buf);
207 	avail = (struct vring_avail *)(buf +
208 	    viornd.vq[viornd.cfg.queue_notify].vq_availoffset);
209 	used = (struct vring_used *)(buf +
210 	    viornd.vq[viornd.cfg.queue_notify].vq_usedoffset);
211 
212 	aidx = avail->idx & VIORND_QUEUE_MASK;
213 	uidx = used->idx & VIORND_QUEUE_MASK;
214 
215 	sz = desc[avail->ring[aidx]].len;
216 	if (sz > MAXPHYS)
217 		fatal("viornd descriptor size too large (%zu)", sz);
218 
219 	rnd_data = malloc(sz);
220 
221 	if (rnd_data != NULL) {
222 		arc4random_buf(rnd_data, desc[avail->ring[aidx]].len);
223 		if (write_mem(desc[avail->ring[aidx]].addr,
224 		    rnd_data, desc[avail->ring[aidx]].len)) {
225 			log_warnx("viornd: can't write random data @ "
226 			    "0x%llx",
227 			    desc[avail->ring[aidx]].addr);
228 		} else {
229 			/* ret == 1 -> interrupt needed */
230 			/* XXX check VIRTIO_F_NO_INTR */
231 			ret = 1;
232 			viornd.cfg.isr_status = 1;
233 			used->ring[uidx].id = avail->ring[aidx] &
234 			    VIORND_QUEUE_MASK;
235 			used->ring[uidx].len = desc[avail->ring[aidx]].len;
236 			used->idx++;
237 
238 			if (write_mem(q_gpa, buf, vr_sz)) {
239 				log_warnx("viornd: error writing vio ring");
240 			}
241 		}
242 		free(rnd_data);
243 	} else
244 		fatal("memory allocation error for viornd data");
245 
246 	free(buf);
247 
248 	return (ret);
249 }
250 
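/*
 * virtio_rnd_io
 *
 * PCI I/O space handler for the virtio entropy device.  As with the
 * other handlers below, dir == 0 indicates a register write by the
 * guest and dir != 0 a register read; *intr is preset to 0xFF ("no
 * interrupt change") and set to 1 when an interrupt should be asserted.
 */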
251 int
252 virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
253     void *unused, uint8_t sz)
254 {
255 	*intr = 0xFF;
256 
257 	if (dir == 0) {
258 		switch (reg) {
259 		case VIRTIO_CONFIG_DEVICE_FEATURES:
260 		case VIRTIO_CONFIG_QUEUE_SIZE:
261 		case VIRTIO_CONFIG_ISR_STATUS:
262 			log_warnx("%s: illegal write %x to %s",
263 			    __progname, *data, virtio_reg_name(reg));
264 			break;
265 		case VIRTIO_CONFIG_GUEST_FEATURES:
266 			viornd.cfg.guest_feature = *data;
267 			break;
268 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
269 			viornd.cfg.queue_address = *data;
270 			viornd_update_qa();
271 			break;
272 		case VIRTIO_CONFIG_QUEUE_SELECT:
273 			viornd.cfg.queue_select = *data;
274 			viornd_update_qs();
275 			break;
276 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
277 			viornd.cfg.queue_notify = *data;
278 			if (viornd_notifyq())
279 				*intr = 1;
280 			break;
281 		case VIRTIO_CONFIG_DEVICE_STATUS:
282 			viornd.cfg.device_status = *data;
283 			break;
284 		}
285 	} else {
286 		switch (reg) {
287 		case VIRTIO_CONFIG_DEVICE_FEATURES:
288 			*data = viornd.cfg.device_feature;
289 			break;
290 		case VIRTIO_CONFIG_GUEST_FEATURES:
291 			*data = viornd.cfg.guest_feature;
292 			break;
293 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
294 			*data = viornd.cfg.queue_address;
295 			break;
296 		case VIRTIO_CONFIG_QUEUE_SIZE:
297 			*data = viornd.cfg.queue_size;
298 			break;
299 		case VIRTIO_CONFIG_QUEUE_SELECT:
300 			*data = viornd.cfg.queue_select;
301 			break;
302 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
303 			*data = viornd.cfg.queue_notify;
304 			break;
305 		case VIRTIO_CONFIG_DEVICE_STATUS:
306 			*data = viornd.cfg.device_status;
307 			break;
308 		case VIRTIO_CONFIG_ISR_STATUS:
309 			*data = viornd.cfg.isr_status;
310 			viornd.cfg.isr_status = 0;
311 			vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq);
312 			break;
313 		}
314 	}
315 	return (0);
316 }
317 
318 void
319 vioblk_update_qa(struct vioblk_dev *dev)
320 {
321 	/* Invalid queue? */
322 	if (dev->cfg.queue_select > 0)
323 		return;
324 
325 	dev->vq[dev->cfg.queue_select].qa = dev->cfg.queue_address;
326 }
327 
328 void
329 vioblk_update_qs(struct vioblk_dev *dev)
330 {
331 	/* Invalid queue? */
332 	if (dev->cfg.queue_select > 0) {
333 		dev->cfg.queue_size = 0;
334 		return;
335 	}
336 
337 	/* Update queue address/size based on queue select */
338 	dev->cfg.queue_address = dev->vq[dev->cfg.queue_select].qa;
339 	dev->cfg.queue_size = dev->vq[dev->cfg.queue_select].qs;
340 }
341 
342 static void
343 vioblk_free_info(struct ioinfo *info)
344 {
345 	if (!info)
346 		return;
347 	free(info->buf);
348 	free(info);
349 }
350 
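/*
 * The vioblk I/O helpers below are split into a "start" step, which
 * allocates a struct ioinfo describing the transfer (and, for writes,
 * copies the data out of guest memory), and a "finish" step, which
 * performs the actual pread/pwrite against the disk's virtio_backing.
 */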
351 static struct ioinfo *
352 vioblk_start_read(struct vioblk_dev *dev, off_t sector, ssize_t sz)
353 {
354 	struct ioinfo *info;
355 
356 	info = calloc(1, sizeof(*info));
357 	if (!info)
358 		goto nomem;
359 	info->buf = malloc(sz);
360 	if (info->buf == NULL)
361 		goto nomem;
362 	info->len = sz;
363 	info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
364 	info->file = &dev->file;
365 
366 	return info;
367 
368 nomem:
369 	free(info);
370 	log_warn("malloc error vioblk read");
371 	return (NULL);
372 }
373 
374 
375 static const uint8_t *
376 vioblk_finish_read(struct ioinfo *info)
377 {
378 	struct virtio_backing *file;
379 
380 	file = info->file;
381 	if (file->pread(file->p, info->buf, info->len, info->offset) != info->len) {
382 		info->error = errno;
383 		log_warn("vioblk read error");
384 		return NULL;
385 	}
386 
387 	return info->buf;
388 }
389 
390 static struct ioinfo *
391 vioblk_start_write(struct vioblk_dev *dev, off_t sector,
392     paddr_t addr, size_t len)
393 {
394 	struct ioinfo *info;
395 
396 	info = calloc(1, sizeof(*info));
397 	if (!info)
398 		goto nomem;
399 	info->buf = malloc(len);
400 	if (info->buf == NULL)
401 		goto nomem;
402 	info->len = len;
403 	info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
404 	info->file = &dev->file;
405 
406 	if (read_mem(addr, info->buf, len)) {
407 		vioblk_free_info(info);
408 		return NULL;
409 	}
410 
411 	return info;
412 
413 nomem:
414 	free(info);
415 	log_warn("malloc error vioblk write");
416 	return (NULL);
417 }
418 
419 static int
420 vioblk_finish_write(struct ioinfo *info)
421 {
422 	struct virtio_backing *file;
423 
424 	file = info->file;
425 	if (file->pwrite(file->p, info->buf, info->len, info->offset) != info->len) {
426 		log_warn("vioblk write error");
427 		return EIO;
428 	}
429 	return 0;
430 }
431 
432 /*
433  * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can.
434  * XXX We can't trust ring data from the VM, so be extra cautious.
435  */
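/*
 * vioblk_notifyq
 *
 * Walks the available ring of the block device's queue and services
 * each request chain: a request header descriptor, followed by one or
 * more data descriptors for reads and writes, followed by a status
 * byte descriptor.  Returns 1 if the guest should be interrupted.
 */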
436 int
437 vioblk_notifyq(struct vioblk_dev *dev)
438 {
439 	uint64_t q_gpa;
440 	uint32_t vr_sz;
441 	uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx;
442 	uint8_t ds;
443 	int ret;
444 	off_t secbias;
445 	char *vr;
446 	struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc;
447 	struct vring_avail *avail;
448 	struct vring_used *used;
449 	struct virtio_blk_req_hdr cmd;
450 
451 	ret = 0;
452 
453 	/* Invalid queue? */
454 	if (dev->cfg.queue_notify > 0)
455 		return (0);
456 
457 	vr_sz = vring_size(VIOBLK_QUEUE_SIZE);
458 	q_gpa = dev->vq[dev->cfg.queue_notify].qa;
459 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
460 
461 	vr = calloc(1, vr_sz);
462 	if (vr == NULL) {
463 		log_warn("calloc error getting vioblk ring");
464 		return (0);
465 	}
466 
467 	if (read_mem(q_gpa, vr, vr_sz)) {
468 		log_warnx("error reading gpa 0x%llx", q_gpa);
469 		goto out;
470 	}
471 
472 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
473 	desc = (struct vring_desc *)(vr);
474 	avail = (struct vring_avail *)(vr +
475 	    dev->vq[dev->cfg.queue_notify].vq_availoffset);
476 	used = (struct vring_used *)(vr +
477 	    dev->vq[dev->cfg.queue_notify].vq_usedoffset);
478 
479 	idx = dev->vq[dev->cfg.queue_notify].last_avail & VIOBLK_QUEUE_MASK;
480 
481 	if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) {
482 		log_warnx("vioblk queue notify - nothing to do?");
483 		goto out;
484 	}
485 
486 	while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) {
487 
488 		cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK;
489 		cmd_desc = &desc[cmd_desc_idx];
490 
491 		if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) {
492 			log_warnx("unchained vioblk cmd descriptor received "
493 			    "(idx %d)", cmd_desc_idx);
494 			goto out;
495 		}
496 
497 		/* Read command from descriptor ring */
498 		if (read_mem(cmd_desc->addr, &cmd, cmd_desc->len)) {
499 			log_warnx("vioblk: command read_mem error @ 0x%llx",
500 			    cmd_desc->addr);
501 			goto out;
502 		}
503 
504 		switch (cmd.type) {
505 		case VIRTIO_BLK_T_IN:
506 			/* first descriptor */
507 			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
508 			secdata_desc = &desc[secdata_desc_idx];
509 
510 			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
511 				log_warnx("unchained vioblk data descriptor "
512 				    "received (idx %d)", cmd_desc_idx);
513 				goto out;
514 			}
515 
516 			secbias = 0;
517 			do {
518 				struct ioinfo *info;
519 				const uint8_t *secdata;
520 
521 				info = vioblk_start_read(dev,
522 				    cmd.sector + secbias,
523 				    (ssize_t)secdata_desc->len);
				if (info == NULL) {
					log_warnx("vioblk: can't start read");
					goto out;
				}
524 
525 				/* read the data, use current data descriptor */
526 				secdata = vioblk_finish_read(info);
527 				if (secdata == NULL) {
528 					vioblk_free_info(info);
529 					log_warnx("vioblk: block read error, "
530 					    "sector %lld", cmd.sector);
531 					goto out;
532 				}
533 
534 				if (write_mem(secdata_desc->addr, secdata,
535 				    secdata_desc->len)) {
536 					log_warnx("can't write sector "
537 					    "data to gpa @ 0x%llx",
538 					    secdata_desc->addr);
539 					dump_descriptor_chain(desc,
540 					    cmd_desc_idx);
541 					vioblk_free_info(info);
542 					goto out;
543 				}
544 
545 				vioblk_free_info(info);
546 
547 				secbias += (secdata_desc->len /
548 				    VIRTIO_BLK_SECTOR_SIZE);
549 				secdata_desc_idx = secdata_desc->next &
550 				    VIOBLK_QUEUE_MASK;
551 				secdata_desc = &desc[secdata_desc_idx];
552 			} while (secdata_desc->flags & VRING_DESC_F_NEXT);
553 
554 			ds_desc_idx = secdata_desc_idx;
555 			ds_desc = secdata_desc;
556 
557 			ds = VIRTIO_BLK_S_OK;
558 			if (write_mem(ds_desc->addr, &ds, ds_desc->len)) {
559 				log_warnx("can't write device status data @ "
560 				    "0x%llx", ds_desc->addr);
561 				dump_descriptor_chain(desc, cmd_desc_idx);
562 				goto out;
563 			}
564 
565 			ret = 1;
566 			dev->cfg.isr_status = 1;
567 			used->ring[used->idx & VIOBLK_QUEUE_MASK].id =
568 			    cmd_desc_idx;
569 			used->ring[used->idx & VIOBLK_QUEUE_MASK].len =
570 			    cmd_desc->len;
571 			used->idx++;
572 
573 			dev->vq[dev->cfg.queue_notify].last_avail = avail->idx &
574 			    VIOBLK_QUEUE_MASK;
575 
576 			if (write_mem(q_gpa, vr, vr_sz)) {
577 				log_warnx("vioblk: error writing vio ring");
578 			}
579 			break;
580 		case VIRTIO_BLK_T_OUT:
581 			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
582 			secdata_desc = &desc[secdata_desc_idx];
583 
584 			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
585 				log_warnx("wr vioblk: unchained vioblk data "
586 				    "descriptor received (idx %d)",
587 				    cmd_desc_idx);
588 				goto out;
589 			}
590 
591 			if (secdata_desc->len > dev->max_xfer) {
592 				log_warnx("%s: invalid read size %d requested",
593 				    __func__, secdata_desc->len);
594 				goto out;
595 			}
596 
597 			secbias = 0;
598 			do {
599 				struct ioinfo *info;
600 
601 				info = vioblk_start_write(dev,
602 				    cmd.sector + secbias,
603 				    secdata_desc->addr, secdata_desc->len);
604 
605 				if (info == NULL) {
606 					log_warnx("wr vioblk: can't read "
607 					    "sector data @ 0x%llx",
608 					    secdata_desc->addr);
609 					dump_descriptor_chain(desc,
610 					    cmd_desc_idx);
611 					goto out;
612 				}
613 
614 				if (vioblk_finish_write(info)) {
615 					log_warnx("wr vioblk: disk write "
616 					    "error");
617 					vioblk_free_info(info);
618 					goto out;
619 				}
620 
621 				vioblk_free_info(info);
622 
623 				secbias += secdata_desc->len /
624 				    VIRTIO_BLK_SECTOR_SIZE;
625 
626 				secdata_desc_idx = secdata_desc->next &
627 				    VIOBLK_QUEUE_MASK;
628 				secdata_desc = &desc[secdata_desc_idx];
629 			} while (secdata_desc->flags & VRING_DESC_F_NEXT);
630 
631 			ds_desc_idx = secdata_desc_idx;
632 			ds_desc = secdata_desc;
633 
634 			ds = VIRTIO_BLK_S_OK;
635 			if (write_mem(ds_desc->addr, &ds, ds_desc->len)) {
636 				log_warnx("wr vioblk: can't write device "
637 				    "status data @ 0x%llx", ds_desc->addr);
638 				dump_descriptor_chain(desc, cmd_desc_idx);
639 				goto out;
640 			}
641 
642 			ret = 1;
643 			dev->cfg.isr_status = 1;
644 			used->ring[used->idx & VIOBLK_QUEUE_MASK].id =
645 			    cmd_desc_idx;
646 			used->ring[used->idx & VIOBLK_QUEUE_MASK].len =
647 			    cmd_desc->len;
648 			used->idx++;
649 
650 			dev->vq[dev->cfg.queue_notify].last_avail = avail->idx &
651 			    VIOBLK_QUEUE_MASK;
652 			if (write_mem(q_gpa, vr, vr_sz))
653 				log_warnx("wr vioblk: error writing vio ring");
654 			break;
655 		case VIRTIO_BLK_T_FLUSH:
656 		case VIRTIO_BLK_T_FLUSH_OUT:
657 			ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
658 			ds_desc = &desc[ds_desc_idx];
659 
660 			ds = VIRTIO_BLK_S_OK;
661 			if (write_mem(ds_desc->addr, &ds, ds_desc->len)) {
662 				log_warnx("fl vioblk: "
663 				    "can't write device status "
664 				    "data @ 0x%llx", ds_desc->addr);
665 				dump_descriptor_chain(desc, cmd_desc_idx);
666 				goto out;
667 			}
668 
669 			ret = 1;
670 			dev->cfg.isr_status = 1;
671 			used->ring[used->idx & VIOBLK_QUEUE_MASK].id =
672 			    cmd_desc_idx;
673 			used->ring[used->idx & VIOBLK_QUEUE_MASK].len =
674 			    cmd_desc->len;
675 			used->idx++;
676 
677 			dev->vq[dev->cfg.queue_notify].last_avail = avail->idx &
678 			    VIOBLK_QUEUE_MASK;
679 			if (write_mem(q_gpa, vr, vr_sz)) {
680 				log_warnx("fl vioblk: error writing vio ring");
681 			}
682 			break;
683 		default:
684 			log_warnx("%s: unsupported command 0x%x", __func__,
685 			    cmd.type);
686 
687 			ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
688 			ds_desc = &desc[ds_desc_idx];
689 
690 			ds = VIRTIO_BLK_S_UNSUPP;
691 			if (write_mem(ds_desc->addr, &ds, ds_desc->len)) {
692 				log_warnx("%s: get id : can't write device "
693 				    "status data @ 0x%llx", __func__,
694 				    ds_desc->addr);
695 				dump_descriptor_chain(desc, cmd_desc_idx);
696 				goto out;
697 			}
698 
699 			ret = 1;
700 			dev->cfg.isr_status = 1;
701 			used->ring[used->idx & VIOBLK_QUEUE_MASK].id =
702 			    cmd_desc_idx;
703 			used->ring[used->idx & VIOBLK_QUEUE_MASK].len =
704 			    cmd_desc->len;
705 			used->idx++;
706 
707 			dev->vq[dev->cfg.queue_notify].last_avail = avail->idx &
708 			    VIOBLK_QUEUE_MASK;
709 			if (write_mem(q_gpa, vr, vr_sz)) {
710 				log_warnx("%s: get id : error writing vio ring",
711 				    __func__);
712 			}
713 			break;
714 		}
715 
716 		idx = (idx + 1) & VIOBLK_QUEUE_MASK;
717 	}
718 out:
719 	free(vr);
720 	return (ret);
721 }
722 
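/*
 * virtio_blk_io
 *
 * I/O space handler for the virtio block device.  Device config space
 * (VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI) exposes the capacity in 512-byte
 * sectors in bytes 0-7 and the maximum transfer size in bytes 8-11; the
 * byte-wise cases below handle guests that read the config space with
 * 1- or 2-byte accesses.  For example, a 1-byte read at offset +3
 * returns bits 24-31 of the capacity.
 */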
723 int
724 virtio_blk_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
725     void *cookie, uint8_t sz)
726 {
727 	struct vioblk_dev *dev = (struct vioblk_dev *)cookie;
728 
729 	*intr = 0xFF;
730 
731 
732 	if (dir == 0) {
733 		switch (reg) {
734 		case VIRTIO_CONFIG_DEVICE_FEATURES:
735 		case VIRTIO_CONFIG_QUEUE_SIZE:
736 		case VIRTIO_CONFIG_ISR_STATUS:
737 			log_warnx("%s: illegal write %x to %s",
738 			    __progname, *data, virtio_reg_name(reg));
739 			break;
740 		case VIRTIO_CONFIG_GUEST_FEATURES:
741 			dev->cfg.guest_feature = *data;
742 			break;
743 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
744 			dev->cfg.queue_address = *data;
745 			vioblk_update_qa(dev);
746 			break;
747 		case VIRTIO_CONFIG_QUEUE_SELECT:
748 			dev->cfg.queue_select = *data;
749 			vioblk_update_qs(dev);
750 			break;
751 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
752 			dev->cfg.queue_notify = *data;
753 			if (vioblk_notifyq(dev))
754 				*intr = 1;
755 			break;
756 		case VIRTIO_CONFIG_DEVICE_STATUS:
757 			dev->cfg.device_status = *data;
758 			if (dev->cfg.device_status == 0) {
759 				log_debug("%s: device reset", __func__);
760 				dev->cfg.guest_feature = 0;
761 				dev->cfg.queue_address = 0;
762 				vioblk_update_qa(dev);
763 				dev->cfg.queue_size = 0;
764 				vioblk_update_qs(dev);
765 				dev->cfg.queue_select = 0;
766 				dev->cfg.queue_notify = 0;
767 				dev->cfg.isr_status = 0;
768 				dev->vq[0].last_avail = 0;
769 				vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
770 			}
771 			break;
772 		default:
773 			break;
774 		}
775 	} else {
776 		switch (reg) {
777 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
778 			switch (sz) {
779 			case 4:
780 				*data = (uint32_t)(dev->sz);
781 				break;
782 			case 2:
783 				*data &= 0xFFFF0000;
784 				*data |= (uint32_t)(dev->sz) & 0xFFFF;
785 				break;
786 			case 1:
787 				*data &= 0xFFFFFF00;
788 				*data |= (uint32_t)(dev->sz) & 0xFF;
789 				break;
790 			}
791 			/* XXX handle invalid sz */
792 			break;
793 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
794 			if (sz == 1) {
795 				*data &= 0xFFFFFF00;
796 				*data |= (uint32_t)(dev->sz >> 8) & 0xFF;
797 			}
798 			/* XXX handle invalid sz */
799 			break;
800 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
801 			if (sz == 1) {
802 				*data &= 0xFFFFFF00;
803 				*data |= (uint32_t)(dev->sz >> 16) & 0xFF;
804 			} else if (sz == 2) {
805 				*data &= 0xFFFF0000;
806 				*data |= (uint32_t)(dev->sz >> 16) & 0xFFFF;
807 			}
808 			/* XXX handle invalid sz */
809 			break;
810 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
811 			if (sz == 1) {
812 				*data &= 0xFFFFFF00;
813 				*data |= (uint32_t)(dev->sz >> 24) & 0xFF;
814 			}
815 			/* XXX handle invalid sz */
816 			break;
817 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
818 			switch (sz) {
819 			case 4:
820 				*data = (uint32_t)(dev->sz >> 32);
821 				break;
822 			case 2:
823 				*data &= 0xFFFF0000;
824 				*data |= (uint32_t)(dev->sz >> 32) & 0xFFFF;
825 				break;
826 			case 1:
827 				*data &= 0xFFFFFF00;
828 				*data |= (uint32_t)(dev->sz >> 32) & 0xFF;
829 				break;
830 			}
831 			/* XXX handle invalid sz */
832 			break;
833 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
834 			if (sz == 1) {
835 				*data &= 0xFFFFFF00;
836 				*data |= (uint32_t)(dev->sz >> 40) & 0xFF;
837 			}
838 			/* XXX handle invalid sz */
839 			break;
840 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6:
841 			if (sz == 1) {
842 				*data &= 0xFFFFFF00;
843 				*data |= (uint32_t)(dev->sz >> 48) & 0xFF;
844 			} else if (sz == 2) {
845 				*data &= 0xFFFF0000;
846 				*data |= (uint32_t)(dev->sz >> 48) & 0xFFFF;
847 			}
848 			/* XXX handle invalid sz */
849 			break;
850 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7:
851 			if (sz == 1) {
852 				*data &= 0xFFFFFF00;
853 				*data |= (uint32_t)(dev->sz >> 56) & 0xFF;
854 			}
855 			/* XXX handle invalid sz */
856 			break;
857 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
858 			switch (sz) {
859 			case 4:
860 				*data = (uint32_t)(dev->max_xfer);
861 				break;
862 			case 2:
863 				*data &= 0xFFFF0000;
864 				*data |= (uint32_t)(dev->max_xfer) & 0xFFFF;
865 				break;
866 			case 1:
867 				*data &= 0xFFFFFF00;
868 				*data |= (uint32_t)(dev->max_xfer) & 0xFF;
869 				break;
870 			}
871 			/* XXX handle invalid sz */
872 			break;
873 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 9:
874 			if (sz == 1) {
875 				*data &= 0xFFFFFF00;
876 				*data |= (uint32_t)(dev->max_xfer >> 8) & 0xFF;
877 			}
878 			/* XXX handle invalid sz */
879 			break;
880 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10:
881 			if (sz == 1) {
882 				*data &= 0xFFFFFF00;
883 				*data |= (uint32_t)(dev->max_xfer >> 16) & 0xFF;
884 			} else if (sz == 2) {
885 				*data &= 0xFFFF0000;
886 				*data |= (uint32_t)(dev->max_xfer >> 16)
887 				    & 0xFFFF;
888 			}
889 			/* XXX handle invalid sz */
890 			break;
891 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11:
892 			if (sz == 1) {
893 				*data &= 0xFFFFFF00;
894 				*data |= (uint32_t)(dev->max_xfer >> 24) & 0xFF;
895 			}
896 			/* XXX handle invalid sz */
897 			break;
898 		case VIRTIO_CONFIG_DEVICE_FEATURES:
899 			*data = dev->cfg.device_feature;
900 			break;
901 		case VIRTIO_CONFIG_GUEST_FEATURES:
902 			*data = dev->cfg.guest_feature;
903 			break;
904 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
905 			*data = dev->cfg.queue_address;
906 			break;
907 		case VIRTIO_CONFIG_QUEUE_SIZE:
908 			if (sz == 4)
909 				*data = dev->cfg.queue_size;
910 			else if (sz == 2) {
911 				*data &= 0xFFFF0000;
912 				*data |= (uint16_t)dev->cfg.queue_size;
913 			} else if (sz == 1) {
914 				*data &= 0xFFFFFF00;
915 				*data |= (uint8_t)dev->cfg.queue_size;
916 			}
917 			break;
918 		case VIRTIO_CONFIG_QUEUE_SELECT:
919 			*data = dev->cfg.queue_select;
920 			break;
921 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
922 			*data = dev->cfg.queue_notify;
923 			break;
924 		case VIRTIO_CONFIG_DEVICE_STATUS:
925 			if (sz == 4)
926 				*data = dev->cfg.device_status;
927 			else if (sz == 2) {
928 				*data &= 0xFFFF0000;
929 				*data |= (uint16_t)dev->cfg.device_status;
930 			} else if (sz == 1) {
931 				*data &= 0xFFFFFF00;
932 				*data |= (uint8_t)dev->cfg.device_status;
933 			}
934 			break;
935 		case VIRTIO_CONFIG_ISR_STATUS:
936 			*data = dev->cfg.isr_status;
937 			dev->cfg.isr_status = 0;
938 			vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
939 			break;
940 		}
941 	}
942 	return (0);
943 }
944 
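/*
 * virtio_net_io
 *
 * I/O space handler for the virtio network device.  Config space bytes
 * 0-5 expose the interface's MAC address.  The device mutex is taken
 * because the rx event thread also updates the same state.
 */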
945 int
946 virtio_net_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
947     void *cookie, uint8_t sz)
948 {
949 	struct vionet_dev *dev = (struct vionet_dev *)cookie;
950 
951 	*intr = 0xFF;
952 	mutex_lock(&dev->mutex);
953 
954 	if (dir == 0) {
955 		switch (reg) {
956 		case VIRTIO_CONFIG_DEVICE_FEATURES:
957 		case VIRTIO_CONFIG_QUEUE_SIZE:
958 		case VIRTIO_CONFIG_ISR_STATUS:
959 			log_warnx("%s: illegal write %x to %s",
960 			    __progname, *data, virtio_reg_name(reg));
961 			break;
962 		case VIRTIO_CONFIG_GUEST_FEATURES:
963 			dev->cfg.guest_feature = *data;
964 			break;
965 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
966 			dev->cfg.queue_address = *data;
967 			vionet_update_qa(dev);
968 			break;
969 		case VIRTIO_CONFIG_QUEUE_SELECT:
970 			dev->cfg.queue_select = *data;
971 			vionet_update_qs(dev);
972 			break;
973 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
974 			dev->cfg.queue_notify = *data;
975 			if (vionet_notifyq(dev))
976 				*intr = 1;
977 			break;
978 		case VIRTIO_CONFIG_DEVICE_STATUS:
979 			dev->cfg.device_status = *data;
980 			if (dev->cfg.device_status == 0) {
981 				log_debug("%s: device reset", __func__);
982 				dev->cfg.guest_feature = 0;
983 				dev->cfg.queue_address = 0;
984 				vionet_update_qa(dev);
985 				dev->cfg.queue_size = 0;
986 				vionet_update_qs(dev);
987 				dev->cfg.queue_select = 0;
988 				dev->cfg.queue_notify = 0;
989 				dev->cfg.isr_status = 0;
990 				dev->vq[RXQ].last_avail = 0;
991 				dev->vq[RXQ].notified_avail = 0;
992 				dev->vq[TXQ].last_avail = 0;
993 				dev->vq[TXQ].notified_avail = 0;
994 				vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
995 			}
996 			break;
997 		default:
998 			break;
999 		}
1000 	} else {
1001 		switch (reg) {
1002 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
1003 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
1004 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
1005 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
1006 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
1007 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
1008 			*data = dev->mac[reg -
1009 			    VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
1010 			break;
1011 		case VIRTIO_CONFIG_DEVICE_FEATURES:
1012 			*data = dev->cfg.device_feature;
1013 			break;
1014 		case VIRTIO_CONFIG_GUEST_FEATURES:
1015 			*data = dev->cfg.guest_feature;
1016 			break;
1017 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
1018 			*data = dev->cfg.queue_address;
1019 			break;
1020 		case VIRTIO_CONFIG_QUEUE_SIZE:
1021 			*data = dev->cfg.queue_size;
1022 			break;
1023 		case VIRTIO_CONFIG_QUEUE_SELECT:
1024 			*data = dev->cfg.queue_select;
1025 			break;
1026 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1027 			*data = dev->cfg.queue_notify;
1028 			break;
1029 		case VIRTIO_CONFIG_DEVICE_STATUS:
1030 			*data = dev->cfg.device_status;
1031 			break;
1032 		case VIRTIO_CONFIG_ISR_STATUS:
1033 			*data = dev->cfg.isr_status;
1034 			dev->cfg.isr_status = 0;
1035 			vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
1036 			break;
1037 		}
1038 	}
1039 
1040 	mutex_unlock(&dev->mutex);
1041 	return (0);
1042 }
1043 
1044 /*
1045  * Must be called with dev->mutex acquired.
1046  */
1047 void
1048 vionet_update_qa(struct vionet_dev *dev)
1049 {
1050 	/* Invalid queue? */
1051 	if (dev->cfg.queue_select > 1)
1052 		return;
1053 
1054 	dev->vq[dev->cfg.queue_select].qa = dev->cfg.queue_address;
1055 }
1056 
1057 /*
1058  * Must be called with dev->mutex acquired.
1059  */
1060 void
1061 vionet_update_qs(struct vionet_dev *dev)
1062 {
1063 	/* Invalid queue? */
1064 	if (dev->cfg.queue_select > 1) {
1065 		dev->cfg.queue_size = 0;
1066 		return;
1067 	}
1068 
1069 	/* Update queue address/size based on queue select */
1070 	dev->cfg.queue_address = dev->vq[dev->cfg.queue_select].qa;
1071 	dev->cfg.queue_size = dev->vq[dev->cfg.queue_select].qs;
1072 }
1073 
1074 /*
1075  * Must be called with dev->mutex acquired.
1076  */
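/*
 * vionet_enq_rx
 *
 * Copies a single received packet into the guest's receive queue,
 * prepending a struct virtio_net_hdr, and updates the used ring.
 * On success, *spc is set to the number of receive descriptors still
 * available and 1 is returned.
 */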
1077 int
1078 vionet_enq_rx(struct vionet_dev *dev, char *pkt, ssize_t sz, int *spc)
1079 {
1080 	uint64_t q_gpa;
1081 	uint32_t vr_sz;
1082 	uint16_t idx, pkt_desc_idx, hdr_desc_idx;
1083 	ptrdiff_t off;
1084 	int ret;
1085 	char *vr;
1086 	ssize_t rem;
1087 	struct vring_desc *desc, *pkt_desc, *hdr_desc;
1088 	struct vring_avail *avail;
1089 	struct vring_used *used;
1090 	struct vring_used_elem *ue;
1091 	struct virtio_net_hdr hdr;
1092 
1093 	ret = 0;
1094 
1095 	if (!(dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK))
1096 		return ret;
1097 
1098 	vr_sz = vring_size(VIONET_QUEUE_SIZE);
1099 	q_gpa = dev->vq[RXQ].qa;
1100 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
1101 
1102 	vr = calloc(1, vr_sz);
1103 	if (vr == NULL) {
1104 		log_warn("rx enq: calloc error getting vionet ring");
1105 		return (0);
1106 	}
1107 
1108 	if (read_mem(q_gpa, vr, vr_sz)) {
1109 		log_warnx("rx enq: error reading gpa 0x%llx", q_gpa);
1110 		goto out;
1111 	}
1112 
1113 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
1114 	desc = (struct vring_desc *)(vr);
1115 	avail = (struct vring_avail *)(vr + dev->vq[RXQ].vq_availoffset);
1116 	used = (struct vring_used *)(vr + dev->vq[RXQ].vq_usedoffset);
1117 
1118 	idx = dev->vq[RXQ].last_avail & VIONET_QUEUE_MASK;
1119 
1120 	if ((dev->vq[RXQ].notified_avail & VIONET_QUEUE_MASK) == idx) {
1121 		log_debug("vionet queue notify - no space, dropping packet");
1122 		goto out;
1123 	}
1124 
1125 	hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
1126 	hdr_desc = &desc[hdr_desc_idx];
1127 
1128 	pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK;
1129 	pkt_desc = &desc[pkt_desc_idx];
1130 
1131 	/* Set up the virtio header (written first, before the packet data) */
1132 	memset(&hdr, 0, sizeof(struct virtio_net_hdr));
1133 	hdr.hdr_len = sizeof(struct virtio_net_hdr);
1134 
1135 	/* Check size of header descriptor */
1136 	if (hdr_desc->len < sizeof(struct virtio_net_hdr)) {
1137 		log_warnx("%s: invalid header descriptor (too small)",
1138 		    __func__);
1139 		goto out;
1140 	}
1141 
1142 	/* Write out virtio header */
1143 	if (write_mem(hdr_desc->addr, &hdr, sizeof(struct virtio_net_hdr))) {
1144 		log_warnx("vionet: rx enq header write_mem error @ "
1145 		    "0x%llx", hdr_desc->addr);
1146 		goto out;
1147 	}
1148 
1149 	/*
1150 	 * Compute remaining space in the first (header) descriptor, and
1151 	 * copy the packet data after if space is available. Otherwise,
1152 	 * copy to the pkt_desc descriptor.
1153 	 */
1154 	rem = hdr_desc->len - sizeof(struct virtio_net_hdr);
1155 
1156 	if (rem >= sz) {
1157 		if (write_mem(hdr_desc->addr + sizeof(struct virtio_net_hdr),
1158 		    pkt, sz)) {
1159 			log_warnx("vionet: rx enq packet write_mem error @ "
1160 			    "0x%llx", pkt_desc->addr);
1161 			goto out;
1162 		}
1163 	} else {
1164 		/* Fallback to pkt_desc descriptor */
1165 		if ((uint64_t)pkt_desc->len >= (uint64_t)sz) {
1166 			/* The descriptor must be writable by the device */
1167 			if ((pkt_desc->flags & VRING_DESC_F_WRITE) == 0) {
1168 				log_warnx("unexpected readable rx desc %d",
1169 				    pkt_desc_idx);
1170 				goto out;
1171 			}
1172 
1173 			/* Write packet to descriptor ring */
1174 			if (write_mem(pkt_desc->addr, pkt, sz)) {
1175 				log_warnx("vionet: rx enq packet write_mem "
1176 				    "error @ 0x%llx", pkt_desc->addr);
1177 				goto out;
1178 			}
1179 		} else {
1180 			log_warnx("%s: descriptor too small for packet data",
1181 			    __func__);
1182 			goto out;
1183 		}
1184 	}
1185 
1186 	ret = 1;
1187 	dev->cfg.isr_status = 1;
1188 	ue = &used->ring[used->idx & VIONET_QUEUE_MASK];
1189 	ue->id = hdr_desc_idx;
1190 	ue->len = sz + sizeof(struct virtio_net_hdr);
1191 	used->idx++;
1192 	dev->vq[RXQ].last_avail++;
1193 	*spc = dev->vq[RXQ].notified_avail - dev->vq[RXQ].last_avail;
1194 
1195 	off = (char *)ue - vr;
1196 	if (write_mem(q_gpa + off, ue, sizeof *ue))
1197 		log_warnx("vionet: error writing vio ring");
1198 	else {
1199 		off = (char *)&used->idx - vr;
1200 		if (write_mem(q_gpa + off, &used->idx, sizeof used->idx))
1201 			log_warnx("vionet: error writing vio ring");
1202 	}
1203 out:
1204 	free(vr);
1205 	return (ret);
1206 }
1207 
1208 /*
1209  * vionet_rx
1210  *
1211  * Enqueue data that was received on a tap file descriptor
1212  * to the vionet device queue.
1213  *
1214  * Must be called with dev->mutex acquired.
1215  */
1216 static int
1217 vionet_rx(struct vionet_dev *dev)
1218 {
1219 	char buf[PAGE_SIZE];
1220 	int hasdata, num_enq = 0, spc = 0;
1221 	struct ether_header *eh;
1222 	ssize_t sz;
1223 
1224 	do {
1225 		sz = read(dev->fd, buf, sizeof buf);
1226 		if (sz == -1) {
1227 			/*
1228 			 * If we get EAGAIN, no data is currently available;
1229 			 * do not treat this as an error.
1230 			 */
1231 			if (errno != EAGAIN)
1232 				log_warn("unexpected read error on vionet "
1233 				    "device");
1234 		} else if (sz != 0) {
1235 			eh = (struct ether_header *)buf;
1236 			if (!dev->lockedmac || sz < ETHER_HDR_LEN ||
1237 			    ETHER_IS_MULTICAST(eh->ether_dhost) ||
1238 			    memcmp(eh->ether_dhost, dev->mac,
1239 			    sizeof(eh->ether_dhost)) == 0)
1240 				num_enq += vionet_enq_rx(dev, buf, sz, &spc);
1241 		} else if (sz == 0) {
1242 			log_debug("process_rx: no data");
1243 			hasdata = 0;
1244 			break;
1245 		}
1246 
1247 		hasdata = fd_hasdata(dev->fd);
1248 	} while (spc && hasdata);
1249 
1250 	dev->rx_pending = hasdata;
1251 	return (num_enq);
1252 }
1253 
1254 /*
1255  * vionet_rx_event
1256  *
1257  * Called from the event handling thread when new data can be
1258  * received on the tap fd of a vionet device.
1259  */
1260 static void
1261 vionet_rx_event(int fd, short kind, void *arg)
1262 {
1263 	struct vionet_dev *dev = arg;
1264 
1265 	mutex_lock(&dev->mutex);
1266 
1267 	/*
1268 	 * We already have other data pending to be received. The data that
1269 	 * has become available now will be enqueued to the vionet_dev
1270 	 * later.
1271 	 */
1272 	if (dev->rx_pending) {
1273 		mutex_unlock(&dev->mutex);
1274 		return;
1275 	}
1276 
1277 	if (vionet_rx(dev) > 0) {
1278 		/* XXX: vcpu_id */
1279 		vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq);
1280 	}
1281 
1282 	mutex_unlock(&dev->mutex);
1283 }
1284 
1285 /*
1286  * vionet_process_rx
1287  *
1288  * Processes any remaining pending receivable data for a vionet device.
1289  * Called on VCPU exit. Although we poll on the tap file descriptor of
1290  * a vionet_dev in a separate thread, this function still needs to be
1291  * called on VCPU exit: it can happen that not all data fits into the
1292  * receive queue of the vionet_dev immediately. So any outstanding data
1293  * is handled here.
1294  *
1295  * Parameters:
1296  *  vm_id: VM ID of the VM for which to process vionet events
1297  */
1298 void
1299 vionet_process_rx(uint32_t vm_id)
1300 {
1301 	int i;
1302 
1303 	for (i = 0; i < nr_vionet; i++) {
1304 		mutex_lock(&vionet[i].mutex);
1305 		if (!vionet[i].rx_added) {
1306 			mutex_unlock(&vionet[i].mutex);
1307 			continue;
1308 		}
1309 
1310 		if (vionet[i].rx_pending) {
1311 			if (vionet_rx(&vionet[i])) {
1312 				vcpu_assert_pic_irq(vm_id, 0, vionet[i].irq);
1313 			}
1314 		}
1315 		mutex_unlock(&vionet[i].mutex);
1316 	}
1317 }
1318 
1319 /*
1320  * Must be called with dev->mutex acquired.
1321  */
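/*
 * vionet_notify_rx
 *
 * Handles a notification on the receive queue: records how far the
 * guest has filled the available ring (notified_avail) and marks the
 * device as having receive buffers (rx_added).
 */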
1322 void
1323 vionet_notify_rx(struct vionet_dev *dev)
1324 {
1325 	uint64_t q_gpa;
1326 	uint32_t vr_sz;
1327 	char *vr;
1328 	struct vring_avail *avail;
1329 
1330 	vr_sz = vring_size(VIONET_QUEUE_SIZE);
1331 	q_gpa = dev->vq[RXQ].qa;
1332 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
1333 
1334 	vr = malloc(vr_sz);
1335 	if (vr == NULL) {
1336 		log_warn("malloc error getting vionet ring");
1337 		return;
1338 	}
1339 
1340 	if (read_mem(q_gpa, vr, vr_sz)) {
1341 		log_warnx("error reading gpa 0x%llx", q_gpa);
1342 		free(vr);
1343 		return;
1344 	}
1345 
1346 	/* Compute offset into avail ring */
1347 	avail = (struct vring_avail *)(vr + dev->vq[RXQ].vq_availoffset);
1348 
1349 	dev->rx_added = 1;
1350 	dev->vq[RXQ].notified_avail = avail->idx - 1;
1351 
1352 	free(vr);
1353 }
1354 
1355 /*
1356  * Must be called with dev->mutex acquired.
1357  */
1358 int
1359 vionet_notifyq(struct vionet_dev *dev)
1360 {
1361 	int ret;
1362 
1363 	switch (dev->cfg.queue_notify) {
1364 	case RXQ:
1365 		vionet_notify_rx(dev);
1366 		ret = 0;
1367 		break;
1368 	case TXQ:
1369 		ret = vionet_notify_tx(dev);
1370 		break;
1371 	default:
1372 		/*
1373 		 * Catch the unimplemented queue ID 2 (control queue) as
1374 		 * well as any bogus queue IDs.
1375 		 */
1376 		log_debug("%s: notify for unimplemented queue ID %d",
1377 		    __func__, dev->cfg.queue_notify);
1378 		ret = 0;
1379 		break;
1380 	}
1381 
1382 	return (ret);
1383 }
1384 
1385 /*
1386  * Must be called with dev->mutex acquired.
1387  *
1388  * XXX We can't trust ring data from the VM, so be extra cautious.
1389  */
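/*
 * vionet_notify_tx
 *
 * Handles a notification on the transmit queue: reassembles each
 * pending packet from its descriptor chain, applies the locked-MAC and
 * local-interface (DHCP) checks, and writes the packet to the tap
 * device.  Returns 1 if the guest should be interrupted.
 */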
1390 int
1391 vionet_notify_tx(struct vionet_dev *dev)
1392 {
1393 	uint64_t q_gpa;
1394 	uint32_t vr_sz;
1395 	uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx;
1396 	size_t pktsz;
1397 	ssize_t dhcpsz;
1398 	int ret, num_enq, ofs, spc;
1399 	char *vr, *pkt, *dhcppkt;
1400 	struct vring_desc *desc, *pkt_desc, *hdr_desc;
1401 	struct vring_avail *avail;
1402 	struct vring_used *used;
1403 	struct ether_header *eh;
1404 
1405 	vr = pkt = dhcppkt = NULL;
1406 	ret = spc = 0;
1407 	dhcpsz = 0;
1408 
1409 	vr_sz = vring_size(VIONET_QUEUE_SIZE);
1410 	q_gpa = dev->vq[TXQ].qa;
1411 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
1412 
1413 	vr = calloc(1, vr_sz);
1414 	if (vr == NULL) {
1415 		log_warn("calloc error getting vionet ring");
1416 		goto out;
1417 	}
1418 
1419 	if (read_mem(q_gpa, vr, vr_sz)) {
1420 		log_warnx("error reading gpa 0x%llx", q_gpa);
1421 		goto out;
1422 	}
1423 
1424 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
1425 	desc = (struct vring_desc *)(vr);
1426 	avail = (struct vring_avail *)(vr + dev->vq[TXQ].vq_availoffset);
1427 	used = (struct vring_used *)(vr + dev->vq[TXQ].vq_usedoffset);
1428 
1429 	num_enq = 0;
1430 
1431 	idx = dev->vq[TXQ].last_avail & VIONET_QUEUE_MASK;
1432 
1433 	if ((avail->idx & VIONET_QUEUE_MASK) == idx) {
1434 		log_warnx("vionet tx queue notify - nothing to do?");
1435 		goto out;
1436 	}
1437 
1438 	while ((avail->idx & VIONET_QUEUE_MASK) != idx) {
1439 		hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
1440 		hdr_desc = &desc[hdr_desc_idx];
1441 		pktsz = 0;
1442 
1443 		dxx = hdr_desc_idx;
1444 		do {
1445 			pktsz += desc[dxx].len;
1446 			dxx = desc[dxx].next;
1447 		} while (desc[dxx].flags & VRING_DESC_F_NEXT);
1448 
1449 		pktsz += desc[dxx].len;
1450 
1451 		/* Remove virtio header descriptor len */
1452 		pktsz -= hdr_desc->len;
1453 
1454 		/*
1455 		 * XXX sanity-check pktsz
1456 		 * XXX add "too long" and > PAGE_SIZE checks
1457 		 *     (PAGE_SIZE can be relaxed to 16384 later)
1458 		 */
1459 		pkt = malloc(pktsz);
1460 		if (pkt == NULL) {
1461 			log_warn("malloc error alloc packet buf");
1462 			goto out;
1463 		}
1464 
1465 		ofs = 0;
1466 		pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK;
1467 		pkt_desc = &desc[pkt_desc_idx];
1468 
1469 		while (pkt_desc->flags & VRING_DESC_F_NEXT) {
1470 			/* Must not be a device-writable descriptor */
1471 			if (pkt_desc->flags & VRING_DESC_F_WRITE) {
1472 				log_warnx("unexpected writable tx desc "
1473 				    "%d", pkt_desc_idx);
1474 				goto out;
1475 			}
1476 
1477 			/* Read packet from descriptor ring */
1478 			if (read_mem(pkt_desc->addr, pkt + ofs,
1479 			    pkt_desc->len)) {
1480 				log_warnx("vionet: packet read_mem error "
1481 				    "@ 0x%llx", pkt_desc->addr);
1482 				goto out;
1483 			}
1484 
1485 			ofs += pkt_desc->len;
1486 			pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK;
1487 			pkt_desc = &desc[pkt_desc_idx];
1488 		}
1489 
1490 		/* Now handle the tail descriptor; it must not be device-writable either */
1491 		if (pkt_desc->flags & VRING_DESC_F_WRITE) {
1492 			log_warnx("unexpected writable tx descriptor %d",
1493 			    pkt_desc_idx);
1494 			goto out;
1495 		}
1496 
1497 		/* Read packet from descriptor ring */
1498 		if (read_mem(pkt_desc->addr, pkt + ofs,
1499 		    pkt_desc->len)) {
1500 			log_warnx("vionet: packet read_mem error @ "
1501 			    "0x%llx", pkt_desc->addr);
1502 			goto out;
1503 		}
1504 
1505 		/* reject other source addresses */
1506 		if (dev->lockedmac && pktsz >= ETHER_HDR_LEN &&
1507 		    (eh = (struct ether_header *)pkt) &&
1508 		    memcmp(eh->ether_shost, dev->mac,
1509 		    sizeof(eh->ether_shost)) != 0)
1510 			log_debug("vionet: wrong source address %s for vm %d",
1511 			    ether_ntoa((struct ether_addr *)
1512 			    eh->ether_shost), dev->vm_id);
1513 		else if (dev->local && dhcpsz == 0 &&
1514 		    (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) {
1515 			log_debug("vionet: dhcp request,"
1516 			    " local response size %zd", dhcpsz);
1517 
1518 		/* XXX signed vs unsigned here, funky cast */
1519 		} else if (write(dev->fd, pkt, pktsz) != (int)pktsz) {
1520 			log_warnx("vionet: tx failed writing to tap: "
1521 			    "%d", errno);
1522 			goto out;
1523 		}
1524 
1525 		ret = 1;
1526 		dev->cfg.isr_status = 1;
1527 		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx;
1528 		used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len;
1529 		used->idx++;
1530 
1531 		dev->vq[TXQ].last_avail++;
1532 		num_enq++;
1533 
1534 		idx = dev->vq[TXQ].last_avail & VIONET_QUEUE_MASK;
1535 
1536 		free(pkt);
1537 		pkt = NULL;
1538 	}
1539 
1540 	if (write_mem(q_gpa, vr, vr_sz)) {
1541 		log_warnx("vionet: tx error writing vio ring");
1542 	}
1543 
1544 	if (dhcpsz > 0) {
1545 		if (vionet_enq_rx(dev, dhcppkt, dhcpsz, &spc))
1546 			ret = 1;
1547 	}
1548 
1549 out:
1550 	free(vr);
1551 	free(pkt);
1552 	free(dhcppkt);
1553 
1554 	return (ret);
1555 }
1556 
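/*
 * The vmmci control device carries shutdown/reboot/RTC-sync requests
 * between vmd and the guest: vmmci_ctl() posts a command and raises a
 * config-change interrupt, and the guest acknowledges by writing the
 * command back to config space, which ends up in vmmci_ack().
 */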
1557 int
1558 vmmci_ctl(unsigned int cmd)
1559 {
1560 	struct timeval tv = { 0, 0 };
1561 
1562 	if ((vmmci.cfg.device_status &
1563 	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
1564 		return (-1);
1565 
1566 	if (cmd == vmmci.cmd)
1567 		return (0);
1568 
1569 	switch (cmd) {
1570 	case VMMCI_NONE:
1571 		break;
1572 	case VMMCI_SHUTDOWN:
1573 	case VMMCI_REBOOT:
1574 		/* Update command */
1575 		vmmci.cmd = cmd;
1576 
1577 		/*
1578 		 * vmm VMs do not support powerdown, send a reboot request
1579 		 * instead and turn it off after the triple fault.
1580 		 */
1581 		if (cmd == VMMCI_SHUTDOWN)
1582 			cmd = VMMCI_REBOOT;
1583 
1584 		/* Trigger interrupt */
1585 		vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
1586 		vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1587 
1588 		/* Add ACK timeout */
1589 		tv.tv_sec = VMMCI_TIMEOUT;
1590 		evtimer_add(&vmmci.timeout, &tv);
1591 		break;
1592 	case VMMCI_SYNCRTC:
1593 		if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
1594 			/* RTC updated, request guest VM resync of its RTC */
1595 			vmmci.cmd = cmd;
1596 
1597 			vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
1598 			vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1599 		} else {
1600 			log_debug("%s: RTC sync skipped (guest does not "
1601 			    "support RTC sync)\n", __func__);
1602 		}
1603 		break;
1604 	default:
1605 		fatalx("invalid vmmci command: %d", cmd);
1606 	}
1607 
1608 	return (0);
1609 }
1610 
1611 void
1612 vmmci_ack(unsigned int cmd)
1613 {
1614 	struct timeval	 tv = { 0, 0 };
1615 
1616 	switch (cmd) {
1617 	case VMMCI_NONE:
1618 		break;
1619 	case VMMCI_SHUTDOWN:
1620 		/*
1621 		 * If we don't have a pending shutdown request, the
1622 		 * shutdown was requested by the VM itself.  In this case
1623 		 * add a short timeout to give the VM a chance to reboot
1624 		 * before the timer expires.
1625 		 */
1626 		if (vmmci.cmd == 0) {
1627 			log_debug("%s: vm %u requested shutdown", __func__,
1628 			    vmmci.vm_id);
1629 			tv.tv_sec = VMMCI_TIMEOUT;
1630 			evtimer_add(&vmmci.timeout, &tv);
1631 			return;
1632 		}
1633 		/* FALLTHROUGH */
1634 	case VMMCI_REBOOT:
1635 		/*
1636 		 * If the VM acknowledged our shutdown request, give it
1637 		 * enough time to shutdown or reboot gracefully.  This
1638 		 * might take a considerable amount of time (running
1639 		 * rc.shutdown on the VM), so increase the timeout before
1640 		 * killing it forcefully.
1641 		 */
1642 		if (cmd == vmmci.cmd &&
1643 		    evtimer_pending(&vmmci.timeout, NULL)) {
1644 			log_debug("%s: vm %u acknowledged shutdown request",
1645 			    __func__, vmmci.vm_id);
1646 			tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
1647 			evtimer_add(&vmmci.timeout, &tv);
1648 		}
1649 		break;
1650 	case VMMCI_SYNCRTC:
1651 		log_debug("%s: vm %u acknowledged RTC sync request",
1652 		    __func__, vmmci.vm_id);
1653 		vmmci.cmd = VMMCI_NONE;
1654 		break;
1655 	default:
1656 		log_warnx("%s: illegal request %u", __func__, cmd);
1657 		break;
1658 	}
1659 }
1660 
1661 void
1662 vmmci_timeout(int fd, short type, void *arg)
1663 {
1664 	log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
1665 	vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
1666 }
1667 
1668 int
1669 vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
1670     void *unused, uint8_t sz)
1671 {
1672 	*intr = 0xFF;
1673 
1674 	if (dir == 0) {
1675 		switch (reg) {
1676 		case VIRTIO_CONFIG_DEVICE_FEATURES:
1677 		case VIRTIO_CONFIG_QUEUE_SIZE:
1678 		case VIRTIO_CONFIG_ISR_STATUS:
1679 			log_warnx("%s: illegal write %x to %s",
1680 			    __progname, *data, virtio_reg_name(reg));
1681 			break;
1682 		case VIRTIO_CONFIG_GUEST_FEATURES:
1683 			vmmci.cfg.guest_feature = *data;
1684 			break;
1685 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
1686 			vmmci.cfg.queue_address = *data;
1687 			break;
1688 		case VIRTIO_CONFIG_QUEUE_SELECT:
1689 			vmmci.cfg.queue_select = *data;
1690 			break;
1691 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1692 			vmmci.cfg.queue_notify = *data;
1693 			break;
1694 		case VIRTIO_CONFIG_DEVICE_STATUS:
1695 			vmmci.cfg.device_status = *data;
1696 			break;
1697 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
1698 			vmmci_ack(*data);
1699 			break;
1700 		}
1701 	} else {
1702 		switch (reg) {
1703 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
1704 			*data = vmmci.cmd;
1705 			break;
1706 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
1707 			/* Update time once when reading the first register */
1708 			gettimeofday(&vmmci.time, NULL);
1709 			*data = (uint64_t)vmmci.time.tv_sec;
1710 			break;
1711 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
1712 			*data = (uint64_t)vmmci.time.tv_sec << 32;
1713 			break;
1714 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
1715 			*data = (uint64_t)vmmci.time.tv_usec;
1716 			break;
1717 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
1718 			*data = (uint64_t)vmmci.time.tv_usec << 32;
1719 			break;
1720 		case VIRTIO_CONFIG_DEVICE_FEATURES:
1721 			*data = vmmci.cfg.device_feature;
1722 			break;
1723 		case VIRTIO_CONFIG_GUEST_FEATURES:
1724 			*data = vmmci.cfg.guest_feature;
1725 			break;
1726 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
1727 			*data = vmmci.cfg.queue_address;
1728 			break;
1729 		case VIRTIO_CONFIG_QUEUE_SIZE:
1730 			*data = vmmci.cfg.queue_size;
1731 			break;
1732 		case VIRTIO_CONFIG_QUEUE_SELECT:
1733 			*data = vmmci.cfg.queue_select;
1734 			break;
1735 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1736 			*data = vmmci.cfg.queue_notify;
1737 			break;
1738 		case VIRTIO_CONFIG_DEVICE_STATUS:
1739 			*data = vmmci.cfg.device_status;
1740 			break;
1741 		case VIRTIO_CONFIG_ISR_STATUS:
1742 			*data = vmmci.cfg.isr_status;
1743 			vmmci.cfg.isr_status = 0;
1744 			vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1745 			break;
1746 		}
1747 	}
1748 	return (0);
1749 }
1750 
1751 int
1752 virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
1753 {
1754 	switch (type) {
1755 	case VMDF_RAW:
1756 		return 0;
1757 	case VMDF_QCOW2:
1758 		return virtio_qcow2_get_base(fd, path, npath, dpath);
1759 	}
1760 	log_warnx("%s: invalid disk format", __func__);
1761 	return -1;
1762 }
1763 
1764 /*
1765  * Initializes a struct virtio_backing using the list of fds.
1766  */
1767 static int
1768 virtio_init_disk(struct virtio_backing *file, off_t *sz,
1769     int *fd, size_t nfd, int type)
1770 {
1771 	/*
1772 	 * Initialize the backing image according to the disk format type
1773 	 * chosen by the parent process.
1774 	 * TODO: provide a way of specifying additional options.
1774 	 */
1775 	switch (type) {
1776 	case VMDF_RAW:
1777 		return virtio_raw_init(file, sz, fd, nfd);
1778 	case VMDF_QCOW2:
1779 		return virtio_qcow2_init(file, sz, fd, nfd);
1780 	}
1781 	log_warnx("%s: invalid disk format", __func__);
1782 	return -1;
1783 }
1784 
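/*
 * virtio_init
 *
 * Creates and attaches the PCI virtio devices for a new VM: the entropy
 * device, one network device per configured NIC, one block device per
 * configured disk, an optional vioscsi cdrom, and the vmm control
 * device.
 */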
1785 void
1786 virtio_init(struct vmd_vm *vm, int child_cdrom,
1787     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1788 {
1789 	struct vmop_create_params *vmc = &vm->vm_params;
1790 	struct vm_create_params *vcp = &vmc->vmc_params;
1791 	uint8_t id;
1792 	uint8_t i;
1793 	int ret;
1794 
1795 	/* Virtio entropy device */
1796 	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1797 	    PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
1798 	    PCI_SUBCLASS_SYSTEM_MISC,
1799 	    PCI_VENDOR_OPENBSD,
1800 	    PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
1801 		log_warnx("%s: can't add PCI virtio rng device",
1802 		    __progname);
1803 		return;
1804 	}
1805 
1806 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
1807 		log_warnx("%s: can't add bar for virtio rng device",
1808 		    __progname);
1809 		return;
1810 	}
1811 
1812 	memset(&viornd, 0, sizeof(viornd));
1813 	viornd.vq[0].qs = VIORND_QUEUE_SIZE;
1814 	viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
1815 	    VIORND_QUEUE_SIZE;
1816 	viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
1817 	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
1818 	    + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
1819 	viornd.pci_id = id;
1820 	viornd.irq = pci_get_dev_irq(id);
1821 	viornd.vm_id = vcp->vcp_id;
1822 
1823 	if (vcp->vcp_nnics > 0) {
1824 		vionet = calloc(vcp->vcp_nnics, sizeof(struct vionet_dev));
1825 		if (vionet == NULL) {
1826 			log_warn("%s: calloc failure allocating vionets",
1827 			    __progname);
1828 			return;
1829 		}
1830 
1831 		nr_vionet = vcp->vcp_nnics;
1832 		/* Virtio network */
1833 		for (i = 0; i < vcp->vcp_nnics; i++) {
1834 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1835 			    PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
1836 			    PCI_SUBCLASS_SYSTEM_MISC,
1837 			    PCI_VENDOR_OPENBSD,
1838 			    PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
1839 				log_warnx("%s: can't add PCI virtio net device",
1840 				    __progname);
1841 				return;
1842 			}
1843 
1844 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_net_io,
1845 			    &vionet[i])) {
1846 				log_warnx("%s: can't add bar for virtio net "
1847 				    "device", __progname);
1848 				return;
1849 			}
1850 
1851 			ret = pthread_mutex_init(&vionet[i].mutex, NULL);
1852 			if (ret) {
1853 				errno = ret;
1854 				log_warn("%s: could not initialize mutex "
1855 				    "for vionet device", __progname);
1856 				return;
1857 			}
1858 
1859 			vionet[i].vq[RXQ].qs = VIONET_QUEUE_SIZE;
1860 			vionet[i].vq[RXQ].vq_availoffset =
1861 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
1862 			vionet[i].vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
1863 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
1864 			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
1865 			vionet[i].vq[RXQ].last_avail = 0;
1866 			vionet[i].vq[TXQ].qs = VIONET_QUEUE_SIZE;
1867 			vionet[i].vq[TXQ].vq_availoffset =
1868 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
1869 			vionet[i].vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
1870 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
1871 			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
1872 			vionet[i].vq[TXQ].last_avail = 0;
1873 			vionet[i].vq[TXQ].notified_avail = 0;
1874 			vionet[i].fd = child_taps[i];
1875 			vionet[i].rx_pending = 0;
1876 			vionet[i].vm_id = vcp->vcp_id;
1877 			vionet[i].vm_vmid = vm->vm_vmid;
1878 			vionet[i].irq = pci_get_dev_irq(id);
1879 
1880 			event_set(&vionet[i].event, vionet[i].fd,
1881 			    EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]);
1882 			if (event_add(&vionet[i].event, NULL)) {
1883 				log_warn("could not initialize vionet event "
1884 				    "handler");
1885 				return;
1886 			}
1887 
1888 			/* MAC address has been assigned by the parent */
1889 			memcpy(&vionet[i].mac, &vcp->vcp_macs[i], 6);
1890 			vionet[i].cfg.device_feature = VIRTIO_NET_F_MAC;
1891 
1892 			vionet[i].lockedmac =
1893 			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
1894 			vionet[i].local =
1895 			    vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
1896 			if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
1897 				vionet[i].pxeboot = 1;
1898 			vionet[i].idx = i;
1899 			vionet[i].pci_id = id;
1900 
1901 			log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
1902 			    __func__, vcp->vcp_name, i,
1903 			    ether_ntoa((void *)vionet[i].mac),
1904 			    vionet[i].lockedmac ? ", locked" : "",
1905 			    vionet[i].local ? ", local" : "",
1906 			    vionet[i].pxeboot ? ", pxeboot" : "");
1907 		}
1908 	}
1909 
1910 	if (vcp->vcp_ndisks > 0) {
1911 		nr_vioblk = vcp->vcp_ndisks;
1912 		vioblk = calloc(vcp->vcp_ndisks, sizeof(struct vioblk_dev));
1913 		if (vioblk == NULL) {
1914 			log_warn("%s: calloc failure allocating vioblks",
1915 			    __progname);
1916 			return;
1917 		}
1918 
1919 		/* One virtio block device for each disk defined in vcp */
1920 		for (i = 0; i < vcp->vcp_ndisks; i++) {
1921 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1922 			    PCI_PRODUCT_QUMRANET_VIO_BLOCK,
1923 			    PCI_CLASS_MASS_STORAGE,
1924 			    PCI_SUBCLASS_MASS_STORAGE_SCSI,
1925 			    PCI_VENDOR_OPENBSD,
1926 			    PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
1927 				log_warnx("%s: can't add PCI virtio block "
1928 				    "device", __progname);
1929 				return;
1930 			}
1931 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_blk_io,
1932 			    &vioblk[i])) {
1933 				log_warnx("%s: can't add bar for virtio block "
1934 				    "device", __progname);
1935 				return;
1936 			}
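			/* Single request virtqueue; same ring layout as the net queues above. */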
1937 			vioblk[i].vq[0].qs = VIOBLK_QUEUE_SIZE;
1938 			vioblk[i].vq[0].vq_availoffset =
1939 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
1940 			vioblk[i].vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
1941 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
1942 			    + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
1943 			vioblk[i].vq[0].last_avail = 0;
1944 			vioblk[i].cfg.device_feature = VIRTIO_BLK_F_SIZE_MAX;
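			/* 1 MB maximum I/O transfer size */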
1945 			vioblk[i].max_xfer = 1048576;
1946 			vioblk[i].pci_id = id;
1947 			vioblk[i].vm_id = vcp->vcp_id;
1948 			vioblk[i].irq = pci_get_dev_irq(id);
1949 			if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
1950 			    child_disks[i], vmc->vmc_diskbases[i],
1951 			    vmc->vmc_disktypes[i]) == -1) {
1952 				log_warnx("%s: unable to determine disk format",
1953 				    __func__);
1954 				return;
1955 			}
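			/* the virtio block capacity is expressed in 512-byte sectors */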
1956 			vioblk[i].sz /= 512;
1957 		}
1958 	}
1959 
1960 	/* vioscsi cdrom */
1961 	if (strlen(vcp->vcp_cdrom)) {
1962 		vioscsi = calloc(1, sizeof(struct vioscsi_dev));
1963 		if (vioscsi == NULL) {
1964 			log_warn("%s: calloc failure allocating vioscsi",
1965 			    __progname);
1966 			return;
1967 		}
1968 
1969 		if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1970 		    PCI_PRODUCT_QUMRANET_VIO_SCSI,
1971 		    PCI_CLASS_MASS_STORAGE,
1972 		    PCI_SUBCLASS_MASS_STORAGE_SCSI,
1973 		    PCI_VENDOR_OPENBSD,
1974 		    PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
1975 			log_warnx("%s: can't add PCI vioscsi device",
1976 			    __progname);
1977 			return;
1978 		}
1979 
1980 		if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
1981 			log_warnx("%s: can't add bar for vioscsi device",
1982 			    __progname);
1983 			return;
1984 		}
1985 
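		/* All vioscsi virtqueues share the same size and ring layout. */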
1986 		for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
1987 			vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
1988 			vioscsi->vq[i].vq_availoffset =
1989 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
1990 			vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
1991 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
1992 			    + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
1993 			vioscsi->vq[i].last_avail = 0;
1994 		}
1995 		if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
1996 		    &child_cdrom, 1, VMDF_RAW) == -1) {
1997 			log_warnx("%s: unable to determine iso format",
1998 			    __func__);
1999 			return;
2000 		}
2001 		vioscsi->locked = 0;
2002 		vioscsi->lba = 0;
2003 		vioscsi->n_blocks = vioscsi->sz >> 11; /* number of 2048-byte blocks in the file */
2004 		vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
2005 		vioscsi->pci_id = id;
2006 		vioscsi->vm_id = vcp->vcp_id;
2007 		vioscsi->irq = pci_get_dev_irq(id);
2008 	}
2009 
2010 	/* virtio control device */
2011 	if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
2012 	    PCI_PRODUCT_OPENBSD_CONTROL,
2013 	    PCI_CLASS_COMMUNICATIONS,
2014 	    PCI_SUBCLASS_COMMUNICATIONS_MISC,
2015 	    PCI_VENDOR_OPENBSD,
2016 	    PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
2017 		log_warnx("%s: can't add PCI vmm control device",
2018 		    __progname);
2019 		return;
2020 	}
2021 
2022 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
2023 		log_warnx("%s: can't add bar for vmm control device",
2024 		    __progname);
2025 		return;
2026 	}
2027 
2028 	memset(&vmmci, 0, sizeof(vmmci));
2029 	vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
2030 	    VMMCI_F_SYNCRTC;
2031 	vmmci.vm_id = vcp->vcp_id;
2032 	vmmci.irq = pci_get_dev_irq(id);
2033 	vmmci.pci_id = id;
2034 
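	/* timer used by vmmci requests that wait on a guest acknowledgement */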
2035 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
2036 }
2037 
2038 void
2039 virtio_shutdown(struct vmd_vm *vm)
2040 {
2041 	int i;
2042 
2043 	/* ensure that our disks are synced */
2044 	if (vioscsi != NULL)
2045 		vioscsi->file.close(vioscsi->file.p, 0);
2046 
2047 	for (i = 0; i < nr_vioblk; i++)
2048 		vioblk[i].file.close(vioblk[i].file.p, 0);
2049 }
2050 
2051 int
2052 vmmci_restore(int fd, uint32_t vm_id)
2053 {
2054 	log_debug("%s: receiving vmmci", __func__);
2055 	if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
2056 		log_warnx("%s: error reading vmmci from fd", __func__);
2057 		return (-1);
2058 	}
2059 
2060 	if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
2061 		log_warnx("%s: can't set bar fn for vmm control device",
2062 		    __progname);
2063 		return (-1);
2064 	}
2065 	vmmci.vm_id = vm_id;
2066 	vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
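	/* the saved libevent state is not valid in this process; rearm the timer from scratch */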
2067 	memset(&vmmci.timeout, 0, sizeof(struct event));
2068 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
2069 	return (0);
2070 }
2071 
2072 int
2073 viornd_restore(int fd, struct vm_create_params *vcp)
2074 {
2075 	log_debug("%s: receiving viornd", __func__);
2076 	if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
2077 		log_warnx("%s: error reading viornd from fd", __func__);
2078 		return (-1);
2079 	}
2080 	if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
2081 		log_warnx("%s: can't set bar fn for virtio rng device",
2082 		    __progname);
2083 		return (-1);
2084 	}
2085 	viornd.vm_id = vcp->vcp_id;
2086 	viornd.irq = pci_get_dev_irq(viornd.pci_id);
2087 
2088 	return (0);
2089 }
2090 
2091 int
2092 vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
2093 {
2094 	struct vmop_create_params *vmc = &vm->vm_params;
2095 	struct vm_create_params *vcp = &vmc->vmc_params;
2096 	uint8_t i;
2097 	int ret;
2098 
2099 	nr_vionet = vcp->vcp_nnics;
2100 	if (vcp->vcp_nnics > 0) {
2101 		vionet = calloc(vcp->vcp_nnics, sizeof(struct vionet_dev));
2102 		if (vionet == NULL) {
2103 			log_warn("%s: calloc failure allocating vionets",
2104 			    __progname);
2105 			return (-1);
2106 		}
2107 		log_debug("%s: receiving vionet", __func__);
2108 		if (atomicio(read, fd, vionet,
2109 		    vcp->vcp_nnics * sizeof(struct vionet_dev)) !=
2110 		    vcp->vcp_nnics * sizeof(struct vionet_dev)) {
2111 			log_warnx("%s: error reading vionet from fd",
2112 			    __func__);
2113 			return (-1);
2114 		}
2115 
2116 		/* Virtio network */
2117 		for (i = 0; i < vcp->vcp_nnics; i++) {
2118 			if (pci_set_bar_fn(vionet[i].pci_id, 0, virtio_net_io,
2119 			    &vionet[i])) {
2120 				log_warnx("%s: can't set bar fn for virtio net "
2121 				    "device", __progname);
2122 				return (-1);
2123 			}
2124 
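			/* mutexes and events do not survive the snapshot; reinitialize them */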
2125 			memset(&vionet[i].mutex, 0, sizeof(pthread_mutex_t));
2126 			ret = pthread_mutex_init(&vionet[i].mutex, NULL);
2127 
2128 			if (ret) {
2129 				errno = ret;
2130 				log_warn("%s: could not initialize mutex "
2131 				    "for vionet device", __progname);
2132 				return (-1);
2133 			}
2134 			vionet[i].fd = child_taps[i];
2135 			vionet[i].rx_pending = 0;
2136 			vionet[i].vm_id = vcp->vcp_id;
2137 			vionet[i].vm_vmid = vm->vm_vmid;
2138 			vionet[i].irq = pci_get_dev_irq(vionet[i].pci_id);
2139 
2140 			memset(&vionet[i].event, 0, sizeof(struct event));
2141 			event_set(&vionet[i].event, vionet[i].fd,
2142 			    EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]);
2143 		}
2144 	}
2145 	return (0);
2146 }
2147 
2148 int
2149 vioblk_restore(int fd, struct vmop_create_params *vmc,
2150     int child_disks[][VM_MAX_BASE_PER_DISK])
2151 {
2152 	struct vm_create_params *vcp = &vmc->vmc_params;
2153 	uint8_t i;
2154 
2155 	nr_vioblk = vcp->vcp_ndisks;
2156 	vioblk = calloc(vcp->vcp_ndisks, sizeof(struct vioblk_dev));
2157 	if (vioblk == NULL) {
2158 		log_warn("%s: calloc failure allocating vioblks", __progname);
2159 		return (-1);
2160 	}
2161 	log_debug("%s: receiving vioblk", __func__);
2162 	if (atomicio(read, fd, vioblk,
2163 	    nr_vioblk * sizeof(struct vioblk_dev)) !=
2164 	    nr_vioblk * sizeof(struct vioblk_dev)) {
2165 		log_warnx("%s: error reading vioblk from fd", __func__);
2166 		return (-1);
2167 	}
2168 	for (i = 0; i < vcp->vcp_ndisks; i++) {
2169 		if (pci_set_bar_fn(vioblk[i].pci_id, 0, virtio_blk_io,
2170 		    &vioblk[i])) {
2171 			log_warnx("%s: can't set bar fn for virtio block "
2172 			    "device", __progname);
2173 			return (-1);
2174 		}
2175 		if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
2176 		    child_disks[i], vmc->vmc_diskbases[i],
2177 		    vmc->vmc_disktypes[i]) == -1)  {
2178 			log_warnx("%s: unable to determine disk format",
2179 			    __func__);
2180 			return (-1);
2181 		}
2182 		vioblk[i].vm_id = vcp->vcp_id;
2183 		vioblk[i].irq = pci_get_dev_irq(vioblk[i].pci_id);
2184 	}
2185 	return (0);
2186 }
2187 
2188 int
2189 vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
2190 {
2191 	if (!strlen(vcp->vcp_cdrom))
2192 		return (0);
2193 
2194 	vioscsi = calloc(1, sizeof(struct vioscsi_dev));
2195 	if (vioscsi == NULL) {
2196 		log_warn("%s: calloc failure allocating vioscsi", __progname);
2197 		return (-1);
2198 	}
2199 
2200 	log_debug("%s: receiving vioscsi", __func__);
2201 
2202 	if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
2203 	    sizeof(struct vioscsi_dev)) {
2204 		log_warnx("%s: error reading vioscsi from fd", __func__);
2205 		return (-1);
2206 	}
2207 
2208 	if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
2209 		log_warnx("%s: can't set bar fn for vioscsi device",
2210 		    __progname);
2211 		return (-1);
2212 	}
2213 
2214 	if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
2215 	    VMDF_RAW) == -1) {
2216 		log_warnx("%s: unable to determine iso format", __func__);
2217 		return (-1);
2218 	}
2219 	vioscsi->vm_id = vcp->vcp_id;
2220 	vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
2221 
2222 	return (0);
2223 }
2224 
2225 int
2226 virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
2227     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
2228 {
2229 	struct vmop_create_params *vmc = &vm->vm_params;
2230 	struct vm_create_params *vcp = &vmc->vmc_params;
2231 	int ret;
2232 
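	/* restore devices in the same order virtio_dump() wrote them */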
2233 	if ((ret = viornd_restore(fd, vcp)) == -1)
2234 		return ret;
2235 
2236 	if ((ret = vioblk_restore(fd, vmc, child_disks)) == -1)
2237 		return ret;
2238 
2239 	if ((ret = vioscsi_restore(fd, vcp, child_cdrom)) == -1)
2240 		return ret;
2241 
2242 	if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
2243 		return ret;
2244 
2245 	if ((ret = vmmci_restore(fd, vcp->vcp_id)) == -1)
2246 		return ret;
2247 
2248 	return (0);
2249 }
2250 
2251 int
2252 viornd_dump(int fd)
2253 {
2254 	log_debug("%s: sending viornd", __func__);
2255 	if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
2256 		log_warnx("%s: error writing viornd to fd", __func__);
2257 		return (-1);
2258 	}
2259 	return (0);
2260 }
2261 
2262 int
2263 vmmci_dump(int fd)
2264 {
2265 	log_debug("%s: sending vmmci", __func__);
2266 	if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
2267 		log_warnx("%s: error writing vmmci to fd", __func__);
2268 		return (-1);
2269 	}
2270 	return (0);
2271 }
2272 
2273 int
2274 vionet_dump(int fd)
2275 {
2276 	log_debug("%s: sending vionet", __func__);
2277 	if (atomicio(vwrite, fd, vionet,
2278 	    nr_vionet * sizeof(struct vionet_dev)) !=
2279 	    nr_vionet * sizeof(struct vionet_dev)) {
2280 		log_warnx("%s: error writing vionet to fd", __func__);
2281 		return (-1);
2282 	}
2283 	return (0);
2284 }
2285 
2286 int
2287 vioblk_dump(int fd)
2288 {
2289 	log_debug("%s: sending vioblk", __func__);
2290 	if (atomicio(vwrite, fd, vioblk,
2291 	    nr_vioblk * sizeof(struct vioblk_dev)) !=
2292 	    nr_vioblk * sizeof(struct vioblk_dev)) {
2293 		log_warnx("%s: error writing vioblk to fd", __func__);
2294 		return (-1);
2295 	}
2296 	return (0);
2297 }
2298 
2299 int
2300 vioscsi_dump(int fd)
2301 {
2302 	if (vioscsi == NULL)
2303 		return (0);
2304 
2305 	log_debug("%s: sending vioscsi", __func__);
2306 	if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
2307 	    sizeof(struct vioscsi_dev)) {
2308 		log_warnx("%s: error writing vioscsi to fd", __func__);
2309 		return (-1);
2310 	}
2311 	return (0);
2312 }
2313 
2314 int
2315 virtio_dump(int fd)
2316 {
2317 	int ret;
2318 
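	/* keep this order in sync with virtio_restore() */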
2319 	if ((ret = viornd_dump(fd)) == -1)
2320 		return ret;
2321 
2322 	if ((ret = vioblk_dump(fd)) == -1)
2323 		return ret;
2324 
2325 	if ((ret = vioscsi_dump(fd)) == -1)
2326 		return ret;
2327 
2328 	if ((ret = vionet_dump(fd)) == -1)
2329 		return ret;
2330 
2331 	if ((ret = vmmci_dump(fd)) == -1)
2332 		return ret;
2333 
2334 	return (0);
2335 }
2336 
2337 void
2338 virtio_stop(struct vm_create_params *vcp)
2339 {
2340 	uint8_t i;
2341 	for (i = 0; i < vcp->vcp_nnics; i++) {
2342 		if (event_del(&vionet[i].event)) {
2343 			log_warn("could not delete vionet event "
2344 			    "handler");
2345 			return;
2346 		}
2347 	}
2348 }
2349 
2350 void
2351 virtio_start(struct vm_create_params *vcp)
2352 {
2353 	uint8_t i;
2354 	for (i = 0; i < vcp->vcp_nnics; i++) {
2355 		if (event_add(&vionet[i].event, NULL)) {
2356 			log_warn("could not add vionet event "
2357 			    "handler");
2358 			return;
2359 		}
2360 	}
2361 }
2362