1 /*	$OpenBSD: virtio.c,v 1.97 2021/08/29 18:01:32 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE */
20 #include <sys/socket.h>
21 
22 #include <machine/vmmvar.h>
23 #include <dev/pci/pcireg.h>
24 #include <dev/pci/pcidevs.h>
25 #include <dev/pv/virtioreg.h>
26 #include <dev/pci/virtio_pcireg.h>
27 #include <dev/pv/vioblkreg.h>
28 #include <dev/pv/vioscsireg.h>
29 
30 #include <net/if.h>
31 #include <netinet/in.h>
32 #include <netinet/if_ether.h>
33 #include <netinet/ip.h>
34 
35 #include <errno.h>
36 #include <event.h>
37 #include <poll.h>
38 #include <stddef.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 
43 #include "atomicio.h"
44 #include "pci.h"
45 #include "vioscsi.h"
46 #include "virtio.h"
47 #include "vmd.h"
48 #include "vmm.h"
49 
50 extern char *__progname;
51 struct viornd_dev viornd;
52 struct vioblk_dev *vioblk;
53 struct vionet_dev *vionet;
54 struct vioscsi_dev *vioscsi;
55 struct vmmci_dev vmmci;
56 
57 int nr_vionet;
58 int nr_vioblk;
59 
60 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
61 
62 #define VIRTIO_NET_F_MAC	(1<<5)
63 
64 #define VMMCI_F_TIMESYNC	(1<<0)
65 #define VMMCI_F_ACK		(1<<1)
66 #define VMMCI_F_SYNCRTC		(1<<2)
67 
68 #define RXQ	0
69 #define TXQ	1
70 
71 const char *
72 vioblk_cmd_name(uint32_t type)
73 {
74 	switch (type) {
75 	case VIRTIO_BLK_T_IN: return "read";
76 	case VIRTIO_BLK_T_OUT: return "write";
77 	case VIRTIO_BLK_T_SCSI_CMD: return "scsi read";
78 	case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write";
79 	case VIRTIO_BLK_T_FLUSH: return "flush";
80 	case VIRTIO_BLK_T_FLUSH_OUT: return "flush out";
81 	case VIRTIO_BLK_T_GET_ID: return "get id";
82 	default: return "unknown";
83 	}
84 }
85 
86 static const char *
87 virtio_reg_name(uint8_t reg)
88 {
89 	switch (reg) {
90 	case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
91 	case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
92 	case VIRTIO_CONFIG_QUEUE_ADDRESS: return "queue address";
93 	case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
94 	case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
95 	case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
96 	case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
97 	case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
98 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: return "device config 0";
99 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: return "device config 1";
100 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
101 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
102 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
103 	default: return "unknown";
104 	}
105 }
106 
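/*
 * vring_size
 *
 * Returns the number of bytes occupied by a legacy (split) virtqueue of
 * vq_size entries: the descriptor table plus the avail ring, rounded up by
 * VIRTQUEUE_ALIGN(), followed by the used ring, also rounded up.  With the
 * usual 16-byte descriptors and 8-byte used-ring elements this works out to
 * ALIGN(16 * vq_size + 2 * (2 + vq_size)) + ALIGN(2 * 2 + 8 * vq_size).
 */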
107 uint32_t
108 vring_size(uint32_t vq_size)
109 {
110 	uint32_t allocsize1, allocsize2;
111 
112 	/* allocsize1: descriptor table + avail ring + pad */
113 	allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
114 	    + sizeof(uint16_t) * (2 + vq_size));
115 	/* allocsize2: used ring + pad */
116 	allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
117 	    + sizeof(struct vring_used_elem) * vq_size);
118 
119 	return allocsize1 + allocsize2;
120 }
121 
122 /* Update queue select */
123 void
124 viornd_update_qs(void)
125 {
126 	/* Invalid queue? */
127 	if (viornd.cfg.queue_select > 0) {
128 		viornd.cfg.queue_size = 0;
129 		return;
130 	}
131 
132 	/* Update queue address/size based on queue select */
133 	viornd.cfg.queue_address = viornd.vq[viornd.cfg.queue_select].qa;
134 	viornd.cfg.queue_size = viornd.vq[viornd.cfg.queue_select].qs;
135 }
136 
137 /* Update queue address */
138 void
139 viornd_update_qa(void)
140 {
141 	/* Invalid queue? */
142 	if (viornd.cfg.queue_select > 0)
143 		return;
144 
145 	viornd.vq[viornd.cfg.queue_select].qa = viornd.cfg.queue_address;
146 }
147 
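/*
 * viornd_notifyq
 *
 * Handles a guest notification on the entropy device's single queue: pulls
 * one buffer off the avail ring, fills it with output from arc4random_buf(),
 * records it in the used ring and writes the ring back to guest memory.
 *
 * Returns 1 if the guest should be interrupted, 0 otherwise.
 */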
148 int
149 viornd_notifyq(void)
150 {
151 	uint64_t q_gpa;
152 	uint32_t vr_sz;
153 	size_t sz;
154 	int dxx, ret;
155 	uint16_t aidx, uidx;
156 	char *buf, *rnd_data;
157 	struct vring_desc *desc;
158 	struct vring_avail *avail;
159 	struct vring_used *used;
160 
161 	ret = 0;
162 
163 	/* Invalid queue? */
164 	if (viornd.cfg.queue_notify > 0)
165 		return (0);
166 
167 	vr_sz = vring_size(VIORND_QUEUE_SIZE);
168 	q_gpa = viornd.vq[viornd.cfg.queue_notify].qa;
169 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
170 
171 	buf = calloc(1, vr_sz);
172 	if (buf == NULL) {
173 		log_warn("calloc error getting viornd ring");
174 		return (0);
175 	}
176 
177 	if (read_mem(q_gpa, buf, vr_sz)) {
178 		free(buf);
179 		return (0);
180 	}
181 
182 	desc = (struct vring_desc *)(buf);
183 	avail = (struct vring_avail *)(buf +
184 	    viornd.vq[viornd.cfg.queue_notify].vq_availoffset);
185 	used = (struct vring_used *)(buf +
186 	    viornd.vq[viornd.cfg.queue_notify].vq_usedoffset);
187 
188 	aidx = avail->idx & VIORND_QUEUE_MASK;
189 	uidx = used->idx & VIORND_QUEUE_MASK;
190 
191 	dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;
192 
193 	sz = desc[dxx].len;
194 	if (sz > MAXPHYS)
195 		fatalx("viornd descriptor size too large (%zu)", sz);
196 
197 	rnd_data = malloc(sz);
198 
199 	if (rnd_data != NULL) {
200 		arc4random_buf(rnd_data, sz);
201 		if (write_mem(desc[dxx].addr, rnd_data, sz)) {
202 			log_warnx("viornd: can't write random data @ "
203 			    "0x%llx",
204 			    desc[dxx].addr);
205 		} else {
206 			/* ret == 1 -> interrupt needed */
207 			/* XXX check VIRTIO_F_NO_INTR */
208 			ret = 1;
209 			viornd.cfg.isr_status = 1;
210 			used->ring[uidx].id = dxx;
211 			used->ring[uidx].len = sz;
212 			used->idx++;
213 
214 			if (write_mem(q_gpa, buf, vr_sz)) {
215 				log_warnx("viornd: error writing vio ring");
216 			}
217 		}
218 		free(rnd_data);
219 	} else
220 		fatal("memory allocation error for viornd data");
221 
222 	free(buf);
223 
224 	return (ret);
225 }
226 
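/*
 * virtio_rnd_io
 *
 * PCI I/O handler for the virtio entropy device.  'dir' == 0 indicates a
 * write of 'data' to register 'reg' by the guest, anything else a read.
 * '*intr' is set to 1 when a queue notification requires an interrupt.
 */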
227 int
228 virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
229     void *unused, uint8_t sz)
230 {
231 	*intr = 0xFF;
232 
233 	if (dir == 0) {
234 		switch (reg) {
235 		case VIRTIO_CONFIG_DEVICE_FEATURES:
236 		case VIRTIO_CONFIG_QUEUE_SIZE:
237 		case VIRTIO_CONFIG_ISR_STATUS:
238 			log_warnx("%s: illegal write %x to %s",
239 			    __progname, *data, virtio_reg_name(reg));
240 			break;
241 		case VIRTIO_CONFIG_GUEST_FEATURES:
242 			viornd.cfg.guest_feature = *data;
243 			break;
244 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
245 			viornd.cfg.queue_address = *data;
246 			viornd_update_qa();
247 			break;
248 		case VIRTIO_CONFIG_QUEUE_SELECT:
249 			viornd.cfg.queue_select = *data;
250 			viornd_update_qs();
251 			break;
252 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
253 			viornd.cfg.queue_notify = *data;
254 			if (viornd_notifyq())
255 				*intr = 1;
256 			break;
257 		case VIRTIO_CONFIG_DEVICE_STATUS:
258 			viornd.cfg.device_status = *data;
259 			break;
260 		}
261 	} else {
262 		switch (reg) {
263 		case VIRTIO_CONFIG_DEVICE_FEATURES:
264 			*data = viornd.cfg.device_feature;
265 			break;
266 		case VIRTIO_CONFIG_GUEST_FEATURES:
267 			*data = viornd.cfg.guest_feature;
268 			break;
269 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
270 			*data = viornd.cfg.queue_address;
271 			break;
272 		case VIRTIO_CONFIG_QUEUE_SIZE:
273 			*data = viornd.cfg.queue_size;
274 			break;
275 		case VIRTIO_CONFIG_QUEUE_SELECT:
276 			*data = viornd.cfg.queue_select;
277 			break;
278 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
279 			*data = viornd.cfg.queue_notify;
280 			break;
281 		case VIRTIO_CONFIG_DEVICE_STATUS:
282 			*data = viornd.cfg.device_status;
283 			break;
284 		case VIRTIO_CONFIG_ISR_STATUS:
285 			*data = viornd.cfg.isr_status;
286 			viornd.cfg.isr_status = 0;
287 			vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq);
288 			break;
289 		}
290 	}
291 	return (0);
292 }
293 
294 void
295 vioblk_update_qa(struct vioblk_dev *dev)
296 {
297 	/* Invalid queue? */
298 	if (dev->cfg.queue_select > 0)
299 		return;
300 
301 	dev->vq[dev->cfg.queue_select].qa = dev->cfg.queue_address;
302 }
303 
304 void
305 vioblk_update_qs(struct vioblk_dev *dev)
306 {
307 	/* Invalid queue? */
308 	if (dev->cfg.queue_select > 0) {
309 		dev->cfg.queue_size = 0;
310 		return;
311 	}
312 
313 	/* Update queue address/size based on queue select */
314 	dev->cfg.queue_address = dev->vq[dev->cfg.queue_select].qa;
315 	dev->cfg.queue_size = dev->vq[dev->cfg.queue_select].qs;
316 }
317 
318 static void
319 vioblk_free_info(struct ioinfo *info)
320 {
321 	if (!info)
322 		return;
323 	free(info->buf);
324 	free(info);
325 }
326 
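/*
 * The vioblk_start_*()/vioblk_finish_*() helpers split each disk transfer
 * into two steps: the "start" functions allocate a bounce buffer (and, for
 * writes, copy the data out of guest memory), while the "finish" functions
 * perform the actual pread/pwrite against the disk backing.
 */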
327 static struct ioinfo *
328 vioblk_start_read(struct vioblk_dev *dev, off_t sector, size_t sz)
329 {
330 	struct ioinfo *info;
331 
332 	/* Limit to 64M for now */
333 	if (sz > (1 << 26)) {
334 		log_warnx("%s: read size exceeded 64M", __func__);
335 		return (NULL);
336 	}
337 
338 	info = calloc(1, sizeof(*info));
339 	if (!info)
340 		goto nomem;
341 	info->buf = malloc(sz);
342 	if (info->buf == NULL)
343 		goto nomem;
344 	info->len = sz;
345 	info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
346 	info->file = &dev->file;
347 
348 	return info;
349 
350 nomem:
351 	free(info);
352 	log_warn("malloc error vioblk read");
353 	return (NULL);
354 }
355 
356 
357 static const uint8_t *
358 vioblk_finish_read(struct ioinfo *info)
359 {
360 	struct virtio_backing *file;
361 
362 	file = info->file;
363 	if (file->pread(file->p, info->buf, info->len, info->offset) != info->len) {
364 		info->error = errno;
365 		log_warn("vioblk read error");
366 		return NULL;
367 	}
368 
369 	return info->buf;
370 }
371 
372 static struct ioinfo *
373 vioblk_start_write(struct vioblk_dev *dev, off_t sector,
374     paddr_t addr, size_t len)
375 {
376 	struct ioinfo *info;
377 
378 	/* Limit to 64M for now */
379 	if (len > (1 << 26)) {
380 		log_warnx("%s: write size exceeded 64M", __func__);
381 		return (NULL);
382 	}
383 
384 	info = calloc(1, sizeof(*info));
385 	if (!info)
386 		goto nomem;
387 
388 	info->buf = malloc(len);
389 	if (info->buf == NULL)
390 		goto nomem;
391 	info->len = len;
392 	info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
393 	info->file = &dev->file;
394 
395 	if (read_mem(addr, info->buf, info->len)) {
396 		vioblk_free_info(info);
397 		return NULL;
398 	}
399 
400 	return info;
401 
402 nomem:
403 	free(info);
404 	log_warn("malloc error vioblk write");
405 	return (NULL);
406 }
407 
408 static int
409 vioblk_finish_write(struct ioinfo *info)
410 {
411 	struct virtio_backing *file;
412 
413 	file = info->file;
414 	if (file->pwrite(file->p, info->buf, info->len, info->offset) != info->len) {
415 		log_warn("vioblk write error");
416 		return EIO;
417 	}
418 	return 0;
419 }
420 
421 /*
422  * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can
423  */
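/*
 * vioblk_notifyq
 *
 * Processes all pending requests on the block device's queue: the request
 * header is read from the first descriptor of each chain, the data
 * descriptors are serviced via the vioblk_start_*()/vioblk_finish_*()
 * helpers, and the status byte is written to the final descriptor before
 * the used ring is updated.
 *
 * Returns 1 if the guest should be interrupted, 0 otherwise.
 */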
424 int
425 vioblk_notifyq(struct vioblk_dev *dev)
426 {
427 	uint64_t q_gpa;
428 	uint32_t vr_sz;
429 	uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx;
430 	uint8_t ds;
431 	int cnt, ret;
432 	off_t secbias;
433 	char *vr;
434 	struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc;
435 	struct vring_avail *avail;
436 	struct vring_used *used;
437 	struct virtio_blk_req_hdr cmd;
438 
439 	ret = 0;
440 
441 	/* Invalid queue? */
442 	if (dev->cfg.queue_notify > 0)
443 		return (0);
444 
445 	vr_sz = vring_size(VIOBLK_QUEUE_SIZE);
446 	q_gpa = dev->vq[dev->cfg.queue_notify].qa;
447 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
448 
449 	vr = calloc(1, vr_sz);
450 	if (vr == NULL) {
451 		log_warn("calloc error getting vioblk ring");
452 		return (0);
453 	}
454 
455 	if (read_mem(q_gpa, vr, vr_sz)) {
456 		log_warnx("error reading gpa 0x%llx", q_gpa);
457 		goto out;
458 	}
459 
460 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
461 	desc = (struct vring_desc *)(vr);
462 	avail = (struct vring_avail *)(vr +
463 	    dev->vq[dev->cfg.queue_notify].vq_availoffset);
464 	used = (struct vring_used *)(vr +
465 	    dev->vq[dev->cfg.queue_notify].vq_usedoffset);
466 
467 	idx = dev->vq[dev->cfg.queue_notify].last_avail & VIOBLK_QUEUE_MASK;
468 
469 	if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) {
470 		log_warnx("vioblk queue notify - nothing to do?");
471 		goto out;
472 	}
473 
474 	while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) {
475 
476 		cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK;
477 		cmd_desc = &desc[cmd_desc_idx];
478 
479 		if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) {
480 			log_warnx("unchained vioblk cmd descriptor received "
481 			    "(idx %d)", cmd_desc_idx);
482 			goto out;
483 		}
484 
485 		/* Read command from descriptor ring */
486 		if (cmd_desc->flags & VRING_DESC_F_WRITE) {
487 			log_warnx("vioblk: unexpected writable cmd descriptor "
488 			    "%d", cmd_desc_idx);
489 			goto out;
490 		}
491 		if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) {
492 			log_warnx("vioblk: command read_mem error @ 0x%llx",
493 			    cmd_desc->addr);
494 			goto out;
495 		}
496 
497 		switch (cmd.type) {
498 		case VIRTIO_BLK_T_IN:
499 			/* first descriptor */
500 			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
501 			secdata_desc = &desc[secdata_desc_idx];
502 
503 			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
504 				log_warnx("unchained vioblk data descriptor "
505 				    "received (idx %d)", cmd_desc_idx);
506 				goto out;
507 			}
508 
509 			cnt = 0;
510 			secbias = 0;
511 			do {
512 				struct ioinfo *info;
513 				const uint8_t *secdata;
514 
515 				if ((secdata_desc->flags & VRING_DESC_F_WRITE)
516 				    == 0) {
517 					log_warnx("vioblk: unwritable data "
518 					    "descriptor %d", secdata_desc_idx);
519 					goto out;
520 				}
521 
522 				info = vioblk_start_read(dev,
523 				    cmd.sector + secbias, secdata_desc->len);
524 
525 				if (info == NULL) {
526 					log_warnx("vioblk: can't start read");
527 					goto out;
528 				}
529 
530 				/* read the data, use current data descriptor */
531 				secdata = vioblk_finish_read(info);
532 				if (secdata == NULL) {
533 					vioblk_free_info(info);
534 					log_warnx("vioblk: block read error, "
535 					    "sector %lld", cmd.sector);
536 					goto out;
537 				}
538 
539 				if (write_mem(secdata_desc->addr, secdata,
540 					secdata_desc->len)) {
541 					log_warnx("can't write sector "
542 					    "data to gpa @ 0x%llx",
543 					    secdata_desc->addr);
544 					vioblk_free_info(info);
545 					goto out;
546 				}
547 
548 				vioblk_free_info(info);
549 
550 				secbias += (secdata_desc->len /
551 				    VIRTIO_BLK_SECTOR_SIZE);
552 				secdata_desc_idx = secdata_desc->next &
553 				    VIOBLK_QUEUE_MASK;
554 				secdata_desc = &desc[secdata_desc_idx];
555 
556 				/* Guard against infinite chains */
557 				if (++cnt >= VIOBLK_QUEUE_SIZE) {
558 					log_warnx("%s: descriptor table "
559 					    "invalid", __func__);
560 					goto out;
561 				}
562 			} while (secdata_desc->flags & VRING_DESC_F_NEXT);
563 
564 			ds_desc_idx = secdata_desc_idx;
565 			ds_desc = secdata_desc;
566 
567 			ds = VIRTIO_BLK_S_OK;
568 			break;
569 		case VIRTIO_BLK_T_OUT:
570 			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
571 			secdata_desc = &desc[secdata_desc_idx];
572 
573 			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
574 				log_warnx("wr vioblk: unchained vioblk data "
575 				    "descriptor received (idx %d)",
576 				    cmd_desc_idx);
577 				goto out;
578 			}
579 
580 			if (secdata_desc->len > dev->max_xfer) {
581 				log_warnx("%s: invalid read size %d requested",
582 				    __func__, secdata_desc->len);
583 				goto out;
584 			}
585 
586 			cnt = 0;
587 			secbias = 0;
588 			do {
589 				struct ioinfo *info;
590 
591 				if (secdata_desc->flags & VRING_DESC_F_WRITE) {
592 					log_warnx("wr vioblk: unexpected "
593 					    "writable data descriptor %d",
594 					    secdata_desc_idx);
595 					goto out;
596 				}
597 
598 				info = vioblk_start_write(dev,
599 				    cmd.sector + secbias,
600 				    secdata_desc->addr, secdata_desc->len);
601 
602 				if (info == NULL) {
603 					log_warnx("wr vioblk: can't read "
604 					    "sector data @ 0x%llx",
605 					    secdata_desc->addr);
606 					goto out;
607 				}
608 
609 				if (vioblk_finish_write(info)) {
610 					log_warnx("wr vioblk: disk write "
611 					    "error");
612 					vioblk_free_info(info);
613 					goto out;
614 				}
615 
616 				vioblk_free_info(info);
617 
618 				secbias += secdata_desc->len /
619 				    VIRTIO_BLK_SECTOR_SIZE;
620 
621 				secdata_desc_idx = secdata_desc->next &
622 				    VIOBLK_QUEUE_MASK;
623 				secdata_desc = &desc[secdata_desc_idx];
624 
625 				/* Guard against infinite chains */
626 				if (++cnt >= VIOBLK_QUEUE_SIZE) {
627 					log_warnx("%s: descriptor table "
628 					    "invalid", __func__);
629 					goto out;
630 				}
631 			} while (secdata_desc->flags & VRING_DESC_F_NEXT);
632 
633 			ds_desc_idx = secdata_desc_idx;
634 			ds_desc = secdata_desc;
635 
636 			ds = VIRTIO_BLK_S_OK;
637 			break;
638 		case VIRTIO_BLK_T_FLUSH:
639 		case VIRTIO_BLK_T_FLUSH_OUT:
640 			ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
641 			ds_desc = &desc[ds_desc_idx];
642 
643 			ds = VIRTIO_BLK_S_UNSUPP;
644 			break;
645 		case VIRTIO_BLK_T_GET_ID:
646 			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
647 			secdata_desc = &desc[secdata_desc_idx];
648 
649 			/*
650 			 * We don't support this command yet. While it's not
651 			 * officially part of the virtio spec (will be in v1.2)
652 			 * there's no feature to negotiate. Linux drivers will
653 			 * often send this command regardless.
654 			 *
655 			 * When the command is received, it should appear as a
656 			 * chain of 3 descriptors, similar to the IN/OUT
657 			 * commands. The middle descriptor should have a
658 			 * length of VIRTIO_BLK_ID_BYTES bytes.
659 			 */
660 			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
661 				log_warnx("id vioblk: unchained vioblk data "
662 				    "descriptor received (idx %d)",
663 				    cmd_desc_idx);
664 				goto out;
665 			}
666 
667 			/* Skip the data descriptor. */
668 			ds_desc_idx = secdata_desc->next & VIOBLK_QUEUE_MASK;
669 			ds_desc = &desc[ds_desc_idx];
670 
671 			ds = VIRTIO_BLK_S_UNSUPP;
672 			break;
673 		default:
674 			log_warnx("%s: unsupported command 0x%x", __func__,
675 			    cmd.type);
676 			ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
677 			ds_desc = &desc[ds_desc_idx];
678 
679 			ds = VIRTIO_BLK_S_UNSUPP;
680 			break;
681 		}
682 
683 		if ((ds_desc->flags & VRING_DESC_F_WRITE) == 0) {
684 			log_warnx("%s: ds descriptor %d unwritable", __func__,
685 			    ds_desc_idx);
686 			goto out;
687 		}
688 		if (write_mem(ds_desc->addr, &ds, sizeof(ds))) {
689 			log_warnx("%s: can't write device status data @ 0x%llx",
690 			    __func__, ds_desc->addr);
691 			goto out;
692 		}
693 
694 		ret = 1;
695 		dev->cfg.isr_status = 1;
696 		used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx;
697 		used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len;
698 		used->idx++;
699 
700 		dev->vq[dev->cfg.queue_notify].last_avail = avail->idx &
701 		    VIOBLK_QUEUE_MASK;
702 		if (write_mem(q_gpa, vr, vr_sz))
703 			log_warnx("%s: error writing vio ring", __func__);
704 
705 		idx = (idx + 1) & VIOBLK_QUEUE_MASK;
706 	}
707 out:
708 	free(vr);
709 	return (ret);
710 }
711 
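/*
 * virtio_blk_io
 *
 * PCI I/O handler for a virtio block device.  Reads of the device-specific
 * config space return the capacity in 512-byte sectors (dev->sz, bytes 0-7)
 * and the maximum transfer size (dev->max_xfer, bytes 8-11), assembled
 * byte- or word-wise according to the access size 'sz'.
 */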
712 int
713 virtio_blk_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
714     void *cookie, uint8_t sz)
715 {
716 	struct vioblk_dev *dev = (struct vioblk_dev *)cookie;
717 
718 	*intr = 0xFF;
719 
720 
721 	if (dir == 0) {
722 		switch (reg) {
723 		case VIRTIO_CONFIG_DEVICE_FEATURES:
724 		case VIRTIO_CONFIG_QUEUE_SIZE:
725 		case VIRTIO_CONFIG_ISR_STATUS:
726 			log_warnx("%s: illegal write %x to %s",
727 			    __progname, *data, virtio_reg_name(reg));
728 			break;
729 		case VIRTIO_CONFIG_GUEST_FEATURES:
730 			dev->cfg.guest_feature = *data;
731 			break;
732 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
733 			dev->cfg.queue_address = *data;
734 			vioblk_update_qa(dev);
735 			break;
736 		case VIRTIO_CONFIG_QUEUE_SELECT:
737 			dev->cfg.queue_select = *data;
738 			vioblk_update_qs(dev);
739 			break;
740 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
741 			dev->cfg.queue_notify = *data;
742 			if (vioblk_notifyq(dev))
743 				*intr = 1;
744 			break;
745 		case VIRTIO_CONFIG_DEVICE_STATUS:
746 			dev->cfg.device_status = *data;
747 			if (dev->cfg.device_status == 0) {
748 				log_debug("%s: device reset", __func__);
749 				dev->cfg.guest_feature = 0;
750 				dev->cfg.queue_address = 0;
751 				vioblk_update_qa(dev);
752 				dev->cfg.queue_size = 0;
753 				vioblk_update_qs(dev);
754 				dev->cfg.queue_select = 0;
755 				dev->cfg.queue_notify = 0;
756 				dev->cfg.isr_status = 0;
757 				dev->vq[0].last_avail = 0;
758 				vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
759 			}
760 			break;
761 		default:
762 			break;
763 		}
764 	} else {
765 		switch (reg) {
766 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
767 			switch (sz) {
768 			case 4:
769 				*data = (uint32_t)(dev->sz);
770 				break;
771 			case 2:
772 				*data &= 0xFFFF0000;
773 				*data |= (uint32_t)(dev->sz) & 0xFFFF;
774 				break;
775 			case 1:
776 				*data &= 0xFFFFFF00;
777 				*data |= (uint32_t)(dev->sz) & 0xFF;
778 				break;
779 			}
780 			/* XXX handle invalid sz */
781 			break;
782 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
783 			if (sz == 1) {
784 				*data &= 0xFFFFFF00;
785 				*data |= (uint32_t)(dev->sz >> 8) & 0xFF;
786 			}
787 			/* XXX handle invalid sz */
788 			break;
789 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
790 			if (sz == 1) {
791 				*data &= 0xFFFFFF00;
792 				*data |= (uint32_t)(dev->sz >> 16) & 0xFF;
793 			} else if (sz == 2) {
794 				*data &= 0xFFFF0000;
795 				*data |= (uint32_t)(dev->sz >> 16) & 0xFFFF;
796 			}
797 			/* XXX handle invalid sz */
798 			break;
799 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
800 			if (sz == 1) {
801 				*data &= 0xFFFFFF00;
802 				*data |= (uint32_t)(dev->sz >> 24) & 0xFF;
803 			}
804 			/* XXX handle invalid sz */
805 			break;
806 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
807 			switch (sz) {
808 			case 4:
809 				*data = (uint32_t)(dev->sz >> 32);
810 				break;
811 			case 2:
812 				*data &= 0xFFFF0000;
813 				*data |= (uint32_t)(dev->sz >> 32) & 0xFFFF;
814 				break;
815 			case 1:
816 				*data &= 0xFFFFFF00;
817 				*data |= (uint32_t)(dev->sz >> 32) & 0xFF;
818 				break;
819 			}
820 			/* XXX handle invalid sz */
821 			break;
822 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
823 			if (sz == 1) {
824 				*data &= 0xFFFFFF00;
825 				*data |= (uint32_t)(dev->sz >> 40) & 0xFF;
826 			}
827 			/* XXX handle invalid sz */
828 			break;
829 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6:
830 			if (sz == 1) {
831 				*data &= 0xFFFFFF00;
832 				*data |= (uint32_t)(dev->sz >> 48) & 0xFF;
833 			} else if (sz == 2) {
834 				*data &= 0xFFFF0000;
835 				*data |= (uint32_t)(dev->sz >> 48) & 0xFFFF;
836 			}
837 			/* XXX handle invalid sz */
838 			break;
839 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7:
840 			if (sz == 1) {
841 				*data &= 0xFFFFFF00;
842 				*data |= (uint32_t)(dev->sz >> 56) & 0xFF;
843 			}
844 			/* XXX handle invalid sz */
845 			break;
846 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
847 			switch (sz) {
848 			case 4:
849 				*data = (uint32_t)(dev->max_xfer);
850 				break;
851 			case 2:
852 				*data &= 0xFFFF0000;
853 				*data |= (uint32_t)(dev->max_xfer) & 0xFFFF;
854 				break;
855 			case 1:
856 				*data &= 0xFFFFFF00;
857 				*data |= (uint32_t)(dev->max_xfer) & 0xFF;
858 				break;
859 			}
860 			/* XXX handle invalid sz */
861 			break;
862 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 9:
863 			if (sz == 1) {
864 				*data &= 0xFFFFFF00;
865 				*data |= (uint32_t)(dev->max_xfer >> 8) & 0xFF;
866 			}
867 			/* XXX handle invalid sz */
868 			break;
869 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10:
870 			if (sz == 1) {
871 				*data &= 0xFFFFFF00;
872 				*data |= (uint32_t)(dev->max_xfer >> 16) & 0xFF;
873 			} else if (sz == 2) {
874 				*data &= 0xFFFF0000;
875 				*data |= (uint32_t)(dev->max_xfer >> 16)
876 				    & 0xFFFF;
877 			}
878 			/* XXX handle invalid sz */
879 			break;
880 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11:
881 			if (sz == 1) {
882 				*data &= 0xFFFFFF00;
883 				*data |= (uint32_t)(dev->max_xfer >> 24) & 0xFF;
884 			}
885 			/* XXX handle invalid sz */
886 			break;
887 		case VIRTIO_CONFIG_DEVICE_FEATURES:
888 			*data = dev->cfg.device_feature;
889 			break;
890 		case VIRTIO_CONFIG_GUEST_FEATURES:
891 			*data = dev->cfg.guest_feature;
892 			break;
893 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
894 			*data = dev->cfg.queue_address;
895 			break;
896 		case VIRTIO_CONFIG_QUEUE_SIZE:
897 			if (sz == 4)
898 				*data = dev->cfg.queue_size;
899 			else if (sz == 2) {
900 				*data &= 0xFFFF0000;
901 				*data |= (uint16_t)dev->cfg.queue_size;
902 			} else if (sz == 1) {
903 				*data &= 0xFFFFFF00;
904 				*data |= (uint8_t)dev->cfg.queue_size;
905 			}
906 			break;
907 		case VIRTIO_CONFIG_QUEUE_SELECT:
908 			*data = dev->cfg.queue_select;
909 			break;
910 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
911 			*data = dev->cfg.queue_notify;
912 			break;
913 		case VIRTIO_CONFIG_DEVICE_STATUS:
914 			if (sz == 4)
915 				*data = dev->cfg.device_status;
916 			else if (sz == 2) {
917 				*data &= 0xFFFF0000;
918 				*data |= (uint16_t)dev->cfg.device_status;
919 			} else if (sz == 1) {
920 				*data &= 0xFFFFFF00;
921 				*data |= (uint8_t)dev->cfg.device_status;
922 			}
923 			break;
924 		case VIRTIO_CONFIG_ISR_STATUS:
925 			*data = dev->cfg.isr_status;
926 			dev->cfg.isr_status = 0;
927 			vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
928 			break;
929 		}
930 	}
931 	return (0);
932 }
933 
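/*
 * virtio_net_io
 *
 * PCI I/O handler for a virtio network device.  Bytes 0-5 of the
 * device-specific config space expose the MAC address (VIRTIO_NET_F_MAC).
 * All access to the device state is serialized via dev->mutex.
 */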
934 int
935 virtio_net_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
936     void *cookie, uint8_t sz)
937 {
938 	struct vionet_dev *dev = (struct vionet_dev *)cookie;
939 
940 	*intr = 0xFF;
941 	mutex_lock(&dev->mutex);
942 
943 	if (dir == 0) {
944 		switch (reg) {
945 		case VIRTIO_CONFIG_DEVICE_FEATURES:
946 		case VIRTIO_CONFIG_QUEUE_SIZE:
947 		case VIRTIO_CONFIG_ISR_STATUS:
948 			log_warnx("%s: illegal write %x to %s",
949 			    __progname, *data, virtio_reg_name(reg));
950 			break;
951 		case VIRTIO_CONFIG_GUEST_FEATURES:
952 			dev->cfg.guest_feature = *data;
953 			break;
954 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
955 			dev->cfg.queue_address = *data;
956 			vionet_update_qa(dev);
957 			break;
958 		case VIRTIO_CONFIG_QUEUE_SELECT:
959 			dev->cfg.queue_select = *data;
960 			vionet_update_qs(dev);
961 			break;
962 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
963 			dev->cfg.queue_notify = *data;
964 			if (vionet_notifyq(dev))
965 				*intr = 1;
966 			break;
967 		case VIRTIO_CONFIG_DEVICE_STATUS:
968 			dev->cfg.device_status = *data;
969 			if (dev->cfg.device_status == 0) {
970 				log_debug("%s: device reset", __func__);
971 				dev->cfg.guest_feature = 0;
972 				dev->cfg.queue_address = 0;
973 				vionet_update_qa(dev);
974 				dev->cfg.queue_size = 0;
975 				vionet_update_qs(dev);
976 				dev->cfg.queue_select = 0;
977 				dev->cfg.queue_notify = 0;
978 				dev->cfg.isr_status = 0;
979 				dev->vq[RXQ].last_avail = 0;
980 				dev->vq[RXQ].notified_avail = 0;
981 				dev->vq[TXQ].last_avail = 0;
982 				dev->vq[TXQ].notified_avail = 0;
983 				vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
984 			}
985 			break;
986 		default:
987 			break;
988 		}
989 	} else {
990 		switch (reg) {
991 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
992 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
993 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
994 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
995 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
996 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
997 			*data = dev->mac[reg -
998 			    VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
999 			break;
1000 		case VIRTIO_CONFIG_DEVICE_FEATURES:
1001 			*data = dev->cfg.device_feature;
1002 			break;
1003 		case VIRTIO_CONFIG_GUEST_FEATURES:
1004 			*data = dev->cfg.guest_feature;
1005 			break;
1006 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
1007 			*data = dev->cfg.queue_address;
1008 			break;
1009 		case VIRTIO_CONFIG_QUEUE_SIZE:
1010 			*data = dev->cfg.queue_size;
1011 			break;
1012 		case VIRTIO_CONFIG_QUEUE_SELECT:
1013 			*data = dev->cfg.queue_select;
1014 			break;
1015 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1016 			*data = dev->cfg.queue_notify;
1017 			break;
1018 		case VIRTIO_CONFIG_DEVICE_STATUS:
1019 			*data = dev->cfg.device_status;
1020 			break;
1021 		case VIRTIO_CONFIG_ISR_STATUS:
1022 			*data = dev->cfg.isr_status;
1023 			dev->cfg.isr_status = 0;
1024 			vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
1025 			break;
1026 		}
1027 	}
1028 
1029 	mutex_unlock(&dev->mutex);
1030 	return (0);
1031 }
1032 
1033 /*
1034  * Must be called with dev->mutex acquired.
1035  */
1036 void
1037 vionet_update_qa(struct vionet_dev *dev)
1038 {
1039 	/* Invalid queue? */
1040 	if (dev->cfg.queue_select > 1)
1041 		return;
1042 
1043 	dev->vq[dev->cfg.queue_select].qa = dev->cfg.queue_address;
1044 }
1045 
1046 /*
1047  * Must be called with dev->mutex acquired.
1048  */
1049 void
1050 vionet_update_qs(struct vionet_dev *dev)
1051 {
1052 	/* Invalid queue? */
1053 	if (dev->cfg.queue_select > 1) {
1054 		dev->cfg.queue_size = 0;
1055 		return;
1056 	}
1057 
1058 	/* Update queue address/size based on queue select */
1059 	dev->cfg.queue_address = dev->vq[dev->cfg.queue_select].qa;
1060 	dev->cfg.queue_size = dev->vq[dev->cfg.queue_select].qs;
1061 }
1062 
1063 /*
1064  * vionet_enq_rx
1065  *
1066  * Take a given packet from the host-side tap and copy it into the guest's
1067  * buffers utilizing the rx virtio ring. If the packet length is invalid
1068  * (too small or too large) or if there are not enough buffers available,
1069  * the packet is dropped.
1070  *
1071  * Must be called with dev->mutex acquired.
1072  */
1073 int
1074 vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc)
1075 {
1076 	uint64_t q_gpa;
1077 	uint32_t vr_sz;
1078 	uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx;
1079 	int ret = 0;
1080 	char *vr = NULL;
1081 	size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0;
1082 	size_t chain_len = 0;
1083 	struct vring_desc *desc, *pkt_desc, *hdr_desc;
1084 	struct vring_avail *avail;
1085 	struct vring_used *used;
1086 	struct vring_used_elem *ue;
1087 	struct virtio_net_hdr hdr;
1088 	size_t hdr_sz;
1089 
1090 	if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) {
1091 		log_warnx("%s: invalid packet size", __func__);
1092 		return (0);
1093 	}
1094 
1095 	hdr_sz = sizeof(hdr);
1096 
1097 	if (!(dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK))
1098 		return ret;
1099 
1100 	vr_sz = vring_size(VIONET_QUEUE_SIZE);
1101 	q_gpa = dev->vq[RXQ].qa;
1102 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
1103 
1104 	vr = calloc(1, vr_sz);
1105 	if (vr == NULL) {
1106 		log_warn("rx enq: calloc error getting vionet ring");
1107 		return (0);
1108 	}
1109 
1110 	if (read_mem(q_gpa, vr, vr_sz)) {
1111 		log_warnx("rx enq: error reading gpa 0x%llx", q_gpa);
1112 		goto out;
1113 	}
1114 
1115 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
1116 	desc = (struct vring_desc *)(vr);
1117 	avail = (struct vring_avail *)(vr + dev->vq[RXQ].vq_availoffset);
1118 	used = (struct vring_used *)(vr + dev->vq[RXQ].vq_usedoffset);
1119 
1120 	idx = dev->vq[RXQ].last_avail & VIONET_QUEUE_MASK;
1121 	if ((dev->vq[RXQ].notified_avail & VIONET_QUEUE_MASK) == idx) {
1122 		log_debug("%s: insufficient available buffer capacity, "
1123 		    "dropping packet.", __func__);
1124 		goto out;
1125 	}
1126 
1127 	hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
1128 	hdr_desc = &desc[hdr_desc_idx];
1129 
1130 	dxx = hdr_desc_idx;
1131 	chain_hdr_idx = dxx;
1132 	chain_len = 0;
1133 
1134 	/* Process the descriptor and walk any potential chain. */
1135 	do {
1136 		off = 0;
1137 		pkt_desc = &desc[dxx];
1138 		if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) {
1139 			log_warnx("%s: invalid descriptor, not writable",
1140 			    __func__);
1141 			goto out;
1142 		}
1143 
1144 		/* How much data do we get to write? */
1145 		if (sz - bufsz > pkt_desc->len)
1146 			chunk_size = pkt_desc->len;
1147 		else
1148 			chunk_size = sz - bufsz;
1149 
1150 		if (chain_len == 0) {
1151 			off = hdr_sz;
1152 			if (chunk_size == pkt_desc->len)
1153 				chunk_size -= off;
1154 		}
1155 
1156 		/* Write a chunk of data if we need to */
1157 		if (chunk_size && write_mem(pkt_desc->addr + off,
1158 			pkt + pkt_offset, chunk_size)) {
1159 			log_warnx("%s: failed to write to buffer 0x%llx",
1160 			    __func__, pkt_desc->addr);
1161 			goto out;
1162 		}
1163 
1164 		chain_len += chunk_size + off;
1165 		bufsz += chunk_size;
1166 		pkt_offset += chunk_size;
1167 
1168 		dxx = pkt_desc->next & VIONET_QUEUE_MASK;
1169 	} while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT);
1170 
1171 	/* Update the list of used buffers. */
1172 	ue = &used->ring[(used->idx) & VIONET_QUEUE_MASK];
1173 	ue->id = chain_hdr_idx;
1174 	ue->len = chain_len;
1175 	off = ((char *)ue - vr);
1176 	if (write_mem(q_gpa + off, ue, sizeof(*ue))) {
1177 		log_warnx("%s: error updating rx used ring", __func__);
1178 		goto out;
1179 	}
1180 
1181 	/* Move our marker in the ring...*/
1182 	used->idx++;
1183 	dev->vq[RXQ].last_avail = (dev->vq[RXQ].last_avail + 1) &
1184 	    VIONET_QUEUE_MASK;
1185 
1186 	/* Prepend the virtio net header in the first buffer. */
1187 	memset(&hdr, 0, sizeof(hdr));
1188 	hdr.hdr_len = hdr_sz;
1189 	if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) {
1190 		log_warnx("vionet: rx enq header write_mem error @ 0x%llx",
1191 		    hdr_desc->addr);
1192 		goto out;
1193 	}
1194 
1195 	/* Update the index field in the used ring. This must be done last. */
1196 	dev->cfg.isr_status = 1;
1197 	off = (char *)&used->idx - vr;
1198 	*spc = (dev->vq[RXQ].notified_avail - dev->vq[RXQ].last_avail) &
1199 	    VIONET_QUEUE_MASK;
1200 
1201 	if (write_mem(q_gpa + off, &used->idx, sizeof(used->idx)))
1202 		log_warnx("vionet: error writing vio ring");
1203 
1204 	ret = 1;
1205 
1206 out:
1207 	free(vr);
1208 	return (ret);
1209 }
1210 
1211 /*
1212  * vionet_rx
1213  *
1214  * Enqueue data that was received on a tap file descriptor
1215  * to the vionet device queue.
1216  *
1217  * Must be called with dev->mutex acquired.
1218  */
1219 static int
1220 vionet_rx(struct vionet_dev *dev)
1221 {
1222 	char buf[PAGE_SIZE];
1223 	int num_enq = 0, spc = 0;
1224 	struct ether_header *eh;
1225 	ssize_t sz;
1226 
1227 	do {
1228 		sz = read(dev->fd, buf, sizeof(buf));
1229 		if (sz == -1) {
1230 			/*
1231 			 * If we get EAGAIN, no data is currently available;
1232 			 * do not treat this as an error.
1233 			 */
1234 			if (errno != EAGAIN)
1235 				log_warn("unexpected read error on vionet "
1236 				    "device");
1237 		} else if (sz > 0) {
1238 			eh = (struct ether_header *)buf;
1239 			if (!dev->lockedmac ||
1240 			    ETHER_IS_MULTICAST(eh->ether_dhost) ||
1241 			    memcmp(eh->ether_dhost, dev->mac,
1242 			    sizeof(eh->ether_dhost)) == 0)
1243 				num_enq += vionet_enq_rx(dev, buf, sz, &spc);
1244 		} else if (sz == 0) {
1245 			log_debug("process_rx: no data");
1246 			break;
1247 		}
1248 	} while (spc > 0 && sz > 0);
1249 
1250 	return (num_enq);
1251 }
1252 
1253 /*
1254  * vionet_rx_event
1255  *
1256  * Called from the event handling thread when new data can be
1257  * received on the tap fd of a vionet device.
1258  */
1259 static void
1260 vionet_rx_event(int fd, short kind, void *arg)
1261 {
1262 	struct vionet_dev *dev = arg;
1263 
1264 	mutex_lock(&dev->mutex);
1265 
1266 	if (vionet_rx(dev) > 0) {
1267 		/* XXX: vcpu_id */
1268 		vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq);
1269 	}
1270 
1271 	mutex_unlock(&dev->mutex);
1272 }
1273 
1274 /*
1275  * Must be called with dev->mutex acquired.
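 *
 * Records how far the guest has filled the rx avail ring so that
 * vionet_enq_rx() knows how many receive buffers it may consume.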
1276  */
1277 void
1278 vionet_notify_rx(struct vionet_dev *dev)
1279 {
1280 	uint64_t q_gpa;
1281 	uint32_t vr_sz;
1282 	char *vr;
1283 	struct vring_avail *avail;
1284 
1285 	vr_sz = vring_size(VIONET_QUEUE_SIZE);
1286 	q_gpa = dev->vq[RXQ].qa;
1287 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
1288 
1289 	vr = malloc(vr_sz);
1290 	if (vr == NULL) {
1291 		log_warn("malloc error getting vionet ring");
1292 		return;
1293 	}
1294 
1295 	if (read_mem(q_gpa, vr, vr_sz)) {
1296 		log_warnx("error reading gpa 0x%llx", q_gpa);
1297 		free(vr);
1298 		return;
1299 	}
1300 
1301 	/* Compute offset into avail ring */
1302 	avail = (struct vring_avail *)(vr + dev->vq[RXQ].vq_availoffset);
1303 
1304 	dev->vq[RXQ].notified_avail = avail->idx - 1;
1305 
1306 	free(vr);
1307 }
1308 
1309 /*
1310  * Must be called with dev->mutex acquired.
1311  */
1312 int
1313 vionet_notifyq(struct vionet_dev *dev)
1314 {
1315 	int ret;
1316 
1317 	switch (dev->cfg.queue_notify) {
1318 	case RXQ:
1319 		vionet_notify_rx(dev);
1320 		ret = 0;
1321 		break;
1322 	case TXQ:
1323 		ret = vionet_notify_tx(dev);
1324 		break;
1325 	default:
1326 		/*
1327 		 * Catch the unimplemented queue ID 2 (control queue) as
1328 		 * well as any bogus queue IDs.
1329 		 */
1330 		log_debug("%s: notify for unimplemented queue ID %d",
1331 		    __func__, dev->cfg.queue_notify);
1332 		ret = 0;
1333 		break;
1334 	}
1335 
1336 	return (ret);
1337 }
1338 
1339 /*
1340  * Must be called with dev->mutex acquired.
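 *
 * Drains the tx avail ring: each descriptor chain is validated, the packet
 * is copied out of guest memory, optionally filtered (locked MAC) or
 * answered locally (DHCP) and written to the tap(4) fd, after which the
 * used ring is updated.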
1341  */
1342 int
1343 vionet_notify_tx(struct vionet_dev *dev)
1344 {
1345 	uint64_t q_gpa;
1346 	uint32_t vr_sz;
1347 	uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx, cnt;
1348 	size_t pktsz, chunk_size = 0;
1349 	ssize_t dhcpsz;
1350 	int ret, num_enq, ofs, spc;
1351 	char *vr, *pkt, *dhcppkt;
1352 	struct vring_desc *desc, *pkt_desc, *hdr_desc;
1353 	struct vring_avail *avail;
1354 	struct vring_used *used;
1355 	struct ether_header *eh;
1356 
1357 	dhcpsz = 0;
1358 	vr = pkt = dhcppkt = NULL;
1359 	ret = spc = 0;
1360 
1361 	vr_sz = vring_size(VIONET_QUEUE_SIZE);
1362 	q_gpa = dev->vq[TXQ].qa;
1363 	q_gpa = q_gpa * VIRTIO_PAGE_SIZE;
1364 
1365 	vr = calloc(1, vr_sz);
1366 	if (vr == NULL) {
1367 		log_warn("calloc error getting vionet ring");
1368 		goto out;
1369 	}
1370 
1371 	if (read_mem(q_gpa, vr, vr_sz)) {
1372 		log_warnx("error reading gpa 0x%llx", q_gpa);
1373 		goto out;
1374 	}
1375 
1376 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
1377 	desc = (struct vring_desc *)(vr);
1378 	avail = (struct vring_avail *)(vr + dev->vq[TXQ].vq_availoffset);
1379 	used = (struct vring_used *)(vr + dev->vq[TXQ].vq_usedoffset);
1380 
1381 	num_enq = 0;
1382 
1383 	idx = dev->vq[TXQ].last_avail & VIONET_QUEUE_MASK;
1384 
1385 	if ((avail->idx & VIONET_QUEUE_MASK) == idx) {
1386 		log_warnx("vionet tx queue notify - nothing to do?");
1387 		goto out;
1388 	}
1389 
1390 	while ((avail->idx & VIONET_QUEUE_MASK) != idx) {
1391 		hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
1392 		hdr_desc = &desc[hdr_desc_idx];
1393 		pktsz = 0;
1394 
1395 		cnt = 0;
1396 		dxx = hdr_desc_idx;
1397 		do {
1398 			pktsz += desc[dxx].len;
1399 			dxx = desc[dxx].next & VIONET_QUEUE_MASK;
1400 
1401 			/*
1402 			 * Virtio 1.0, cs04, section 2.4.5:
1403 			 *  "The number of descriptors in the table is defined
1404 			 *   by the queue size for this virtqueue: this is the
1405 			 *   maximum possible descriptor chain length."
1406 			 */
1407 			if (++cnt >= VIONET_QUEUE_SIZE) {
1408 				log_warnx("%s: descriptor table invalid",
1409 				    __func__);
1410 				goto out;
1411 			}
1412 		} while (desc[dxx].flags & VRING_DESC_F_NEXT);
1413 
1414 		pktsz += desc[dxx].len;
1415 
1416 		/* Remove virtio header descriptor len */
1417 		pktsz -= hdr_desc->len;
1418 
1419 		/* Drop packets violating device MTU-based limits */
1420 		if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) {
1421 			log_warnx("%s: invalid packet size %zu", __func__,
1422 			    pktsz);
1423 			goto drop_packet;
1424 		}
1425 		pkt = malloc(pktsz);
1426 		if (pkt == NULL) {
1427 			log_warn("malloc error alloc packet buf");
1428 			goto out;
1429 		}
1430 
1431 		ofs = 0;
1432 		pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK;
1433 		pkt_desc = &desc[pkt_desc_idx];
1434 
1435 		while (pkt_desc->flags & VRING_DESC_F_NEXT) {
1436 			/* must be not writable */
1437 			if (pkt_desc->flags & VRING_DESC_F_WRITE) {
1438 				log_warnx("unexpected writable tx desc "
1439 				    "%d", pkt_desc_idx);
1440 				goto out;
1441 			}
1442 
1443 			/* Check we don't read beyond allocated pktsz */
1444 			if (pkt_desc->len > pktsz - ofs) {
1445 				log_warnx("%s: descriptor len past pkt len",
1446 				    __func__);
1447 				chunk_size = pktsz - ofs;
1448 			} else
1449 				chunk_size = pkt_desc->len;
1450 
1451 			/* Read packet from descriptor ring */
1452 			if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
1453 				log_warnx("vionet: packet read_mem error "
1454 				    "@ 0x%llx", pkt_desc->addr);
1455 				goto out;
1456 			}
1457 
1458 			ofs += pkt_desc->len;
1459 			pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK;
1460 			pkt_desc = &desc[pkt_desc_idx];
1461 		}
1462 
1463 		/* Now handle tail descriptor - must be not writable */
1464 		if (pkt_desc->flags & VRING_DESC_F_WRITE) {
1465 			log_warnx("unexpected writable tx descriptor %d",
1466 			    pkt_desc_idx);
1467 			goto out;
1468 		}
1469 
1470 		/* Check we don't read beyond allocated pktsz */
1471 		if (pkt_desc->len > pktsz - ofs) {
1472 			log_warnx("%s: descriptor len past pkt len", __func__);
1473 			chunk_size = pktsz - ofs;
1474 		} else
1475 			chunk_size = pkt_desc->len;
1476 
1477 		/* Read packet from descriptor ring */
1478 		if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
1479 			log_warnx("vionet: packet read_mem error @ "
1480 			    "0x%llx", pkt_desc->addr);
1481 			goto out;
1482 		}
1483 
1484 		/* reject other source addresses */
1485 		if (dev->lockedmac && pktsz >= ETHER_HDR_LEN &&
1486 		    (eh = (struct ether_header *)pkt) &&
1487 		    memcmp(eh->ether_shost, dev->mac,
1488 		    sizeof(eh->ether_shost)) != 0)
1489 			log_debug("vionet: wrong source address %s for vm %d",
1490 			    ether_ntoa((struct ether_addr *)
1491 			    eh->ether_shost), dev->vm_id);
1492 		else if (dev->local &&
1493 		    (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) {
1494 			log_debug("vionet: dhcp request,"
1495 			    " local response size %zd", dhcpsz);
1496 
1497 		/* XXX signed vs unsigned here, funky cast */
1498 		} else if (write(dev->fd, pkt, pktsz) != (int)pktsz) {
1499 			log_warnx("vionet: tx failed writing to tap: "
1500 			    "%d", errno);
1501 			goto out;
1502 		}
1503 
1504 	drop_packet:
1505 		ret = 1;
1506 		dev->cfg.isr_status = 1;
1507 		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx;
1508 		used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len;
1509 		used->idx++;
1510 
1511 		dev->vq[TXQ].last_avail++;
1512 		num_enq++;
1513 
1514 		idx = dev->vq[TXQ].last_avail & VIONET_QUEUE_MASK;
1515 
1516 		free(pkt);
1517 		pkt = NULL;
1518 	}
1519 
1520 	if (write_mem(q_gpa, vr, vr_sz)) {
1521 		log_warnx("vionet: tx error writing vio ring");
1522 	}
1523 
1524 	if (dhcpsz > 0) {
1525 		if (vionet_enq_rx(dev, dhcppkt, dhcpsz, &spc))
1526 			ret = 1;
1527 	}
1528 
1529 out:
1530 	free(vr);
1531 	free(pkt);
1532 	free(dhcppkt);
1533 
1534 	return (ret);
1535 }
1536 
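/*
 * vmmci_ctl
 *
 * Forwards a control command (shutdown, reboot or RTC sync) to the guest by
 * raising a config-change interrupt on the vmmci device.  For shutdown and
 * reboot an ACK timeout is armed so an unresponsive guest is eventually
 * stopped by vmmci_timeout().
 *
 * Returns -1 if the guest driver is not ready, 0 otherwise.
 */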
1537 int
1538 vmmci_ctl(unsigned int cmd)
1539 {
1540 	struct timeval tv = { 0, 0 };
1541 
1542 	if ((vmmci.cfg.device_status &
1543 	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
1544 		return (-1);
1545 
1546 	if (cmd == vmmci.cmd)
1547 		return (0);
1548 
1549 	switch (cmd) {
1550 	case VMMCI_NONE:
1551 		break;
1552 	case VMMCI_SHUTDOWN:
1553 	case VMMCI_REBOOT:
1554 		/* Update command */
1555 		vmmci.cmd = cmd;
1556 
1557 		/*
1558 		 * vmm VMs do not support powerdown; send a reboot request
1559 		 * instead and turn the VM off after the triple fault.
1560 		 */
1561 		if (cmd == VMMCI_SHUTDOWN)
1562 			cmd = VMMCI_REBOOT;
1563 
1564 		/* Trigger interrupt */
1565 		vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
1566 		vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1567 
1568 		/* Add ACK timeout */
1569 		tv.tv_sec = VMMCI_TIMEOUT;
1570 		evtimer_add(&vmmci.timeout, &tv);
1571 		break;
1572 	case VMMCI_SYNCRTC:
1573 		if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
1574 			/* RTC updated, request guest VM resync of its RTC */
1575 			vmmci.cmd = cmd;
1576 
1577 			vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
1578 			vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1579 		} else {
1580 			log_debug("%s: RTC sync skipped (guest does not "
1581 			    "support RTC sync)", __func__);
1582 		}
1583 		break;
1584 	default:
1585 		fatalx("invalid vmmci command: %d", cmd);
1586 	}
1587 
1588 	return (0);
1589 }
1590 
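/*
 * vmmci_ack
 *
 * Handles a command acknowledgement written by the guest to the vmmci
 * config space, adjusting the shutdown/reboot timeout accordingly.
 */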
1591 void
1592 vmmci_ack(unsigned int cmd)
1593 {
1594 	struct timeval	 tv = { 0, 0 };
1595 
1596 	switch (cmd) {
1597 	case VMMCI_NONE:
1598 		break;
1599 	case VMMCI_SHUTDOWN:
1600 		/*
1601 		 * If we don't have a pending shutdown request, the shutdown
1602 		 * was requested by the VM itself.  In this case, add a short
1603 		 * timeout to give the VM a chance to reboot before the
1604 		 * timer expires.
1605 		 */
1606 		if (vmmci.cmd == 0) {
1607 			log_debug("%s: vm %u requested shutdown", __func__,
1608 			    vmmci.vm_id);
1609 			tv.tv_sec = VMMCI_TIMEOUT;
1610 			evtimer_add(&vmmci.timeout, &tv);
1611 			return;
1612 		}
1613 		/* FALLTHROUGH */
1614 	case VMMCI_REBOOT:
1615 		/*
1616 		 * If the VM acknowledged our shutdown request, give it
1617 		 * enough time to shutdown or reboot gracefully.  This
1618 		 * might take a considerable amount of time (running
1619 		 * rc.shutdown on the VM), so increase the timeout before
1620 		 * killing it forcefully.
1621 		 */
1622 		if (cmd == vmmci.cmd &&
1623 		    evtimer_pending(&vmmci.timeout, NULL)) {
1624 			log_debug("%s: vm %u acknowledged shutdown request",
1625 			    __func__, vmmci.vm_id);
1626 			tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
1627 			evtimer_add(&vmmci.timeout, &tv);
1628 		}
1629 		break;
1630 	case VMMCI_SYNCRTC:
1631 		log_debug("%s: vm %u acknowledged RTC sync request",
1632 		    __func__, vmmci.vm_id);
1633 		vmmci.cmd = VMMCI_NONE;
1634 		break;
1635 	default:
1636 		log_warnx("%s: illegal request %u", __func__, cmd);
1637 		break;
1638 	}
1639 }
1640 
1641 void
1642 vmmci_timeout(int fd, short type, void *arg)
1643 {
1644 	log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
1645 	vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
1646 }
1647 
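/*
 * vmmci_io
 *
 * PCI I/O handler for the vmm control device.  The device-specific config
 * space exposes the pending command at offset 0 (a guest write there
 * acknowledges it) and the host time from gettimeofday(): seconds at
 * offsets 4-11 and microseconds at offsets 12-19, split into 32-bit
 * registers.
 */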
1648 int
1649 vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
1650     void *unused, uint8_t sz)
1651 {
1652 	*intr = 0xFF;
1653 
1654 	if (dir == 0) {
1655 		switch (reg) {
1656 		case VIRTIO_CONFIG_DEVICE_FEATURES:
1657 		case VIRTIO_CONFIG_QUEUE_SIZE:
1658 		case VIRTIO_CONFIG_ISR_STATUS:
1659 			log_warnx("%s: illegal write %x to %s",
1660 			    __progname, *data, virtio_reg_name(reg));
1661 			break;
1662 		case VIRTIO_CONFIG_GUEST_FEATURES:
1663 			vmmci.cfg.guest_feature = *data;
1664 			break;
1665 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
1666 			vmmci.cfg.queue_address = *data;
1667 			break;
1668 		case VIRTIO_CONFIG_QUEUE_SELECT:
1669 			vmmci.cfg.queue_select = *data;
1670 			break;
1671 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1672 			vmmci.cfg.queue_notify = *data;
1673 			break;
1674 		case VIRTIO_CONFIG_DEVICE_STATUS:
1675 			vmmci.cfg.device_status = *data;
1676 			break;
1677 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
1678 			vmmci_ack(*data);
1679 			break;
1680 		}
1681 	} else {
1682 		switch (reg) {
1683 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
1684 			*data = vmmci.cmd;
1685 			break;
1686 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
1687 			/* Update time once when reading the first register */
1688 			gettimeofday(&vmmci.time, NULL);
1689 			*data = (uint64_t)vmmci.time.tv_sec;
1690 			break;
1691 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
1692 			*data = (uint64_t)vmmci.time.tv_sec >> 32;
1693 			break;
1694 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
1695 			*data = (uint64_t)vmmci.time.tv_usec;
1696 			break;
1697 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
1698 			*data = (uint64_t)vmmci.time.tv_usec >> 32;
1699 			break;
1700 		case VIRTIO_CONFIG_DEVICE_FEATURES:
1701 			*data = vmmci.cfg.device_feature;
1702 			break;
1703 		case VIRTIO_CONFIG_GUEST_FEATURES:
1704 			*data = vmmci.cfg.guest_feature;
1705 			break;
1706 		case VIRTIO_CONFIG_QUEUE_ADDRESS:
1707 			*data = vmmci.cfg.queue_address;
1708 			break;
1709 		case VIRTIO_CONFIG_QUEUE_SIZE:
1710 			*data = vmmci.cfg.queue_size;
1711 			break;
1712 		case VIRTIO_CONFIG_QUEUE_SELECT:
1713 			*data = vmmci.cfg.queue_select;
1714 			break;
1715 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1716 			*data = vmmci.cfg.queue_notify;
1717 			break;
1718 		case VIRTIO_CONFIG_DEVICE_STATUS:
1719 			*data = vmmci.cfg.device_status;
1720 			break;
1721 		case VIRTIO_CONFIG_ISR_STATUS:
1722 			*data = vmmci.cfg.isr_status;
1723 			vmmci.cfg.isr_status = 0;
1724 			vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1725 			break;
1726 		}
1727 	}
1728 	return (0);
1729 }
1730 
1731 int
1732 virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
1733 {
1734 	switch (type) {
1735 	case VMDF_RAW:
1736 		return 0;
1737 	case VMDF_QCOW2:
1738 		return virtio_qcow2_get_base(fd, path, npath, dpath);
1739 	}
1740 	log_warnx("%s: invalid disk format", __func__);
1741 	return -1;
1742 }
1743 
1744 /*
1745  * Initializes a struct virtio_backing using the list of fds.
1746  */
1747 static int
1748 virtio_init_disk(struct virtio_backing *file, off_t *sz,
1749     int *fd, size_t nfd, int type)
1750 {
1751 	/*
1752 	 * Initialize the backing store according to the disk format given
1753 	 * by 'type'.  TODO: provide a way of specifying additional options.
1754 	 */
1755 	switch (type) {
1756 	case VMDF_RAW:
1757 		return virtio_raw_init(file, sz, fd, nfd);
1758 	case VMDF_QCOW2:
1759 		return virtio_qcow2_init(file, sz, fd, nfd);
1760 	}
1761 	log_warnx("%s: invalid disk format", __func__);
1762 	return -1;
1763 }
1764 
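/*
 * virtio_init
 *
 * Creates and attaches the PCI virtio devices for the VM described by 'vm':
 * the entropy device, one network device per configured NIC, one block
 * device per disk, an optional vioscsi device for the cdrom and the vmmci
 * control device.
 */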
1765 void
1766 virtio_init(struct vmd_vm *vm, int child_cdrom,
1767     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1768 {
1769 	struct vmop_create_params *vmc = &vm->vm_params;
1770 	struct vm_create_params *vcp = &vmc->vmc_params;
1771 	uint8_t id;
1772 	uint8_t i;
1773 	int ret;
1774 
1775 	/* Virtio entropy device */
1776 	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1777 	    PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
1778 	    PCI_SUBCLASS_SYSTEM_MISC,
1779 	    PCI_VENDOR_OPENBSD,
1780 	    PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
1781 		log_warnx("%s: can't add PCI virtio rng device",
1782 		    __progname);
1783 		return;
1784 	}
1785 
1786 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
1787 		log_warnx("%s: can't add bar for virtio rng device",
1788 		    __progname);
1789 		return;
1790 	}
1791 
1792 	memset(&viornd, 0, sizeof(viornd));
1793 	viornd.vq[0].qs = VIORND_QUEUE_SIZE;
1794 	viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
1795 	    VIORND_QUEUE_SIZE;
1796 	viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
1797 	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
1798 	    + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
1799 	viornd.pci_id = id;
1800 	viornd.irq = pci_get_dev_irq(id);
1801 	viornd.vm_id = vcp->vcp_id;
1802 
1803 	if (vcp->vcp_nnics > 0) {
1804 		vionet = calloc(vcp->vcp_nnics, sizeof(struct vionet_dev));
1805 		if (vionet == NULL) {
1806 			log_warn("%s: calloc failure allocating vionets",
1807 			    __progname);
1808 			return;
1809 		}
1810 
1811 		nr_vionet = vcp->vcp_nnics;
1812 		/* Virtio network */
1813 		for (i = 0; i < vcp->vcp_nnics; i++) {
1814 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1815 			    PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
1816 			    PCI_SUBCLASS_SYSTEM_MISC,
1817 			    PCI_VENDOR_OPENBSD,
1818 			    PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
1819 				log_warnx("%s: can't add PCI virtio net device",
1820 				    __progname);
1821 				return;
1822 			}
1823 
1824 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_net_io,
1825 			    &vionet[i])) {
1826 				log_warnx("%s: can't add bar for virtio net "
1827 				    "device", __progname);
1828 				return;
1829 			}
1830 
1831 			ret = pthread_mutex_init(&vionet[i].mutex, NULL);
1832 			if (ret) {
1833 				errno = ret;
1834 				log_warn("%s: could not initialize mutex "
1835 				    "for vionet device", __progname);
1836 				return;
1837 			}
1838 
1839 			vionet[i].vq[RXQ].qs = VIONET_QUEUE_SIZE;
1840 			vionet[i].vq[RXQ].vq_availoffset =
1841 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
1842 			vionet[i].vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
1843 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
1844 			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
1845 			vionet[i].vq[RXQ].last_avail = 0;
1846 			vionet[i].vq[RXQ].notified_avail = 0;
1847 
1848 			vionet[i].vq[TXQ].qs = VIONET_QUEUE_SIZE;
1849 			vionet[i].vq[TXQ].vq_availoffset =
1850 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
1851 			vionet[i].vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
1852 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
1853 			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
1854 			vionet[i].vq[TXQ].last_avail = 0;
1855 			vionet[i].vq[TXQ].notified_avail = 0;
1856 			vionet[i].fd = child_taps[i];
1857 			vionet[i].vm_id = vcp->vcp_id;
1858 			vionet[i].vm_vmid = vm->vm_vmid;
1859 			vionet[i].irq = pci_get_dev_irq(id);
1860 
1861 			event_set(&vionet[i].event, vionet[i].fd,
1862 			    EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]);
1863 			if (event_add(&vionet[i].event, NULL)) {
1864 				log_warn("could not initialize vionet event "
1865 				    "handler");
1866 				return;
1867 			}
1868 
1869 			/* MAC address has been assigned by the parent */
1870 			memcpy(&vionet[i].mac, &vcp->vcp_macs[i], 6);
1871 			vionet[i].cfg.device_feature = VIRTIO_NET_F_MAC;
1872 
1873 			vionet[i].lockedmac =
1874 			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
1875 			vionet[i].local =
1876 			    vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
1877 			if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
1878 				vionet[i].pxeboot = 1;
1879 			vionet[i].idx = i;
1880 			vionet[i].pci_id = id;
1881 
1882 			log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
1883 			    __func__, vcp->vcp_name, i,
1884 			    ether_ntoa((void *)vionet[i].mac),
1885 			    vionet[i].lockedmac ? ", locked" : "",
1886 			    vionet[i].local ? ", local" : "",
1887 			    vionet[i].pxeboot ? ", pxeboot" : "");
1888 		}
1889 	}
1890 
1891 	if (vcp->vcp_ndisks > 0) {
1892 		nr_vioblk = vcp->vcp_ndisks;
1893 		vioblk = calloc(vcp->vcp_ndisks, sizeof(struct vioblk_dev));
1894 		if (vioblk == NULL) {
1895 			log_warn("%s: calloc failure allocating vioblks",
1896 			    __progname);
1897 			return;
1898 		}
1899 
1900 		/* One virtio block device for each disk defined in vcp */
1901 		for (i = 0; i < vcp->vcp_ndisks; i++) {
1902 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1903 			    PCI_PRODUCT_QUMRANET_VIO_BLOCK,
1904 			    PCI_CLASS_MASS_STORAGE,
1905 			    PCI_SUBCLASS_MASS_STORAGE_SCSI,
1906 			    PCI_VENDOR_OPENBSD,
1907 			    PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
1908 				log_warnx("%s: can't add PCI virtio block "
1909 				    "device", __progname);
1910 				return;
1911 			}
1912 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_blk_io,
1913 			    &vioblk[i])) {
1914 				log_warnx("%s: can't add bar for virtio block "
1915 				    "device", __progname);
1916 				return;
1917 			}
1918 			vioblk[i].vq[0].qs = VIOBLK_QUEUE_SIZE;
1919 			vioblk[i].vq[0].vq_availoffset =
1920 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
1921 			vioblk[i].vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
1922 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
1923 			    + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
1924 			vioblk[i].vq[0].last_avail = 0;
1925 			vioblk[i].cfg.device_feature = VIRTIO_BLK_F_SIZE_MAX;
1926 			vioblk[i].max_xfer = 1048576;	/* 1 MiB */
1927 			vioblk[i].pci_id = id;
1928 			vioblk[i].vm_id = vcp->vcp_id;
1929 			vioblk[i].irq = pci_get_dev_irq(id);
1930 			if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
1931 			    child_disks[i], vmc->vmc_diskbases[i],
1932 			    vmc->vmc_disktypes[i]) == -1) {
1933 				log_warnx("%s: unable to determine disk format",
1934 				    __func__);
1935 				return;
1936 			}
1937 			vioblk[i].sz /= 512;	/* bytes -> 512-byte sectors */
1938 		}
1939 	}
1940 
1941 	/* vioscsi cdrom */
1942 	if (strlen(vcp->vcp_cdrom)) {
1943 		vioscsi = calloc(1, sizeof(struct vioscsi_dev));
1944 		if (vioscsi == NULL) {
1945 			log_warn("%s: calloc failure allocating vioscsi",
1946 			    __progname);
1947 			return;
1948 		}
1949 
1950 		if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1951 		    PCI_PRODUCT_QUMRANET_VIO_SCSI,
1952 		    PCI_CLASS_MASS_STORAGE,
1953 		    PCI_SUBCLASS_MASS_STORAGE_SCSI,
1954 		    PCI_VENDOR_OPENBSD,
1955 		    PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
1956 			log_warnx("%s: can't add PCI vioscsi device",
1957 			    __progname);
1958 			return;
1959 		}
1960 
1961 		if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
1962 			log_warnx("%s: can't add bar for vioscsi device",
1963 			    __progname);
1964 			return;
1965 		}
1966 
1967 		for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
1968 			vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
1969 			vioscsi->vq[i].vq_availoffset =
1970 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
1971 			vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
1972 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
1973 			    + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
1974 			vioscsi->vq[i].last_avail = 0;
1975 		}
1976 		if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
1977 		    &child_cdrom, 1, VMDF_RAW) == -1) {
1978 			log_warnx("%s: unable to determine iso format",
1979 			    __func__);
1980 			return;
1981 		}
1982 		vioscsi->locked = 0;
1983 		vioscsi->lba = 0;
1984 		vioscsi->n_blocks = vioscsi->sz >> 11; /* number of 2048-byte blocks */
1985 		vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
1986 		vioscsi->pci_id = id;
1987 		vioscsi->vm_id = vcp->vcp_id;
1988 		vioscsi->irq = pci_get_dev_irq(id);
1989 	}
1990 
1991 	/* virtio control device */
1992 	if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
1993 	    PCI_PRODUCT_OPENBSD_CONTROL,
1994 	    PCI_CLASS_COMMUNICATIONS,
1995 	    PCI_SUBCLASS_COMMUNICATIONS_MISC,
1996 	    PCI_VENDOR_OPENBSD,
1997 	    PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
1998 		log_warnx("%s: can't add PCI vmm control device",
1999 		    __progname);
2000 		return;
2001 	}
2002 
2003 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
2004 		log_warnx("%s: can't add bar for vmm control device",
2005 		    __progname);
2006 		return;
2007 	}
2008 
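	/*
	 * Reset the control device and advertise the VMMCI_F_TIMESYNC,
	 * VMMCI_F_ACK and VMMCI_F_SYNCRTC features to the guest's
	 * vmmci(4) driver.
	 */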
2009 	memset(&vmmci, 0, sizeof(vmmci));
2010 	vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
2011 	    VMMCI_F_SYNCRTC;
2012 	vmmci.vm_id = vcp->vcp_id;
2013 	vmmci.irq = pci_get_dev_irq(id);
2014 	vmmci.pci_id = id;
2015 
2016 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
2017 }
2018 
2019 /*
2020  * vionet_set_hostmac
2021  *
2022  * Sets the hardware address for the host-side tap(4) on a vionet_dev.
2023  *
2024  * This should only be called from the event-loop thread.
2025  *
2026  * vm: pointer to the current vmd_vm instance
2027  * idx: index into the array of vionet_dev's for the target vionet_dev
2028  * addr: ethernet address to set
2029  */
2030 void
2031 vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
2032 {
2033 	struct vmop_create_params *vmc = &vm->vm_params;
2034 	struct vm_create_params	  *vcp = &vmc->vmc_params;
2035 	struct vionet_dev	  *dev;
2036 
2037 	if (idx >= vcp->vcp_nnics)
2038 		fatalx("vionet_set_hostmac");
2039 
2040 	dev = &vionet[idx];
2041 	memcpy(dev->hostmac, addr, sizeof(dev->hostmac));
2042 }
2043 
2044 void
2045 virtio_shutdown(struct vmd_vm *vm)
2046 {
2047 	int i;
2048 
2049 	/* ensure that our disks are synced */
2050 	if (vioscsi != NULL)
2051 		vioscsi->file.close(vioscsi->file.p, 0);
2052 
2053 	for (i = 0; i < nr_vioblk; i++)
2054 		vioblk[i].file.close(vioblk[i].file.p, 0);
2055 }
2056 
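/*
 * The *_restore() routines below read back the raw structs written by the
 * matching *_dump() routines and re-attach the state that does not survive
 * serialization: BAR I/O callbacks, IRQ lines, events, mutexes and open disk
 * image handles.
 */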
2057 int
2058 vmmci_restore(int fd, uint32_t vm_id)
2059 {
2060 	log_debug("%s: receiving vmmci", __func__);
2061 	if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
2062 		log_warnx("%s: error reading vmmci from fd", __func__);
2063 		return (-1);
2064 	}
2065 
2066 	if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
2067 		log_warnx("%s: can't set bar fn for vmm control device",
2068 		    __progname);
2069 		return (-1);
2070 	}
2071 	vmmci.vm_id = vm_id;
2072 	vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
2073 	memset(&vmmci.timeout, 0, sizeof(struct event));
2074 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
2075 	return (0);
2076 }
2077 
2078 int
2079 viornd_restore(int fd, struct vm_create_params *vcp)
2080 {
2081 	log_debug("%s: receiving viornd", __func__);
2082 	if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
2083 		log_warnx("%s: error reading viornd from fd", __func__);
2084 		return (-1);
2085 	}
2086 	if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
2087 		log_warnx("%s: can't set bar fn for virtio rng device",
2088 		    __progname);
2089 		return (-1);
2090 	}
2091 	viornd.vm_id = vcp->vcp_id;
2092 	viornd.irq = pci_get_dev_irq(viornd.pci_id);
2093 
2094 	return (0);
2095 }
2096 
2097 int
2098 vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
2099 {
2100 	struct vmop_create_params *vmc = &vm->vm_params;
2101 	struct vm_create_params *vcp = &vmc->vmc_params;
2102 	uint8_t i;
2103 	int ret;
2104 
2105 	nr_vionet = vcp->vcp_nnics;
2106 	if (vcp->vcp_nnics > 0) {
2107 		vionet = calloc(vcp->vcp_nnics, sizeof(struct vionet_dev));
2108 		if (vionet == NULL) {
2109 			log_warn("%s: calloc failure allocating vionets",
2110 			    __progname);
2111 			return (-1);
2112 		}
2113 		log_debug("%s: receiving vionet", __func__);
2114 		if (atomicio(read, fd, vionet,
2115 		    vcp->vcp_nnics * sizeof(struct vionet_dev)) !=
2116 		    vcp->vcp_nnics * sizeof(struct vionet_dev)) {
2117 			log_warnx("%s: error reading vionet from fd",
2118 			    __func__);
2119 			return (-1);
2120 		}
2121 
2122 		/* Virtio network */
2123 		for (i = 0; i < vcp->vcp_nnics; i++) {
2124 			if (pci_set_bar_fn(vionet[i].pci_id, 0, virtio_net_io,
2125 			    &vionet[i])) {
2126 				log_warnx("%s: can't set bar fn for virtio net "
2127 				    "device", __progname);
2128 				return (-1);
2129 			}
2130 
2131 			memset(&vionet[i].mutex, 0, sizeof(pthread_mutex_t));
2132 			ret = pthread_mutex_init(&vionet[i].mutex, NULL);
2133 
2134 			if (ret) {
2135 				errno = ret;
2136 				log_warn("%s: could not initialize mutex "
2137 				    "for vionet device", __progname);
2138 				return (-1);
2139 			}
2140 			vionet[i].fd = child_taps[i];
2141 			vionet[i].vm_id = vcp->vcp_id;
2142 			vionet[i].vm_vmid = vm->vm_vmid;
2143 			vionet[i].irq = pci_get_dev_irq(vionet[i].pci_id);
2144 
2145 			memset(&vionet[i].event, 0, sizeof(struct event));
2146 			event_set(&vionet[i].event, vionet[i].fd,
2147 			    EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]);
2148 		}
2149 	}
2150 	return (0);
2151 }
2152 
2153 int
2154 vioblk_restore(int fd, struct vmop_create_params *vmc,
2155     int child_disks[][VM_MAX_BASE_PER_DISK])
2156 {
2157 	struct vm_create_params *vcp = &vmc->vmc_params;
2158 	uint8_t i;
2159 
2160 	nr_vioblk = vcp->vcp_ndisks;
2161 	vioblk = calloc(vcp->vcp_ndisks, sizeof(struct vioblk_dev));
2162 	if (vioblk == NULL) {
2163 		log_warn("%s: calloc failure allocating vioblks", __progname);
2164 		return (-1);
2165 	}
2166 	log_debug("%s: receiving vioblk", __func__);
2167 	if (atomicio(read, fd, vioblk,
2168 	    nr_vioblk * sizeof(struct vioblk_dev)) !=
2169 	    nr_vioblk * sizeof(struct vioblk_dev)) {
2170 		log_warnx("%s: error reading vioblk from fd", __func__);
2171 		return (-1);
2172 	}
2173 	for (i = 0; i < vcp->vcp_ndisks; i++) {
2174 		if (pci_set_bar_fn(vioblk[i].pci_id, 0, virtio_blk_io,
2175 		    &vioblk[i])) {
2176 			log_warnx("%s: can't set bar fn for virtio block "
2177 			    "device", __progname);
2178 			return (-1);
2179 		}
2180 		if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
2181 		    child_disks[i], vmc->vmc_diskbases[i],
2182 		    vmc->vmc_disktypes[i]) == -1) {
2183 			log_warnx("%s: unable to determine disk format",
2184 			    __func__);
2185 			return (-1);
2186 		}
2187 		vioblk[i].vm_id = vcp->vcp_id;
2188 		vioblk[i].irq = pci_get_dev_irq(vioblk[i].pci_id);
2189 	}
2190 	return (0);
2191 }
2192 
2193 int
2194 vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
2195 {
2196 	if (!strlen(vcp->vcp_cdrom))
2197 		return (0);
2198 
2199 	vioscsi = calloc(1, sizeof(struct vioscsi_dev));
2200 	if (vioscsi == NULL) {
2201 		log_warn("%s: calloc failure allocating vioscsi", __progname);
2202 		return (-1);
2203 	}
2204 
2205 	log_debug("%s: receiving vioscsi", __func__);
2206 
2207 	if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
2208 	    sizeof(struct vioscsi_dev)) {
2209 		log_warnx("%s: error reading vioscsi from fd", __func__);
2210 		return (-1);
2211 	}
2212 
2213 	if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
2214 		log_warnx("%s: can't set bar fn for vioscsi device",
2215 		    __progname);
2216 		return (-1);
2217 	}
2218 
2219 	if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
2220 	    VMDF_RAW) == -1) {
2221 		log_warnx("%s: unable to determine iso format", __func__);
2222 		return (-1);
2223 	}
2224 	vioscsi->vm_id = vcp->vcp_id;
2225 	vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
2226 
2227 	return (0);
2228 }
2229 
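/*
 * Restore all virtio devices from fd.  The calls below must consume the
 * structs in the same fixed order virtio_dump() writes them: viornd, vioblk,
 * vioscsi, vionet, vmmci.
 */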
2230 int
2231 virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
2232     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
2233 {
2234 	struct vmop_create_params *vmc = &vm->vm_params;
2235 	struct vm_create_params *vcp = &vmc->vmc_params;
2236 	int ret;
2237 
2238 	if ((ret = viornd_restore(fd, vcp)) == -1)
2239 		return ret;
2240 
2241 	if ((ret = vioblk_restore(fd, vmc, child_disks)) == -1)
2242 		return ret;
2243 
2244 	if ((ret = vioscsi_restore(fd, vcp, child_cdrom)) == -1)
2245 		return ret;
2246 
2247 	if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
2248 		return ret;
2249 
2250 	if ((ret = vmmci_restore(fd, vcp->vcp_id)) == -1)
2251 		return ret;
2252 
2253 	return (0);
2254 }
2255 
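/*
 * Each *_dump() serializes a device by writing its in-memory struct verbatim
 * with atomicio(); host-only pointers embedded in the structs are rebuilt by
 * the *_restore() counterparts.
 */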
2256 int
2257 viornd_dump(int fd)
2258 {
2259 	log_debug("%s: sending viornd", __func__);
2260 	if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
2261 		log_warnx("%s: error writing viornd to fd", __func__);
2262 		return (-1);
2263 	}
2264 	return (0);
2265 }
2266 
2267 int
2268 vmmci_dump(int fd)
2269 {
2270 	log_debug("%s: sending vmmci", __func__);
2271 	if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
2272 		log_warnx("%s: error writing vmmci to fd", __func__);
2273 		return (-1);
2274 	}
2275 	return (0);
2276 }
2277 
2278 int
2279 vionet_dump(int fd)
2280 {
2281 	log_debug("%s: sending vionet", __func__);
2282 	if (atomicio(vwrite, fd, vionet,
2283 	    nr_vionet * sizeof(struct vionet_dev)) !=
2284 	    nr_vionet * sizeof(struct vionet_dev)) {
2285 		log_warnx("%s: error writing vionet to fd", __func__);
2286 		return (-1);
2287 	}
2288 	return (0);
2289 }
2290 
2291 int
2292 vioblk_dump(int fd)
2293 {
2294 	log_debug("%s: sending vioblk", __func__);
2295 	if (atomicio(vwrite, fd, vioblk,
2296 	    nr_vioblk * sizeof(struct vioblk_dev)) !=
2297 	    nr_vioblk * sizeof(struct vioblk_dev)) {
2298 		log_warnx("%s: error writing vioblk to fd", __func__);
2299 		return (-1);
2300 	}
2301 	return (0);
2302 }
2303 
2304 int
2305 vioscsi_dump(int fd)
2306 {
2307 	if (vioscsi == NULL)
2308 		return (0);
2309 
2310 	log_debug("%s: sending vioscsi", __func__);
2311 	if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
2312 	    sizeof(struct vioscsi_dev)) {
2313 		log_warnx("%s: error writing vioscsi to fd", __func__);
2314 		return (-1);
2315 	}
2316 	return (0);
2317 }
2318 
2319 int
2320 virtio_dump(int fd)
2321 {
2322 	int ret;
2323 
2324 	if ((ret = viornd_dump(fd)) == -1)
2325 		return ret;
2326 
2327 	if ((ret = vioblk_dump(fd)) == -1)
2328 		return ret;
2329 
2330 	if ((ret = vioscsi_dump(fd)) == -1)
2331 		return ret;
2332 
2333 	if ((ret = vionet_dump(fd)) == -1)
2334 		return ret;
2335 
2336 	if ((ret = vmmci_dump(fd)) == -1)
2337 		return ret;
2338 
2339 	return (0);
2340 }
2341 
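/*
 * Remove the vionet receive events so the tap fds are no longer polled
 * while the VM is stopped.
 */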
2342 void
2343 virtio_stop(struct vm_create_params *vcp)
2344 {
2345 	uint8_t i;
2346 	for (i = 0; i < vcp->vcp_nnics; i++) {
2347 		if (event_del(&vionet[i].event)) {
2348 			log_warn("could not remove vionet event "
2349 			    "handler");
2350 			return;
2351 		}
2352 	}
2353 }
2354 
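/* Re-add the vionet receive events so tap traffic is processed again. */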
2355 void
2356 virtio_start(struct vm_create_params *vcp)
2357 {
2358 	uint8_t i;
2359 	for (i = 0; i < vcp->vcp_nnics; i++) {
2360 		if (event_add(&vionet[i].event, NULL)) {
2361 			log_warn("could not initialize vionet event "
2362 			    "handler");
2363 			return;
2364 		}
2365 	}
2366 }
2367