1 /* $OpenBSD: virtio.c,v 1.123 2025/01/08 15:46:10 dv Exp $ */
2
3 /*
4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19 #include <sys/param.h> /* PAGE_SIZE */
20 #include <sys/socket.h>
21 #include <sys/wait.h>
22
23 #include <dev/pci/pcireg.h>
24 #include <dev/pci/pcidevs.h>
25 #include <dev/pv/virtioreg.h>
26 #include <dev/pci/virtio_pcireg.h>
27 #include <dev/pv/vioblkreg.h>
28 #include <dev/vmm/vmm.h>
29
30 #include <net/if.h>
31 #include <netinet/in.h>
32 #include <netinet/if_ether.h>
33
34 #include <errno.h>
35 #include <event.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <unistd.h>
39
40 #include "atomicio.h"
41 #include "pci.h"
42 #include "vioscsi.h"
43 #include "virtio.h"
44 #include "vmd.h"
45
46 extern struct vmd *env;
47 extern char *__progname;
48
49 struct viornd_dev viornd;
50 struct vioscsi_dev *vioscsi;
51 struct vmmci_dev vmmci;
52
53 /* Devices emulated in subprocesses are inserted into this list. */
54 SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;
55
56 #define MAXPHYS (64 * 1024) /* max raw I/O transfer size */
57
58 #define VIRTIO_NET_F_MAC (1<<5)
59
60 #define VMMCI_F_TIMESYNC (1<<0)
61 #define VMMCI_F_ACK (1<<1)
62 #define VMMCI_F_SYNCRTC (1<<2)
63
64 #define RXQ 0
65 #define TXQ 1
66
67 static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
68 static void virtio_dispatch_dev(int, short, void *);
69 static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
70 static int virtio_dev_closefds(struct virtio_dev *);
71 static void vmmci_pipe_dispatch(int, short, void *);
72
73 const char *
74 virtio_reg_name(uint8_t reg)
75 {
76 switch (reg) {
77 case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
78 case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
79 case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
80 case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
81 case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
82 case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
83 case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
84 case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
85 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI...VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
86 return "device config 0";
87 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
88 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
89 return "device config 1";
90 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
91 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
92 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
93 default: return "unknown";
94 }
95 }
96
97 uint32_t
98 vring_size(uint32_t vq_size)
99 {
100 uint32_t allocsize1, allocsize2;
101
102 /* allocsize1: descriptor table + avail ring + pad */
103 allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
104 + sizeof(uint16_t) * (2 + vq_size));
105 /* allocsize2: used ring + pad */
106 allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
107 + sizeof(struct vring_used_elem) * vq_size);
108
109 return allocsize1 + allocsize2;
110 }
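/*
 * Layout note with a small worked example (editor's sketch, not built):
 * allocsize1 above is also the offset of the used ring, and the avail
 * ring starts right after the descriptor table.  For the entropy queue:
 *
 *	avail_off = sizeof(struct vring_desc) * VIORND_QUEUE_SIZE;
 *	used_off  = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) *
 *	    VIORND_QUEUE_SIZE + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
 *
 * which matches the vq_availoffset/vq_usedoffset values set up in
 * virtio_init(), and vring_size(qs) == used_off + allocsize2.
 */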
111
112 /* Update queue select */
113 void
114 viornd_update_qs(void)
115 {
116 struct virtio_vq_info *vq_info;
117
118 /* Invalid queue? */
119 if (viornd.cfg.queue_select > 0) {
120 viornd.cfg.queue_size = 0;
121 return;
122 }
123
124 vq_info = &viornd.vq[viornd.cfg.queue_select];
125
126 /* Update queue pfn/size based on queue select */
127 viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
128 viornd.cfg.queue_size = vq_info->qs;
129 }
130
131 /* Update queue address */
132 void
133 viornd_update_qa(void)
134 {
135 struct virtio_vq_info *vq_info;
136 void *hva = NULL;
137
138 /* Invalid queue? */
139 if (viornd.cfg.queue_select > 0)
140 return;
141
142 vq_info = &viornd.vq[viornd.cfg.queue_select];
143 vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;
144
145 hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
146 if (hva == NULL)
147 fatalx("viornd_update_qa");
148 vq_info->q_hva = hva;
149 }
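/*
 * Example (editor's sketch): the guest programs QUEUE_PFN with a page
 * frame number, so the translation to a host pointer is simply
 *
 *	gpa = (uint64_t)queue_pfn * VIRTIO_PAGE_SIZE;
 *	hva = hvaddr_mem(gpa, vring_size(VIORND_QUEUE_SIZE));
 *
 * hvaddr_mem() returns NULL if the range is not backed by guest memory,
 * which is treated as fatal here.
 */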
150
151 int
152 viornd_notifyq(void)
153 {
154 size_t sz;
155 int dxx, ret;
156 uint16_t aidx, uidx;
157 char *vr, *rnd_data;
158 struct vring_desc *desc;
159 struct vring_avail *avail;
160 struct vring_used *used;
161 struct virtio_vq_info *vq_info;
162
163 ret = 0;
164
165 /* Invalid queue? */
166 if (viornd.cfg.queue_notify > 0)
167 return (0);
168
169 vq_info = &viornd.vq[viornd.cfg.queue_notify];
170 vr = vq_info->q_hva;
171 if (vr == NULL)
172 fatalx("%s: null vring", __func__);
173
174 desc = (struct vring_desc *)(vr);
175 avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
176 used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
177
178 aidx = avail->idx & VIORND_QUEUE_MASK;
179 uidx = used->idx & VIORND_QUEUE_MASK;
180
181 dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;
182
183 sz = desc[dxx].len;
184 if (sz > MAXPHYS)
185 fatalx("viornd descriptor size too large (%zu)", sz);
186
187 rnd_data = malloc(sz);
188
189 if (rnd_data != NULL) {
190 arc4random_buf(rnd_data, sz);
191 if (write_mem(desc[dxx].addr, rnd_data, sz)) {
192 log_warnx("viornd: can't write random data @ "
193 "0x%llx",
194 desc[dxx].addr);
195 } else {
196 /* ret == 1 -> interrupt needed */
197 /* XXX check VIRTIO_F_NO_INTR */
198 ret = 1;
199 viornd.cfg.isr_status = 1;
200 used->ring[uidx].id = dxx;
201 used->ring[uidx].len = sz;
202 __sync_synchronize();
203 used->idx++;
204 }
205 free(rnd_data);
206 } else
207 fatal("memory allocation error for viornd data");
208
209 return (ret);
210 }
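/*
 * Ring handshake, in brief (editor's note): the guest places a
 * descriptor index in avail->ring[avail->idx & VIORND_QUEUE_MASK] and
 * advances avail->idx; the host fills that descriptor's buffer, records
 * { id, len } in used->ring[used->idx & VIORND_QUEUE_MASK], issues a
 * memory barrier (__sync_synchronize() above) and only then advances
 * used->idx and raises the interrupt.
 */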
211
212 int
213 virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
214 void *unused, uint8_t sz)
215 {
216 *intr = 0xFF;
217
218 if (dir == 0) {
219 switch (reg) {
220 case VIRTIO_CONFIG_DEVICE_FEATURES:
221 case VIRTIO_CONFIG_QUEUE_SIZE:
222 case VIRTIO_CONFIG_ISR_STATUS:
223 log_warnx("%s: illegal write %x to %s",
224 __progname, *data, virtio_reg_name(reg));
225 break;
226 case VIRTIO_CONFIG_GUEST_FEATURES:
227 viornd.cfg.guest_feature = *data;
228 break;
229 case VIRTIO_CONFIG_QUEUE_PFN:
230 viornd.cfg.queue_pfn = *data;
231 viornd_update_qa();
232 break;
233 case VIRTIO_CONFIG_QUEUE_SELECT:
234 viornd.cfg.queue_select = *data;
235 viornd_update_qs();
236 break;
237 case VIRTIO_CONFIG_QUEUE_NOTIFY:
238 viornd.cfg.queue_notify = *data;
239 if (viornd_notifyq())
240 *intr = 1;
241 break;
242 case VIRTIO_CONFIG_DEVICE_STATUS:
243 viornd.cfg.device_status = *data;
244 break;
245 }
246 } else {
247 switch (reg) {
248 case VIRTIO_CONFIG_DEVICE_FEATURES:
249 *data = viornd.cfg.device_feature;
250 break;
251 case VIRTIO_CONFIG_GUEST_FEATURES:
252 *data = viornd.cfg.guest_feature;
253 break;
254 case VIRTIO_CONFIG_QUEUE_PFN:
255 *data = viornd.cfg.queue_pfn;
256 break;
257 case VIRTIO_CONFIG_QUEUE_SIZE:
258 *data = viornd.cfg.queue_size;
259 break;
260 case VIRTIO_CONFIG_QUEUE_SELECT:
261 *data = viornd.cfg.queue_select;
262 break;
263 case VIRTIO_CONFIG_QUEUE_NOTIFY:
264 *data = viornd.cfg.queue_notify;
265 break;
266 case VIRTIO_CONFIG_DEVICE_STATUS:
267 *data = viornd.cfg.device_status;
268 break;
269 case VIRTIO_CONFIG_ISR_STATUS:
270 *data = viornd.cfg.isr_status;
271 viornd.cfg.isr_status = 0;
272 vcpu_deassert_irq(viornd.vm_id, 0, viornd.irq);
273 break;
274 }
275 }
276 return (0);
277 }
278
279 /*
280 * vmmci_ctl
281 *
282 * Inject a command into the vmmci device, potentially delivering interrupt.
283 *
284 * Called by the vm process's event(3) loop.
285 */
286 int
287 vmmci_ctl(unsigned int cmd)
288 {
289 int ret = 0;
290 struct timeval tv = { 0, 0 };
291
292 mutex_lock(&vmmci.mutex);
293
294 if ((vmmci.cfg.device_status &
295 VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) {
296 ret = -1;
297 goto unlock;
298 }
299
300 if (cmd == vmmci.cmd)
301 goto unlock;
302
303 switch (cmd) {
304 case VMMCI_NONE:
305 break;
306 case VMMCI_SHUTDOWN:
307 case VMMCI_REBOOT:
308 /* Update command */
309 vmmci.cmd = cmd;
310
311 /*
312 * vmm VMs do not support powerdown, send a reboot request
313 * instead and turn it off after the triple fault.
314 */
315 if (cmd == VMMCI_SHUTDOWN)
316 cmd = VMMCI_REBOOT;
317
318 /* Trigger interrupt */
319 vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
320 vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
321
322 /* Add ACK timeout */
323 tv.tv_sec = VMMCI_TIMEOUT_SHORT;
324 evtimer_add(&vmmci.timeout, &tv);
325 break;
326 case VMMCI_SYNCRTC:
327 if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
328 /* RTC updated, request guest VM resync of its RTC */
329 vmmci.cmd = cmd;
330
331 vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
332 vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
333 } else {
334 log_debug("%s: RTC sync skipped (guest does not "
335 "support RTC sync)\n", __func__);
336 }
337 break;
338 default:
339 fatalx("invalid vmmci command: %d", cmd);
340 }
341
342 unlock:
343 mutex_unlock(&vmmci.mutex);
344
345 return (ret);
346 }
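/*
 * Example usage (sketch, assuming the caller is the vm process handling
 * a stop request from the control process):
 *
 *	if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
 *		log_debug("guest not ready, cannot negotiate shutdown");
 *
 * A return of -1 means the guest driver has not set DRIVER_OK, so no
 * graceful shutdown can be requested through vmmci.
 */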
347
348 /*
349 * vmmci_ack
350 *
351 * Process a write to the command register.
352 *
353 * Called by the vcpu thread. Must be called with the mutex held.
354 */
355 void
356 vmmci_ack(unsigned int cmd)
357 {
358 switch (cmd) {
359 case VMMCI_NONE:
360 break;
361 case VMMCI_SHUTDOWN:
362 /*
363 * The shutdown was requested by the VM if we don't have
364 * a pending shutdown request. In this case add a short
365 * timeout to give the VM a chance to reboot before the
366 * timer is expired.
367 */
368 if (vmmci.cmd == 0) {
369 log_debug("%s: vm %u requested shutdown", __func__,
370 vmmci.vm_id);
371 vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT);
372 return;
373 }
374 /* FALLTHROUGH */
375 case VMMCI_REBOOT:
376 /*
377 * If the VM acknowledged our shutdown request, give it
378 * enough time to shutdown or reboot gracefully. This
379 * might take a considerable amount of time (running
380 * rc.shutdown on the VM), so increase the timeout before
381 * killing it forcefully.
382 */
383 if (cmd == vmmci.cmd) {
384 log_debug("%s: vm %u acknowledged shutdown request",
385 __func__, vmmci.vm_id);
386 vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG);
387 }
388 break;
389 case VMMCI_SYNCRTC:
390 log_debug("%s: vm %u acknowledged RTC sync request",
391 __func__, vmmci.vm_id);
392 vmmci.cmd = VMMCI_NONE;
393 break;
394 default:
395 log_warnx("%s: illegal request %u", __func__, cmd);
396 break;
397 }
398 }
399
400 void
401 vmmci_timeout(int fd, short type, void *arg)
402 {
403 log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
404 vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
405 }
406
407 int
408 vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
409 void *unused, uint8_t sz)
410 {
411 *intr = 0xFF;
412
413 mutex_lock(&vmmci.mutex);
414 if (dir == 0) {
415 switch (reg) {
416 case VIRTIO_CONFIG_DEVICE_FEATURES:
417 case VIRTIO_CONFIG_QUEUE_SIZE:
418 case VIRTIO_CONFIG_ISR_STATUS:
419 log_warnx("%s: illegal write %x to %s",
420 __progname, *data, virtio_reg_name(reg));
421 break;
422 case VIRTIO_CONFIG_GUEST_FEATURES:
423 vmmci.cfg.guest_feature = *data;
424 break;
425 case VIRTIO_CONFIG_QUEUE_PFN:
426 vmmci.cfg.queue_pfn = *data;
427 break;
428 case VIRTIO_CONFIG_QUEUE_SELECT:
429 vmmci.cfg.queue_select = *data;
430 break;
431 case VIRTIO_CONFIG_QUEUE_NOTIFY:
432 vmmci.cfg.queue_notify = *data;
433 break;
434 case VIRTIO_CONFIG_DEVICE_STATUS:
435 vmmci.cfg.device_status = *data;
436 break;
437 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
438 vmmci_ack(*data);
439 break;
440 }
441 } else {
442 switch (reg) {
443 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
444 *data = vmmci.cmd;
445 break;
446 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
447 /* Update time once when reading the first register */
448 gettimeofday(&vmmci.time, NULL);
449 *data = (uint64_t)vmmci.time.tv_sec;
450 break;
451 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
452 *data = (uint64_t)vmmci.time.tv_sec >> 32;
453 break;
454 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
455 *data = (uint64_t)vmmci.time.tv_usec;
456 break;
457 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
458 *data = (uint64_t)vmmci.time.tv_usec >> 32;
459 break;
460 case VIRTIO_CONFIG_DEVICE_FEATURES:
461 *data = vmmci.cfg.device_feature;
462 break;
463 case VIRTIO_CONFIG_GUEST_FEATURES:
464 *data = vmmci.cfg.guest_feature;
465 break;
466 case VIRTIO_CONFIG_QUEUE_PFN:
467 *data = vmmci.cfg.queue_pfn;
468 break;
469 case VIRTIO_CONFIG_QUEUE_SIZE:
470 *data = vmmci.cfg.queue_size;
471 break;
472 case VIRTIO_CONFIG_QUEUE_SELECT:
473 *data = vmmci.cfg.queue_select;
474 break;
475 case VIRTIO_CONFIG_QUEUE_NOTIFY:
476 *data = vmmci.cfg.queue_notify;
477 break;
478 case VIRTIO_CONFIG_DEVICE_STATUS:
479 *data = vmmci.cfg.device_status;
480 break;
481 case VIRTIO_CONFIG_ISR_STATUS:
482 *data = vmmci.cfg.isr_status;
483 vmmci.cfg.isr_status = 0;
484 vcpu_deassert_irq(vmmci.vm_id, 0, vmmci.irq);
485 break;
486 }
487 }
488 mutex_unlock(&vmmci.mutex);
489
490 return (0);
491 }
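/*
 * Config space example (editor's sketch): tv_sec and tv_usec are each
 * exported as a pair of 32-bit registers, low word first, starting at
 * NOMSI+4.  A guest would reassemble them roughly as
 *
 *	sec  = ((uint64_t)reg_plus_8  << 32) | reg_plus_4;
 *	usec = ((uint64_t)reg_plus_16 << 32) | reg_plus_12;
 *
 * Reading NOMSI+4 first matters, since only that access refreshes
 * vmmci.time via gettimeofday().
 */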
492
493 int
494 virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
495 {
496 switch (type) {
497 case VMDF_RAW:
498 return 0;
499 case VMDF_QCOW2:
500 return virtio_qcow2_get_base(fd, path, npath, dpath);
501 }
502 log_warnx("%s: invalid disk format", __func__);
503 return -1;
504 }
505
506 static void
507 vmmci_pipe_dispatch(int fd, short event, void *arg)
508 {
509 enum pipe_msg_type msg;
510 struct timeval tv = { 0, 0 };
511
512 msg = vm_pipe_recv(&vmmci.dev_pipe);
513 switch (msg) {
514 case VMMCI_SET_TIMEOUT_SHORT:
515 tv.tv_sec = VMMCI_TIMEOUT_SHORT;
516 evtimer_add(&vmmci.timeout, &tv);
517 break;
518 case VMMCI_SET_TIMEOUT_LONG:
519 tv.tv_sec = VMMCI_TIMEOUT_LONG;
520 evtimer_add(&vmmci.timeout, &tv);
521 break;
522 default:
523 log_warnx("%s: invalid pipe message type %d", __func__, msg);
524 }
525 }
526
527 void
528 virtio_init(struct vmd_vm *vm, int child_cdrom,
529 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
530 {
531 struct vmop_create_params *vmc = &vm->vm_params;
532 struct vm_create_params *vcp = &vmc->vmc_params;
533 struct virtio_dev *dev;
534 uint8_t id;
535 uint8_t i, j;
536 int ret = 0;
537
538 /* Virtio entropy device */
539 if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
540 PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
541 PCI_SUBCLASS_SYSTEM_MISC,
542 PCI_VENDOR_OPENBSD,
543 PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
544 log_warnx("%s: can't add PCI virtio rng device",
545 __progname);
546 return;
547 }
548
549 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
550 log_warnx("%s: can't add bar for virtio rng device",
551 __progname);
552 return;
553 }
554
555 memset(&viornd, 0, sizeof(viornd));
556 viornd.vq[0].qs = VIORND_QUEUE_SIZE;
557 viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
558 VIORND_QUEUE_SIZE;
559 viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
560 sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
561 + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
562 viornd.pci_id = id;
563 viornd.irq = pci_get_dev_irq(id);
564 viornd.vm_id = vcp->vcp_id;
565
566 SLIST_INIT(&virtio_devs);
567
568 if (vmc->vmc_nnics > 0) {
569 for (i = 0; i < vmc->vmc_nnics; i++) {
570 dev = calloc(1, sizeof(struct virtio_dev));
571 if (dev == NULL) {
572 log_warn("%s: calloc failure allocating vionet",
573 __progname);
574 return;
575 }
576 /* Virtio network */
577 dev->dev_type = VMD_DEVTYPE_NET;
578
579 if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
580 PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
581 PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
582 PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
583 log_warnx("%s: can't add PCI virtio net device",
584 __progname);
585 return;
586 }
587 dev->pci_id = id;
588 dev->sync_fd = -1;
589 dev->async_fd = -1;
590 dev->vm_id = vcp->vcp_id;
591 dev->vm_vmid = vm->vm_vmid;
592 dev->irq = pci_get_dev_irq(id);
593
594 /* The vionet pci bar function is called by the vcpu. */
595 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
596 dev)) {
597 log_warnx("%s: can't add bar for virtio net "
598 "device", __progname);
599 return;
600 }
601
602 dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
603 dev->vionet.vq[RXQ].vq_availoffset =
604 sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
605 dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
606 sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
607 + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
608 dev->vionet.vq[RXQ].last_avail = 0;
609 dev->vionet.vq[RXQ].notified_avail = 0;
610
611 dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
612 dev->vionet.vq[TXQ].vq_availoffset =
613 sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
614 dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
615 sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
616 + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
617 dev->vionet.vq[TXQ].last_avail = 0;
618 dev->vionet.vq[TXQ].notified_avail = 0;
619
620 dev->vionet.data_fd = child_taps[i];
621
622 /* MAC address has been assigned by the parent */
623 memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
624 dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
625
626 dev->vionet.lockedmac =
627 vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
628 dev->vionet.local =
629 vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
630 if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
631 dev->vionet.pxeboot = 1;
632 memcpy(&dev->vionet.local_prefix,
633 &env->vmd_cfg.cfg_localprefix,
634 sizeof(dev->vionet.local_prefix));
635 log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
636 __func__, vcp->vcp_name, i,
637 ether_ntoa((void *)dev->vionet.mac),
638 dev->vionet.lockedmac ? ", locked" : "",
639 dev->vionet.local ? ", local" : "",
640 dev->vionet.pxeboot ? ", pxeboot" : "");
641
642 /* Add the vionet to our device list. */
643 dev->vionet.idx = i;
644 SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
645 }
646 }
647
648 if (vmc->vmc_ndisks > 0) {
649 for (i = 0; i < vmc->vmc_ndisks; i++) {
650 dev = calloc(1, sizeof(struct virtio_dev));
651 if (dev == NULL) {
652 log_warn("%s: calloc failure allocating vioblk",
653 __progname);
654 return;
655 }
656
657 /* One vioblk device for each disk defined in vcp */
658 dev->dev_type = VMD_DEVTYPE_DISK;
659
660 if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
661 PCI_PRODUCT_QUMRANET_VIO_BLOCK,
662 PCI_CLASS_MASS_STORAGE,
663 PCI_SUBCLASS_MASS_STORAGE_SCSI,
664 PCI_VENDOR_OPENBSD,
665 PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
666 log_warnx("%s: can't add PCI virtio block "
667 "device", __progname);
668 return;
669 }
670 dev->pci_id = id;
671 dev->sync_fd = -1;
672 dev->async_fd = -1;
673 dev->vm_id = vcp->vcp_id;
674 dev->vm_vmid = vm->vm_vmid;
675 dev->irq = pci_get_dev_irq(id);
676
677 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
678 &dev->vioblk)) {
679 log_warnx("%s: can't add bar for virtio block "
680 "device", __progname);
681 return;
682 }
683 dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
684 dev->vioblk.vq[0].vq_availoffset =
685 sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
686 dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
687 sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
688 + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
689 dev->vioblk.vq[0].last_avail = 0;
690 dev->vioblk.cfg.device_feature =
691 VIRTIO_BLK_F_SEG_MAX;
692 dev->vioblk.seg_max = VIOBLK_SEG_MAX;
693
694 /*
695 * Initialize disk fds to an invalid fd (-1), then
696 * set any child disk fds.
697 */
698 memset(&dev->vioblk.disk_fd, -1,
699 sizeof(dev->vioblk.disk_fd));
700 dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
701 for (j = 0; j < dev->vioblk.ndisk_fd; j++)
702 dev->vioblk.disk_fd[j] = child_disks[i][j];
703
704 dev->vioblk.idx = i;
705 SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
706 }
707 }
708
709 /*
710 * Launch virtio devices that support subprocess execution.
711 */
712 SLIST_FOREACH(dev, &virtio_devs, dev_next) {
713 if (virtio_dev_launch(vm, dev) != 0)
714 fatalx("failed to launch virtio device");
715 }
716
717 /* vioscsi cdrom */
718 if (strlen(vmc->vmc_cdrom)) {
719 vioscsi = calloc(1, sizeof(struct vioscsi_dev));
720 if (vioscsi == NULL) {
721 log_warn("%s: calloc failure allocating vioscsi",
722 __progname);
723 return;
724 }
725
726 if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
727 PCI_PRODUCT_QUMRANET_VIO_SCSI,
728 PCI_CLASS_MASS_STORAGE,
729 PCI_SUBCLASS_MASS_STORAGE_SCSI,
730 PCI_VENDOR_OPENBSD,
731 PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
732 log_warnx("%s: can't add PCI vioscsi device",
733 __progname);
734 return;
735 }
736
737 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
738 log_warnx("%s: can't add bar for vioscsi device",
739 __progname);
740 return;
741 }
742
743 for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
744 vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
745 vioscsi->vq[i].vq_availoffset =
746 sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
747 vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
748 sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
749 + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
750 vioscsi->vq[i].last_avail = 0;
751 }
752 if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
753 1) == -1) {
754 log_warnx("%s: unable to determine iso format",
755 __func__);
756 return;
757 }
758 vioscsi->locked = 0;
759 vioscsi->lba = 0;
760 vioscsi->n_blocks = vioscsi->sz / VIOSCSI_BLOCK_SIZE_CDROM;
761 vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
762 vioscsi->pci_id = id;
763 vioscsi->vm_id = vcp->vcp_id;
764 vioscsi->irq = pci_get_dev_irq(id);
765 }
766
767 /* virtio control device */
768 if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
769 PCI_PRODUCT_OPENBSD_CONTROL,
770 PCI_CLASS_COMMUNICATIONS,
771 PCI_SUBCLASS_COMMUNICATIONS_MISC,
772 PCI_VENDOR_OPENBSD,
773 PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
774 log_warnx("%s: can't add PCI vmm control device",
775 __progname);
776 return;
777 }
778
779 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
780 log_warnx("%s: can't add bar for vmm control device",
781 __progname);
782 return;
783 }
784
785 memset(&vmmci, 0, sizeof(vmmci));
786 vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
787 VMMCI_F_SYNCRTC;
788 vmmci.vm_id = vcp->vcp_id;
789 vmmci.irq = pci_get_dev_irq(id);
790 vmmci.pci_id = id;
791 ret = pthread_mutex_init(&vmmci.mutex, NULL);
792 if (ret) {
793 errno = ret;
794 fatal("could not initialize vmmci mutex");
795 }
796
797 evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
798 vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch);
799 event_add(&vmmci.dev_pipe.read_ev, NULL);
800 }
801
802 /*
803 * vionet_set_hostmac
804 *
805 * Sets the hardware address for the host-side tap(4) on a vionet_dev.
806 *
807 * This should only be called from the event-loop thread
808 *
809 * vm: pointer to the current vmd_vm instance
810 * idx: index into the array of vionet_dev's for the target vionet_dev
811 * addr: ethernet address to set
812 */
813 void
814 vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
815 {
816 struct vmop_create_params *vmc = &vm->vm_params;
817 struct virtio_dev *dev;
818 struct vionet_dev *vionet = NULL;
819 int ret;
820
821 if (idx >= vmc->vmc_nnics)
822 fatalx("%s: invalid vionet index: %u", __func__, idx);
823
824 SLIST_FOREACH(dev, &virtio_devs, dev_next) {
825 if (dev->dev_type == VMD_DEVTYPE_NET
826 && dev->vionet.idx == idx) {
827 vionet = &dev->vionet;
828 break;
829 }
830 }
831 if (vionet == NULL)
832 fatalx("%s: dev == NULL, idx = %u", __func__, idx);
833
834 /* Set the local vm process copy. */
835 memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));
836
837 /* Send the information to the device process. */
838 ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
839 vionet->hostmac, sizeof(vionet->hostmac));
840 if (ret == -1) {
841 log_warnx("%s: failed to queue hostmac to vionet dev %u",
842 __func__, idx);
843 return;
844 }
845 }
846
847 void
848 virtio_shutdown(struct vmd_vm *vm)
849 {
850 int ret, status;
851 pid_t pid = 0;
852 struct virtio_dev *dev, *tmp;
853 struct viodev_msg msg;
854 struct imsgbuf *ibuf;
855
856 /* Ensure that our disks are synced. */
857 if (vioscsi != NULL)
858 vioscsi->file.close(vioscsi->file.p, 0);
859
860 /*
861 * Broadcast shutdown to child devices. We need to do this
862 * synchronously as we have already stopped the async event thread.
863 */
864 SLIST_FOREACH(dev, &virtio_devs, dev_next) {
865 memset(&msg, 0, sizeof(msg));
866 msg.type = VIODEV_MSG_SHUTDOWN;
867 ibuf = &dev->sync_iev.ibuf;
868 ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
869 &msg, sizeof(msg));
870 if (ret == -1)
871 fatalx("%s: failed to send shutdown to device",
872 __func__);
873 if (imsgbuf_flush(ibuf) == -1)
874 fatalx("%s: imsgbuf_flush", __func__);
875 }
876
877 /*
878 * Wait for all children to shutdown using a simple approach of
879 * iterating over known child devices and waiting for them to die.
880 */
881 SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
882 log_debug("%s: waiting on device pid %d", __func__,
883 dev->dev_pid);
884 do {
885 pid = waitpid(dev->dev_pid, &status, WNOHANG);
886 } while (pid == 0 || (pid == -1 && errno == EINTR));
887 if (pid == dev->dev_pid)
888 log_debug("%s: device for pid %d is stopped",
889 __func__, pid);
890 else
891 log_warnx("%s: unexpected pid %d", __func__, pid);
892 free(dev);
893 }
894 }
895
896 int
897 vmmci_restore(int fd, uint32_t vm_id)
898 {
899 log_debug("%s: receiving vmmci", __func__);
900 if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
901 log_warnx("%s: error reading vmmci from fd", __func__);
902 return (-1);
903 }
904
905 if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
906 log_warnx("%s: can't set bar fn for vmm control device",
907 __progname);
908 return (-1);
909 }
910 vmmci.vm_id = vm_id;
911 vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
912 memset(&vmmci.timeout, 0, sizeof(struct event));
913 evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
914 return (0);
915 }
916
917 int
918 viornd_restore(int fd, struct vmd_vm *vm)
919 {
920 void *hva = NULL;
921
922 log_debug("%s: receiving viornd", __func__);
923 if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
924 log_warnx("%s: error reading viornd from fd", __func__);
925 return (-1);
926 }
927 if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
928 log_warnx("%s: can't set bar fn for virtio rng device",
929 __progname);
930 return (-1);
931 }
932 viornd.vm_id = vm->vm_params.vmc_params.vcp_id;
933 viornd.irq = pci_get_dev_irq(viornd.pci_id);
934
935 hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE));
936 if (hva == NULL)
937 fatal("failed to restore viornd virtqueue");
938 viornd.vq[0].q_hva = hva;
939
940 return (0);
941 }
942
943 int
944 vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
945 {
946 struct vmop_create_params *vmc = &vm->vm_params;
947 struct vm_create_params *vcp = &vmc->vmc_params;
948 struct virtio_dev *dev;
949 uint8_t i;
950
951 if (vmc->vmc_nnics == 0)
952 return (0);
953
954 for (i = 0; i < vmc->vmc_nnics; i++) {
955 dev = calloc(1, sizeof(struct virtio_dev));
956 if (dev == NULL) {
957 log_warn("%s: calloc failure allocating vionet",
958 __progname);
959 return (-1);
960 }
961
962 log_debug("%s: receiving virtio network device", __func__);
963 if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
964 != sizeof(struct virtio_dev)) {
965 log_warnx("%s: error reading vionet from fd",
966 __func__);
967 return (-1);
968 }
969
970 /* Virtio network */
971 if (dev->dev_type != VMD_DEVTYPE_NET) {
972 log_warnx("%s: invalid device type", __func__);
973 return (-1);
974 }
975
976 dev->sync_fd = -1;
977 dev->async_fd = -1;
978 dev->vm_id = vcp->vcp_id;
979 dev->vm_vmid = vm->vm_vmid;
980 dev->irq = pci_get_dev_irq(dev->pci_id);
981
982 if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
983 log_warnx("%s: can't set bar fn for virtio net "
984 "device", __progname);
985 return (-1);
986 }
987
988 dev->vionet.data_fd = child_taps[i];
989 dev->vionet.idx = i;
990
991 SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
992 }
993
994 return (0);
995 }
996
997 int
998 vioblk_restore(int fd, struct vmd_vm *vm,
999 int child_disks[][VM_MAX_BASE_PER_DISK])
1000 {
1001 struct vmop_create_params *vmc = &vm->vm_params;
1002 struct virtio_dev *dev;
1003 uint8_t i, j;
1004
1005 if (vmc->vmc_ndisks == 0)
1006 return (0);
1007
1008 for (i = 0; i < vmc->vmc_ndisks; i++) {
1009 dev = calloc(1, sizeof(struct virtio_dev));
1010 if (dev == NULL) {
1011 log_warn("%s: calloc failure allocating vioblks",
1012 __progname);
1013 return (-1);
1014 }
1015
1016 log_debug("%s: receiving vioblk", __func__);
1017 if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
1018 != sizeof(struct virtio_dev)) {
1019 log_warnx("%s: error reading vioblk from fd", __func__);
1020 return (-1);
1021 }
1022 if (dev->dev_type != VMD_DEVTYPE_DISK) {
1023 log_warnx("%s: invalid device type", __func__);
1024 return (-1);
1025 }
1026
1027 dev->sync_fd = -1;
1028 dev->async_fd = -1;
1029
1030 if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
1031 log_warnx("%s: can't set bar fn for virtio block "
1032 "device", __progname);
1033 return (-1);
1034 }
1035 dev->vm_id = vmc->vmc_params.vcp_id;
1036 dev->irq = pci_get_dev_irq(dev->pci_id);
1037
1038 memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
1039 dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
1040 for (j = 0; j < dev->vioblk.ndisk_fd; j++)
1041 dev->vioblk.disk_fd[j] = child_disks[i][j];
1042
1043 dev->vioblk.idx = i;
1044 SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
1045 }
1046 return (0);
1047 }
1048
1049 int
1050 vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
1051 {
1052 void *hva = NULL;
1053 unsigned int i;
1054
1055 if (!strlen(vm->vm_params.vmc_cdrom))
1056 return (0);
1057
1058 vioscsi = calloc(1, sizeof(struct vioscsi_dev));
1059 if (vioscsi == NULL) {
1060 log_warn("%s: calloc failure allocating vioscsi", __progname);
1061 return (-1);
1062 }
1063
1064 log_debug("%s: receiving vioscsi", __func__);
1065
1066 if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
1067 sizeof(struct vioscsi_dev)) {
1068 log_warnx("%s: error reading vioscsi from fd", __func__);
1069 return (-1);
1070 }
1071
1072 if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
1073 log_warnx("%s: can't set bar fn for vmm control device",
1074 __progname);
1075 return (-1);
1076 }
1077
1078 vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
1079 vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
1080
1081 /* vioscsi uses 3 virtqueues. */
1082 for (i = 0; i < 3; i++) {
1083 hva = hvaddr_mem(vioscsi->vq[i].q_gpa,
1084 vring_size(VIOSCSI_QUEUE_SIZE));
1085 if (hva == NULL)
1086 fatal("failed to restore vioscsi virtqueue");
1087 vioscsi->vq[i].q_hva = hva;
1088 }
1089
1090 return (0);
1091 }
1092
1093 int
1094 virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
1095 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1096 {
1097 struct virtio_dev *dev;
1098 int ret;
1099
1100 SLIST_INIT(&virtio_devs);
1101
1102 if ((ret = viornd_restore(fd, vm)) == -1)
1103 return (ret);
1104
1105 if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
1106 return (ret);
1107
1108 if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
1109 return (ret);
1110
1111 if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
1112 return (ret);
1113
1114 if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
1115 return (ret);
1116
1117 SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1118 if (virtio_dev_launch(vm, dev) != 0)
1119 fatalx("%s: failed to restore virtio dev", __func__);
1120 }
1121
1122 return (0);
1123 }
1124
1125 int
1126 viornd_dump(int fd)
1127 {
1128 log_debug("%s: sending viornd", __func__);
1129
1130 viornd.vq[0].q_hva = NULL;
1131
1132 if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
1133 log_warnx("%s: error writing viornd to fd", __func__);
1134 return (-1);
1135 }
1136 return (0);
1137 }
1138
1139 int
1140 vmmci_dump(int fd)
1141 {
1142 log_debug("%s: sending vmmci", __func__);
1143
1144 if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
1145 log_warnx("%s: error writing vmmci to fd", __func__);
1146 return (-1);
1147 }
1148 return (0);
1149 }
1150
1151 int
1152 vionet_dump(int fd)
1153 {
1154 struct virtio_dev *dev, temp;
1155 struct viodev_msg msg;
1156 struct imsg imsg;
1157 struct imsgbuf *ibuf = NULL;
1158 size_t sz;
1159 int ret;
1160
1161 log_debug("%s: dumping vionet", __func__);
1162
1163 SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1164 if (dev->dev_type != VMD_DEVTYPE_NET)
1165 continue;
1166
1167 memset(&msg, 0, sizeof(msg));
1168 memset(&imsg, 0, sizeof(imsg));
1169
1170 ibuf = &dev->sync_iev.ibuf;
1171 msg.type = VIODEV_MSG_DUMP;
1172
1173 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1174 sizeof(msg));
1175 if (ret == -1) {
1176 log_warnx("%s: failed requesting dump of vionet[%d]",
1177 __func__, dev->vionet.idx);
1178 return (-1);
1179 }
1180 if (imsgbuf_flush(ibuf) == -1) {
1181 log_warnx("%s: imsgbuf_flush", __func__);
1182 return (-1);
1183 }
1184
1185 sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
1186 if (sz != sizeof(temp)) {
1187 log_warnx("%s: failed to dump vionet[%d]", __func__,
1188 dev->vionet.idx);
1189 return (-1);
1190 }
1191
1192 /* Clear volatile state. Will reinitialize on restore. */
1193 temp.vionet.vq[RXQ].q_hva = NULL;
1194 temp.vionet.vq[TXQ].q_hva = NULL;
1195 temp.async_fd = -1;
1196 temp.sync_fd = -1;
1197 memset(&temp.async_iev, 0, sizeof(temp.async_iev));
1198 memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
1199
1200 if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
1201 log_warnx("%s: error writing vionet to fd", __func__);
1202 return (-1);
1203 }
1204 }
1205
1206 return (0);
1207 }
1208
1209 int
1210 vioblk_dump(int fd)
1211 {
1212 struct virtio_dev *dev, temp;
1213 struct viodev_msg msg;
1214 struct imsg imsg;
1215 struct imsgbuf *ibuf = NULL;
1216 size_t sz;
1217 int ret;
1218
1219 log_debug("%s: dumping vioblk", __func__);
1220
1221 SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1222 if (dev->dev_type != VMD_DEVTYPE_DISK)
1223 continue;
1224
1225 memset(&msg, 0, sizeof(msg));
1226 memset(&imsg, 0, sizeof(imsg));
1227
1228 ibuf = &dev->sync_iev.ibuf;
1229 msg.type = VIODEV_MSG_DUMP;
1230
1231 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1232 sizeof(msg));
1233 if (ret == -1) {
1234 log_warnx("%s: failed requesting dump of vioblk[%d]",
1235 __func__, dev->vioblk.idx);
1236 return (-1);
1237 }
1238 if (imsgbuf_flush(ibuf) == -1) {
1239 log_warnx("%s: imsgbuf_flush", __func__);
1240 return (-1);
1241 }
1242
1243
1244 sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
1245 if (sz != sizeof(temp)) {
1246 log_warnx("%s: failed to dump vioblk[%d]", __func__,
1247 dev->vioblk.idx);
1248 return (-1);
1249 }
1250
1251 /* Clear volatile state. Will reinitialize on restore. */
1252 temp.vioblk.vq[0].q_hva = NULL;
1253 temp.async_fd = -1;
1254 temp.sync_fd = -1;
1255 memset(&temp.async_iev, 0, sizeof(temp.async_iev));
1256 memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
1257
1258 if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
1259 log_warnx("%s: error writing vioblk to fd", __func__);
1260 return (-1);
1261 }
1262 }
1263
1264 return (0);
1265 }
1266
1267 int
1268 vioscsi_dump(int fd)
1269 {
1270 unsigned int i;
1271
1272 if (vioscsi == NULL)
1273 return (0);
1274
1275 log_debug("%s: sending vioscsi", __func__);
1276
1277 for (i = 0; i < 3; i++)
1278 vioscsi->vq[i].q_hva = NULL;
1279
1280 if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
1281 sizeof(struct vioscsi_dev)) {
1282 log_warnx("%s: error writing vioscsi to fd", __func__);
1283 return (-1);
1284 }
1285 return (0);
1286 }
1287
1288 int
1289 virtio_dump(int fd)
1290 {
1291 int ret;
1292
1293 if ((ret = viornd_dump(fd)) == -1)
1294 return ret;
1295
1296 if ((ret = vioblk_dump(fd)) == -1)
1297 return ret;
1298
1299 if ((ret = vioscsi_dump(fd)) == -1)
1300 return ret;
1301
1302 if ((ret = vionet_dump(fd)) == -1)
1303 return ret;
1304
1305 if ((ret = vmmci_dump(fd)) == -1)
1306 return ret;
1307
1308 return (0);
1309 }
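/*
 * N.b. (editor's note): the dump order above (viornd, vioblk, vioscsi,
 * vionet, vmmci) must match the read order in virtio_restore(), since
 * the state is written as a flat stream of fixed-size structs with no
 * framing between records.
 */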
1310
1311 void virtio_broadcast_imsg(struct vmd_vm *vm, uint16_t type, void *data,
1312 uint16_t datalen)
1313 {
1314 struct virtio_dev *dev;
1315 int ret;
1316
1317 SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1318 ret = imsg_compose_event(&dev->async_iev, type, 0, 0, -1, data,
1319 datalen);
1320 if (ret == -1) {
1321 log_warnx("%s: failed to broadcast imsg type %u",
1322 __func__, type);
1323 }
1324 }
1325
1326 }
1327
1328 void
1329 virtio_stop(struct vmd_vm *vm)
1330 {
1331 return virtio_broadcast_imsg(vm, IMSG_VMDOP_PAUSE_VM, NULL, 0);
1332 }
1333
1334 void
1335 virtio_start(struct vmd_vm *vm)
1336 {
1337 return virtio_broadcast_imsg(vm, IMSG_VMDOP_UNPAUSE_VM, NULL, 0);
1338 }
1339
1340 /*
1341 * Fork+exec a child virtio device. Returns 0 on success.
1342 */
1343 static int
1344 virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
1345 {
1346 char *nargv[12], num[32], vmm_fd[32], vm_name[VM_NAME_MAX], t[2];
1347 pid_t dev_pid;
1348 int sync_fds[2], async_fds[2], ret = 0;
1349 size_t i, sz = 0;
1350 struct viodev_msg msg;
1351 struct virtio_dev *dev_entry;
1352 struct imsg imsg;
1353 struct imsgev *iev = &dev->sync_iev;
1354
1355 switch (dev->dev_type) {
1356 case VMD_DEVTYPE_NET:
1357 log_debug("%s: launching vionet%d",
1358 vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
1359 break;
1360 case VMD_DEVTYPE_DISK:
1361 log_debug("%s: launching vioblk%d",
1362 vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
1363 break;
1364 /* NOTREACHED */
1365 default:
1366 log_warnx("%s: invalid device type", __func__);
1367 return (EINVAL);
1368 }
1369
1370 /* We need two channels: one synchronous (IO reads) and one async. */
1371 if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC,
1372 sync_fds) == -1) {
1373 log_warn("failed to create socketpair");
1374 return (errno);
1375 }
1376 if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC,
1377 async_fds) == -1) {
1378 log_warn("failed to create async socketpair");
1379 return (errno);
1380 }
1381
1382 /* Fork... */
1383 dev_pid = fork();
1384 if (dev_pid == -1) {
1385 ret = errno;
1386 log_warn("%s: fork failed", __func__);
1387 goto err;
1388 }
1389
1390 if (dev_pid > 0) {
1391 /* Parent */
1392 close_fd(sync_fds[1]);
1393 close_fd(async_fds[1]);
1394
1395 /* Save the child's pid to help with cleanup. */
1396 dev->dev_pid = dev_pid;
1397
1398 /* Set the channel fds to the child's before sending. */
1399 dev->sync_fd = sync_fds[1];
1400 dev->async_fd = async_fds[1];
1401
1402 /* 1. Send over our configured device. */
1403 log_debug("%s: sending '%c' type device struct", __func__,
1404 dev->dev_type);
1405 sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
1406 if (sz != sizeof(*dev)) {
1407 log_warnx("%s: failed to send device", __func__);
1408 ret = EIO;
1409 goto err;
1410 }
1411
1412 /* Close data fds. Only the child device needs them now. */
1413 if (virtio_dev_closefds(dev) == -1) {
1414 log_warnx("%s: failed to close device data fds",
1415 __func__);
1416 goto err;
1417 }
1418
1419 /* 2. Send over details on the VM (including memory fds). */
1420 log_debug("%s: sending vm message for '%s'", __func__,
1421 vm->vm_params.vmc_params.vcp_name);
1422 sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
1423 if (sz != sizeof(*vm)) {
1424 log_warnx("%s: failed to send vm details", __func__);
1425 ret = EIO;
1426 goto err;
1427 }
1428
1429 /*
1430 * Initialize our imsg channel to the child device. The initial
1431 * communication will be synchronous. We expect the child to
1432 * report itself "ready" to confirm the launch was a success.
1433 */
1434 if (imsgbuf_init(&iev->ibuf, sync_fds[0]) == -1) {
1435 log_warn("%s: failed to init imsgbuf", __func__);
1436 goto err;
1437 }
1438 imsgbuf_allow_fdpass(&iev->ibuf);
1439 ret = imsgbuf_read_one(&iev->ibuf, &imsg);
1440 if (ret == 0 || ret == -1) {
1441 log_warnx("%s: failed to receive ready message from "
1442 "'%c' type device", __func__, dev->dev_type);
1443 ret = EIO;
1444 goto err;
1445 }
1446 ret = 0;
1447
1448 IMSG_SIZE_CHECK(&imsg, &msg);
1449 memcpy(&msg, imsg.data, sizeof(msg));
1450 imsg_free(&imsg);
1451
1452 if (msg.type != VIODEV_MSG_READY) {
1453 log_warnx("%s: expected ready message, got type %d",
1454 __func__, msg.type);
1455 ret = EINVAL;
1456 goto err;
1457 }
1458 log_debug("%s: device reports ready via sync channel",
1459 __func__);
1460
1461 /*
1462 * Wire in the async event handling, but after reverting back
1463 * to the parent's fd's.
1464 */
1465 dev->sync_fd = sync_fds[0];
1466 dev->async_fd = async_fds[0];
1467 vm_device_pipe(dev, virtio_dispatch_dev, NULL);
1468 } else {
1469 /* Child */
1470 close_fd(async_fds[0]);
1471 close_fd(sync_fds[0]);
1472
1473 /* Close pty. Virtio devices do not need it. */
1474 close_fd(vm->vm_tty);
1475 vm->vm_tty = -1;
1476
1477 if (vm->vm_cdrom != -1) {
1478 close_fd(vm->vm_cdrom);
1479 vm->vm_cdrom = -1;
1480 }
1481
1482 /* Keep this device's data fds open across exec; close the others'. */
1483 SLIST_FOREACH(dev_entry, &virtio_devs, dev_next) {
1484 if (dev_entry == dev)
1485 continue;
1486 if (virtio_dev_closefds(dev_entry) == -1)
1487 fatalx("unable to close other virtio devs");
1488 }
1489
1490 memset(num, 0, sizeof(num));
1491 snprintf(num, sizeof(num), "%d", sync_fds[1]);
1492 memset(vmm_fd, 0, sizeof(vmm_fd));
1493 snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
1494 memset(vm_name, 0, sizeof(vm_name));
1495 snprintf(vm_name, sizeof(vm_name), "%s",
1496 vm->vm_params.vmc_params.vcp_name);
1497
1498 t[0] = dev->dev_type;
1499 t[1] = '\0';
1500
1501 i = 0;
1502 nargv[i++] = env->argv0;
1503 nargv[i++] = "-X";
1504 nargv[i++] = num;
1505 nargv[i++] = "-t";
1506 nargv[i++] = t;
1507 nargv[i++] = "-i";
1508 nargv[i++] = vmm_fd;
1509 nargv[i++] = "-p";
1510 nargv[i++] = vm_name;
1511 if (env->vmd_debug)
1512 nargv[i++] = "-d";
1513 if (env->vmd_verbose == 1)
1514 nargv[i++] = "-v";
1515 else if (env->vmd_verbose > 1)
1516 nargv[i++] = "-vv";
1517 nargv[i++] = NULL;
1518 if (i > sizeof(nargv) / sizeof(nargv[0]))
1519 fatalx("%s: nargv overflow", __func__);
1520
1521 /* Control resumes in vmd.c:main(). */
1522 execvp(nargv[0], nargv);
1523
1524 ret = errno;
1525 log_warn("%s: failed to exec device", __func__);
1526 _exit(ret);
1527 /* NOTREACHED */
1528 }
1529
1530 return (ret);
1531
1532 err:
1533 close_fd(sync_fds[0]);
1534 close_fd(sync_fds[1]);
1535 close_fd(async_fds[0]);
1536 close_fd(async_fds[1]);
1537 return (ret);
1538 }
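/*
 * Example (editor's sketch): for a disk device the child above ends up
 * re-executing vmd roughly as
 *
 *	vmd -X <sync fd> -t <dev_type char> -i <vmm fd> -p <vm name> [-d] [-v | -vv]
 *
 * where the fd numbers are the child's ends of the socketpairs created
 * earlier; the configured device and vm structs are handed to it over
 * the -X descriptor by the writes above.
 */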
1539
1540 /*
1541 * Initialize an async imsg channel for a virtio device.
1542 */
1543 int
1544 vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *),
1545 struct event_base *ev_base)
1546 {
1547 struct imsgev *iev = &dev->async_iev;
1548 int fd = dev->async_fd;
1549
1550 log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__,
1551 dev->dev_type, fd);
1552
1553 if (imsgbuf_init(&iev->ibuf, fd) == -1)
1554 fatal("imsgbuf_init");
1555 imsgbuf_allow_fdpass(&iev->ibuf);
1556 iev->handler = cb;
1557 iev->data = dev;
1558 iev->events = EV_READ;
1559 imsg_event_add2(iev, ev_base);
1560
1561 return (0);
1562 }
1563
1564 void
1565 virtio_dispatch_dev(int fd, short event, void *arg)
1566 {
1567 struct virtio_dev *dev = (struct virtio_dev*)arg;
1568 struct imsgev *iev = &dev->async_iev;
1569 struct imsgbuf *ibuf = &iev->ibuf;
1570 struct imsg imsg;
1571 struct viodev_msg msg;
1572 ssize_t n = 0;
1573
1574 if (event & EV_READ) {
1575 if ((n = imsgbuf_read(ibuf)) == -1)
1576 fatal("%s: imsgbuf_read", __func__);
1577 if (n == 0) {
1578 /* this pipe is dead, so remove the event handler */
1579 log_debug("%s: pipe dead (EV_READ)", __func__);
1580 event_del(&iev->ev);
1581 event_loopexit(NULL);
1582 return;
1583 }
1584 }
1585
1586 if (event & EV_WRITE) {
1587 if (imsgbuf_write(ibuf) == -1) {
1588 if (errno == EPIPE) {
1589 /* this pipe is dead, remove the handler */
1590 log_debug("%s: pipe dead (EV_WRITE)", __func__);
1591 event_del(&iev->ev);
1592 event_loopexit(NULL);
1593 return;
1594 }
1595 fatal("%s: imsgbuf_write", __func__);
1596 }
1597 }
1598
1599 for (;;) {
1600 if ((n = imsg_get(ibuf, &imsg)) == -1)
1601 fatal("%s: imsg_get", __func__);
1602 if (n == 0)
1603 break;
1604
1605 switch (imsg.hdr.type) {
1606 case IMSG_DEVOP_MSG:
1607 IMSG_SIZE_CHECK(&imsg, &msg);
1608 memcpy(&msg, imsg.data, sizeof(msg));
1609 handle_dev_msg(&msg, dev);
1610 break;
1611 default:
1612 log_warnx("%s: got non devop imsg %d", __func__,
1613 imsg.hdr.type);
1614 break;
1615 }
1616 imsg_free(&imsg);
1617 }
1618 imsg_event_add(iev);
1619 }
1620
1621
1622 static int
1623 handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
1624 {
1625 uint32_t vm_id = gdev->vm_id;
1626 int irq = gdev->irq;
1627
1628 switch (msg->type) {
1629 case VIODEV_MSG_KICK:
1630 if (msg->state == INTR_STATE_ASSERT)
1631 vcpu_assert_irq(vm_id, msg->vcpu, irq);
1632 else if (msg->state == INTR_STATE_DEASSERT)
1633 vcpu_deassert_irq(vm_id, msg->vcpu, irq);
1634 break;
1635 case VIODEV_MSG_READY:
1636 log_debug("%s: device reports ready", __func__);
1637 break;
1638 case VIODEV_MSG_ERROR:
1639 log_warnx("%s: device reported error", __func__);
1640 break;
1641 case VIODEV_MSG_INVALID:
1642 case VIODEV_MSG_IO_READ:
1643 case VIODEV_MSG_IO_WRITE:
1644 /* FALLTHROUGH */
1645 default:
1646 log_warnx("%s: unsupported device message type %d", __func__,
1647 msg->type);
1648 return (1);
1649 }
1650
1651 return (0);
1652 }
1653
1654 /*
1655 * Called by the VM process while processing IO from the VCPU thread.
1656 *
1657 * N.b. Since the VCPU thread calls this function, we cannot mutate the event
1658 * system. All ipc messages must be sent manually and cannot be queued for
1659 * the event loop to push them. (We need to perform a synchronous read, so
1660 * this isn't really a big deal.)
1661 */
1662 int
1663 virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
1664 void *cookie, uint8_t sz)
1665 {
1666 struct virtio_dev *dev = (struct virtio_dev *)cookie;
1667 struct imsgbuf *ibuf = &dev->sync_iev.ibuf;
1668 struct imsg imsg;
1669 struct viodev_msg msg;
1670 int ret = 0;
1671
1672 memset(&msg, 0, sizeof(msg));
1673 msg.reg = reg;
1674 msg.io_sz = sz;
1675
1676 if (dir == 0) {
1677 msg.type = VIODEV_MSG_IO_WRITE;
1678 msg.data = *data;
1679 msg.data_valid = 1;
1680 } else
1681 msg.type = VIODEV_MSG_IO_READ;
1682
1683 if (msg.type == VIODEV_MSG_IO_WRITE) {
1684 /*
1685 * Write request. No reply expected.
1686 */
1687 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1688 sizeof(msg));
1689 if (ret == -1) {
1690 log_warn("%s: failed to send async io event to virtio"
1691 " device", __func__);
1692 return (ret);
1693 }
1694 if (imsgbuf_flush(ibuf) == -1) {
1695 log_warnx("%s: imsgbuf_flush (write)", __func__);
1696 return (-1);
1697 }
1698 } else {
1699 /*
1700 * Read request. Requires waiting for a reply.
1701 */
1702 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1703 sizeof(msg));
1704 if (ret == -1) {
1705 log_warnx("%s: failed to send sync io event to virtio"
1706 " device", __func__);
1707 return (ret);
1708 }
1709 if (imsgbuf_flush(ibuf) == -1) {
1710 log_warnx("%s: imsgbuf_flush (read)", __func__);
1711 return (-1);
1712 }
1713
1714 /* Read our reply. */
1715 ret = imsgbuf_read_one(ibuf, &imsg);
1716 if (ret == 0 || ret == -1) {
1717 log_warn("%s: imsgbuf_read (n=%d)", __func__, ret);
1718 return (-1);
1719 }
1720 IMSG_SIZE_CHECK(&imsg, &msg);
1721 memcpy(&msg, imsg.data, sizeof(msg));
1722 imsg_free(&imsg);
1723
1724 if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) {
1725 #if DEBUG
1726 log_debug("%s: got sync read response (reg=%s)",
1727 __func__, virtio_reg_name(msg.reg));
1728 #endif /* DEBUG */
1729 *data = msg.data;
1730 /*
1731 * It's possible we're asked to {de,}assert after the
1732 * device performs a register read.
1733 */
1734 if (msg.state == INTR_STATE_ASSERT)
1735 vcpu_assert_irq(dev->vm_id, msg.vcpu, msg.irq);
1736 else if (msg.state == INTR_STATE_DEASSERT)
1737 vcpu_deassert_irq(dev->vm_id, msg.vcpu, msg.irq);
1738 } else {
1739 log_warnx("%s: expected IO_READ, got %d", __func__,
1740 msg.type);
1741 return (-1);
1742 }
1743 }
1744
1745 return (0);
1746 }
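/*
 * Round-trip example (editor's note): a guest register read lands here
 * with dir != 0; the VIODEV_MSG_IO_READ is flushed over the sync channel
 * and the vcpu thread blocks in imsgbuf_read_one() until the device
 * process replies with the register value (data_valid set) and,
 * possibly, an interrupt state change to apply.
 */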
1747
1748 void
1749 virtio_assert_irq(struct virtio_dev *dev, int vcpu)
1750 {
1751 struct viodev_msg msg;
1752 int ret;
1753
1754 memset(&msg, 0, sizeof(msg));
1755 msg.irq = dev->irq;
1756 msg.vcpu = vcpu;
1757 msg.type = VIODEV_MSG_KICK;
1758 msg.state = INTR_STATE_ASSERT;
1759
1760 ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
1761 &msg, sizeof(msg));
1762 if (ret == -1)
1763 log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
1764 }
1765
1766 void
1767 virtio_deassert_irq(struct virtio_dev *dev, int vcpu)
1768 {
1769 struct viodev_msg msg;
1770 int ret;
1771
1772 memset(&msg, 0, sizeof(msg));
1773 msg.irq = dev->irq;
1774 msg.vcpu = vcpu;
1775 msg.type = VIODEV_MSG_KICK;
1776 msg.state = INTR_STATE_DEASSERT;
1777
1778 ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
1779 &msg, sizeof(msg));
1780 if (ret == -1)
1781 log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
1782 }
1783
1784 /*
1785 * Close all underlying file descriptors for a given virtio device.
1786 */
1787 static int
1788 virtio_dev_closefds(struct virtio_dev *dev)
1789 {
1790 size_t i;
1791
1792 switch (dev->dev_type) {
1793 case VMD_DEVTYPE_DISK:
1794 for (i = 0; i < dev->vioblk.ndisk_fd; i++) {
1795 close_fd(dev->vioblk.disk_fd[i]);
1796 dev->vioblk.disk_fd[i] = -1;
1797 }
1798 break;
1799 case VMD_DEVTYPE_NET:
1800 close_fd(dev->vionet.data_fd);
1801 dev->vionet.data_fd = -1;
1802 break;
1803 default:
1804 log_warnx("%s: invalid device type", __func__);
1805 return (-1);
1806 }
1807
1808 close_fd(dev->async_fd);
1809 dev->async_fd = -1;
1810 close_fd(dev->sync_fd);
1811 dev->sync_fd = -1;
1812
1813 return (0);
1814 }
1815