/* xref: /qemu/hw/virtio/vhost.c (revision 226419d6) */
1 /*
2  * vhost support
3  *
4  * Copyright Red Hat, Inc. 2010
5  *
6  * Authors:
7  *  Michael S. Tsirkin <mst@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15 
16 #include "qemu/osdep.h"
17 #include "hw/virtio/vhost.h"
18 #include "hw/hw.h"
19 #include "qemu/atomic.h"
20 #include "qemu/range.h"
21 #include "qemu/error-report.h"
22 #include "qemu/memfd.h"
23 #include <linux/vhost.h>
24 #include "exec/address-spaces.h"
25 #include "hw/virtio/virtio-bus.h"
26 #include "hw/virtio/virtio-access.h"
27 #include "migration/migration.h"
28 
29 static struct vhost_log *vhost_log;
30 static struct vhost_log *vhost_log_shm;
31 
32 static unsigned int used_memslots;
33 static QLIST_HEAD(, vhost_dev) vhost_devices =
34     QLIST_HEAD_INITIALIZER(vhost_devices);
35 
36 bool vhost_has_free_slot(void)
37 {
38     unsigned int slots_limit = ~0U;
39     struct vhost_dev *hdev;
40 
41     QLIST_FOREACH(hdev, &vhost_devices, entry) {
42         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
43         slots_limit = MIN(slots_limit, r);
44     }
45     return slots_limit > used_memslots;
46 }
47 
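/* Transfer dirty bits from the vhost log into QEMU's dirty bitmap, for the
 * part of the given section that falls inside both supplied ranges.
 *
 * A sketch of the log layout, going by the definitions in
 * hw/virtio/vhost.h: each bit covers VHOST_LOG_PAGE bytes of guest memory
 * and each vhost_log_chunk_t groups one long's worth of bits.  Assuming 4K
 * pages and 64-bit longs, a write to guest-physical address A shows up as
 * bit (A / 0x1000) % 64 of chunk A / (0x1000 * 64), so one chunk covers
 * 256KiB of guest memory.
 */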
48 static void vhost_dev_sync_region(struct vhost_dev *dev,
49                                   MemoryRegionSection *section,
50                                   uint64_t mfirst, uint64_t mlast,
51                                   uint64_t rfirst, uint64_t rlast)
52 {
53     vhost_log_chunk_t *log = dev->log->log;
54 
55     uint64_t start = MAX(mfirst, rfirst);
56     uint64_t end = MIN(mlast, rlast);
57     vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
58     vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
59     uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
60 
61     if (end < start) {
62         return;
63     }
64     assert(end / VHOST_LOG_CHUNK < dev->log_size);
65     assert(start / VHOST_LOG_CHUNK < dev->log_size);
66 
    for (; from < to; ++from) {
68         vhost_log_chunk_t log;
69         /* We first check with non-atomic: much cheaper,
70          * and we expect non-dirty to be the common case. */
71         if (!*from) {
72             addr += VHOST_LOG_CHUNK;
73             continue;
74         }
75         /* Data must be read atomically. We don't really need barrier semantics
76          * but it's easier to use atomic_* than roll our own. */
77         log = atomic_xchg(from, 0);
78         while (log) {
79             int bit = ctzl(log);
80             hwaddr page_addr;
81             hwaddr section_offset;
82             hwaddr mr_offset;
83             page_addr = addr + bit * VHOST_LOG_PAGE;
84             section_offset = page_addr - section->offset_within_address_space;
85             mr_offset = section_offset + section->offset_within_region;
86             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
87             log &= ~(0x1ull << bit);
88         }
89         addr += VHOST_LOG_CHUNK;
90     }
91 }
92 
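/* Sync dirty pages for one memory section, clipped to [first, last].  This
 * covers both the guest memory regions handed to the backend and each
 * virtqueue's used ring, which vhost writes on the guest's behalf.
 */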
93 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
94                                    MemoryRegionSection *section,
95                                    hwaddr first,
96                                    hwaddr last)
97 {
98     int i;
99     hwaddr start_addr;
100     hwaddr end_addr;
101 
102     if (!dev->log_enabled || !dev->started) {
103         return 0;
104     }
105     start_addr = section->offset_within_address_space;
106     end_addr = range_get_last(start_addr, int128_get64(section->size));
107     start_addr = MAX(first, start_addr);
108     end_addr = MIN(last, end_addr);
109 
110     for (i = 0; i < dev->mem->nregions; ++i) {
111         struct vhost_memory_region *reg = dev->mem->regions + i;
112         vhost_dev_sync_region(dev, section, start_addr, end_addr,
113                               reg->guest_phys_addr,
114                               range_get_last(reg->guest_phys_addr,
115                                              reg->memory_size));
116     }
117     for (i = 0; i < dev->nvqs; ++i) {
118         struct vhost_virtqueue *vq = dev->vqs + i;
119         vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
120                               range_get_last(vq->used_phys, vq->used_size));
121     }
122     return 0;
123 }
124 
125 static void vhost_log_sync(MemoryListener *listener,
126                           MemoryRegionSection *section)
127 {
128     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
129                                          memory_listener);
130     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
131 }
132 
133 static void vhost_log_sync_range(struct vhost_dev *dev,
134                                  hwaddr first, hwaddr last)
135 {
136     int i;
137     /* FIXME: this is N^2 in number of sections */
138     for (i = 0; i < dev->n_mem_sections; ++i) {
139         MemoryRegionSection *section = &dev->mem_sections[i];
140         vhost_sync_dirty_bitmap(dev, section, first, last);
141     }
142 }
143 
144 /* Assign/unassign. Keep an unsorted array of non-overlapping
145  * memory regions in dev->mem. */
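/* Example of the cases handled below (made-up addresses): unassigning
 * [0x2000, 0x2fff] from an existing region [0x1000, 0x3fff] splits it into
 * [0x1000, 0x1fff] and [0x3000, 0x3fff]; unassigning [0x3000, 0x4fff]
 * instead merely shrinks it to [0x1000, 0x2fff].
 */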
146 static void vhost_dev_unassign_memory(struct vhost_dev *dev,
147                                       uint64_t start_addr,
148                                       uint64_t size)
149 {
150     int from, to, n = dev->mem->nregions;
151     /* Track overlapping/split regions for sanity checking. */
152     int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
153 
154     for (from = 0, to = 0; from < n; ++from, ++to) {
155         struct vhost_memory_region *reg = dev->mem->regions + to;
156         uint64_t reglast;
157         uint64_t memlast;
158         uint64_t change;
159 
160         /* clone old region */
161         if (to != from) {
162             memcpy(reg, dev->mem->regions + from, sizeof *reg);
163         }
164 
165         /* No overlap is simple */
166         if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
167                             start_addr, size)) {
168             continue;
169         }
170 
        /* A split only happens if the supplied region is in the middle of
         * an existing one, so it cannot overlap any other existing region. */
174         assert(!split);
175 
176         reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
177         memlast = range_get_last(start_addr, size);
178 
179         /* Remove whole region */
180         if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
181             --dev->mem->nregions;
182             --to;
183             ++overlap_middle;
184             continue;
185         }
186 
187         /* Shrink region */
188         if (memlast >= reglast) {
189             reg->memory_size = start_addr - reg->guest_phys_addr;
190             assert(reg->memory_size);
191             assert(!overlap_end);
192             ++overlap_end;
193             continue;
194         }
195 
196         /* Shift region */
197         if (start_addr <= reg->guest_phys_addr) {
198             change = memlast + 1 - reg->guest_phys_addr;
199             reg->memory_size -= change;
200             reg->guest_phys_addr += change;
201             reg->userspace_addr += change;
202             assert(reg->memory_size);
203             assert(!overlap_start);
204             ++overlap_start;
205             continue;
206         }
207 
        /* This only happens if the supplied region is in the middle of
         * an existing one, so it cannot overlap any other existing region. */
211         assert(!overlap_start);
212         assert(!overlap_end);
213         assert(!overlap_middle);
214         /* Split region: shrink first part, shift second part. */
215         memcpy(dev->mem->regions + n, reg, sizeof *reg);
216         reg->memory_size = start_addr - reg->guest_phys_addr;
217         assert(reg->memory_size);
218         change = memlast + 1 - reg->guest_phys_addr;
219         reg = dev->mem->regions + n;
220         reg->memory_size -= change;
221         assert(reg->memory_size);
222         reg->guest_phys_addr += change;
223         reg->userspace_addr += change;
224         /* Never add more than 1 region */
225         assert(dev->mem->nregions == n);
226         ++dev->mem->nregions;
227         ++split;
228     }
229 }
230 
231 /* Called after unassign, so no regions overlap the given range. */
232 static void vhost_dev_assign_memory(struct vhost_dev *dev,
233                                     uint64_t start_addr,
234                                     uint64_t size,
235                                     uint64_t uaddr)
236 {
237     int from, to;
238     struct vhost_memory_region *merged = NULL;
239     for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
240         struct vhost_memory_region *reg = dev->mem->regions + to;
241         uint64_t prlast, urlast;
242         uint64_t pmlast, umlast;
243         uint64_t s, e, u;
244 
245         /* clone old region */
246         if (to != from) {
247             memcpy(reg, dev->mem->regions + from, sizeof *reg);
248         }
249         prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
250         pmlast = range_get_last(start_addr, size);
251         urlast = range_get_last(reg->userspace_addr, reg->memory_size);
252         umlast = range_get_last(uaddr, size);
253 
254         /* check for overlapping regions: should never happen. */
255         assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
256         /* Not an adjacent or overlapping region - do not merge. */
257         if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
258             (pmlast + 1 != reg->guest_phys_addr ||
259              umlast + 1 != reg->userspace_addr)) {
260             continue;
261         }
262 
263         if (dev->vhost_ops->vhost_backend_can_merge &&
264             !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
265                                                      reg->userspace_addr,
266                                                      reg->memory_size)) {
267             continue;
268         }
269 
270         if (merged) {
271             --to;
272             assert(to >= 0);
273         } else {
274             merged = reg;
275         }
276         u = MIN(uaddr, reg->userspace_addr);
277         s = MIN(start_addr, reg->guest_phys_addr);
278         e = MAX(pmlast, prlast);
279         uaddr = merged->userspace_addr = u;
280         start_addr = merged->guest_phys_addr = s;
281         size = merged->memory_size = e - s + 1;
282         assert(merged->memory_size);
283     }
284 
285     if (!merged) {
286         struct vhost_memory_region *reg = dev->mem->regions + to;
287         memset(reg, 0, sizeof *reg);
288         reg->memory_size = size;
289         assert(reg->memory_size);
290         reg->guest_phys_addr = start_addr;
291         reg->userspace_addr = uaddr;
292         ++to;
293     }
294     assert(to <= dev->mem->nregions + 1);
295     dev->mem->nregions = to;
296 }
297 
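/* Number of log chunks needed to cover the highest guest address the
 * backend may write to: every memory region plus every virtqueue's used
 * ring.
 */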
298 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
299 {
300     uint64_t log_size = 0;
301     int i;
302     for (i = 0; i < dev->mem->nregions; ++i) {
303         struct vhost_memory_region *reg = dev->mem->regions + i;
304         uint64_t last = range_get_last(reg->guest_phys_addr,
305                                        reg->memory_size);
306         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
307     }
308     for (i = 0; i < dev->nvqs; ++i) {
309         struct vhost_virtqueue *vq = dev->vqs + i;
310         uint64_t last = vq->used_phys + vq->used_size - 1;
311         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
312     }
313     return log_size;
314 }
315 
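/* The dirty log is either plain heap memory or, when the backend needs to
 * share it with another process (e.g. vhost-user), a sealed memfd region
 * that can be handed over by file descriptor.
 */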
316 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
317 {
318     struct vhost_log *log;
319     uint64_t logsize = size * sizeof(*(log->log));
320     int fd = -1;
321 
322     log = g_new0(struct vhost_log, 1);
323     if (share) {
324         log->log = qemu_memfd_alloc("vhost-log", logsize,
325                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
326                                     &fd);
327         memset(log->log, 0, logsize);
328     } else {
329         log->log = g_malloc0(logsize);
330     }
331 
332     log->size = size;
333     log->refcnt = 1;
334     log->fd = fd;
335 
336     return log;
337 }
338 
339 static struct vhost_log *vhost_log_get(uint64_t size, bool share)
340 {
341     struct vhost_log *log = share ? vhost_log_shm : vhost_log;
342 
343     if (!log || log->size != size) {
344         log = vhost_log_alloc(size, share);
345         if (share) {
346             vhost_log_shm = log;
347         } else {
348             vhost_log = log;
349         }
350     } else {
351         ++log->refcnt;
352     }
353 
354     return log;
355 }
356 
357 static void vhost_log_put(struct vhost_dev *dev, bool sync)
358 {
359     struct vhost_log *log = dev->log;
360 
361     if (!log) {
362         return;
363     }
364 
365     --log->refcnt;
366     if (log->refcnt == 0) {
367         /* Sync only the range covered by the old log */
368         if (dev->log_size && sync) {
369             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
370         }
371 
372         if (vhost_log == log) {
373             g_free(log->log);
374             vhost_log = NULL;
375         } else if (vhost_log_shm == log) {
376             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
377                             log->fd);
378             vhost_log_shm = NULL;
379         }
380 
381         g_free(log);
382     }
383 }
384 
385 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
386 {
387     return dev->vhost_ops->vhost_requires_shm_log &&
388            dev->vhost_ops->vhost_requires_shm_log(dev);
389 }
390 
391 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
392 {
393     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
394     uint64_t log_base = (uintptr_t)log->log;
395     int r;
396 
    /* Inform the backend of the log switch; this must be done before
     * releasing the current log to ensure no logging is lost. */
399     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
400     assert(r >= 0);
401     vhost_log_put(dev, true);
402     dev->log = log;
403     dev->log_size = size;
404 }
405 
406 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
407                                       uint64_t start_addr,
408                                       uint64_t size)
409 {
410     int i;
411     int r = 0;
412 
413     for (i = 0; !r && i < dev->nvqs; ++i) {
414         struct vhost_virtqueue *vq = dev->vqs + i;
415         hwaddr l;
416         void *p;
417 
418         if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
419             continue;
420         }
421         l = vq->ring_size;
422         p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
        if (!p || l != vq->ring_size) {
            fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
            r = -ENOMEM;
        } else if (p != vq->ring) {
            fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
            r = -EBUSY;
        }
        if (p) {
            cpu_physical_memory_unmap(p, l, 0, 0);
        }
432     }
433     return r;
434 }
435 
static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
                                                      uint64_t start_addr,
                                                      uint64_t size)
439 {
440     int i, n = dev->mem->nregions;
441     for (i = 0; i < n; ++i) {
442         struct vhost_memory_region *reg = dev->mem->regions + i;
443         if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
444                            start_addr, size)) {
445             return reg;
446         }
447     }
448     return NULL;
449 }
450 
451 static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
452                                  uint64_t start_addr,
453                                  uint64_t size,
454                                  uint64_t uaddr)
455 {
456     struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
457     uint64_t reglast;
458     uint64_t memlast;
459 
460     if (!reg) {
461         return true;
462     }
463 
464     reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
465     memlast = range_get_last(start_addr, size);
466 
467     /* Need to extend region? */
468     if (start_addr < reg->guest_phys_addr || memlast > reglast) {
469         return true;
470     }
471     /* userspace_addr changed? */
472     return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
473 }
474 
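/* Add or remove a section from the memory map handed to the backend.
 * Sections with dirty logging enabled for anything other than migration
 * (e.g. VGA framebuffers) are never added: vhost can only log dirty pages
 * for migration, so such memory must keep being accessed by QEMU itself.
 */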
475 static void vhost_set_memory(MemoryListener *listener,
476                              MemoryRegionSection *section,
477                              bool add)
478 {
479     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
480                                          memory_listener);
481     hwaddr start_addr = section->offset_within_address_space;
482     ram_addr_t size = int128_get64(section->size);
483     bool log_dirty =
484         memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
485     int s = offsetof(struct vhost_memory, regions) +
486         (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
487     void *ram;
488 
489     dev->mem = g_realloc(dev->mem, s);
490 
491     if (log_dirty) {
492         add = false;
493     }
494 
495     assert(size);
496 
497     /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
498     ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
499     if (add) {
500         if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
501             /* Region exists with same address. Nothing to do. */
502             return;
503         }
504     } else {
505         if (!vhost_dev_find_reg(dev, start_addr, size)) {
506             /* Removing region that we don't access. Nothing to do. */
507             return;
508         }
509     }
510 
511     vhost_dev_unassign_memory(dev, start_addr, size);
512     if (add) {
513         /* Add given mapping, merging adjacent regions if any */
514         vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
515     } else {
516         /* Remove old mapping for this memory, if any. */
517         vhost_dev_unassign_memory(dev, start_addr, size);
518     }
519     dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
520     dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
521     dev->memory_changed = true;
522     used_memslots = dev->mem->nregions;
523 }
524 
525 static bool vhost_section(MemoryRegionSection *section)
526 {
527     return memory_region_is_ram(section->mr);
528 }
529 
530 static void vhost_begin(MemoryListener *listener)
531 {
532     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
533                                          memory_listener);
534     dev->mem_changed_end_addr = 0;
535     dev->mem_changed_start_addr = -1;
536 }
537 
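/* Apply the memory map changes collected since vhost_begin() in one go.
 * The dirty log is grown before the table update and shrunk only after it,
 * so no logged write is ever lost.
 */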
538 static void vhost_commit(MemoryListener *listener)
539 {
540     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
541                                          memory_listener);
542     hwaddr start_addr = 0;
543     ram_addr_t size = 0;
544     uint64_t log_size;
545     int r;
546 
547     if (!dev->memory_changed) {
548         return;
549     }
550     if (!dev->started) {
551         return;
552     }
553     if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
554         return;
555     }
556 
557     if (dev->started) {
558         start_addr = dev->mem_changed_start_addr;
559         size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
560 
561         r = vhost_verify_ring_mappings(dev, start_addr, size);
562         assert(r >= 0);
563     }
564 
565     if (!dev->log_enabled) {
566         r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
567         assert(r >= 0);
568         dev->memory_changed = false;
569         return;
570     }
571     log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes of log to reduce the
     * number of reallocations. */
574 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
575     /* To log more, must increase log size before table update. */
576     if (dev->log_size < log_size) {
577         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
578     }
579     r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
580     assert(r >= 0);
581     /* To log less, can only decrease log size after table update. */
582     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
583         vhost_dev_log_resize(dev, log_size);
584     }
585     dev->memory_changed = false;
586 }
587 
588 static void vhost_region_add(MemoryListener *listener,
589                              MemoryRegionSection *section)
590 {
591     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
592                                          memory_listener);
593 
594     if (!vhost_section(section)) {
595         return;
596     }
597 
598     ++dev->n_mem_sections;
599     dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
600                                 dev->n_mem_sections);
601     dev->mem_sections[dev->n_mem_sections - 1] = *section;
602     memory_region_ref(section->mr);
603     vhost_set_memory(listener, section, true);
604 }
605 
606 static void vhost_region_del(MemoryListener *listener,
607                              MemoryRegionSection *section)
608 {
609     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
610                                          memory_listener);
611     int i;
612 
613     if (!vhost_section(section)) {
614         return;
615     }
616 
617     vhost_set_memory(listener, section, false);
618     memory_region_unref(section->mr);
619     for (i = 0; i < dev->n_mem_sections; ++i) {
620         if (dev->mem_sections[i].offset_within_address_space
621             == section->offset_within_address_space) {
622             --dev->n_mem_sections;
623             memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
624                     (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
625             break;
626         }
627     }
628 }
629 
630 static void vhost_region_nop(MemoryListener *listener,
631                              MemoryRegionSection *section)
632 {
633 }
634 
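/* Tell the backend where this virtqueue's rings live in the QEMU process
 * address space and, when logging is enabled, which guest-physical address
 * to log used-ring writes against.
 */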
635 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
636                                     struct vhost_virtqueue *vq,
637                                     unsigned idx, bool enable_log)
638 {
639     struct vhost_vring_addr addr = {
640         .index = idx,
641         .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
642         .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
643         .used_user_addr = (uint64_t)(unsigned long)vq->used,
644         .log_guest_addr = vq->used_phys,
645         .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
646     };
647     int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
648     if (r < 0) {
649         return -errno;
650     }
651     return 0;
652 }
653 
654 static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
655 {
656     uint64_t features = dev->acked_features;
657     int r;
658     if (enable_log) {
659         features |= 0x1ULL << VHOST_F_LOG_ALL;
660     }
661     r = dev->vhost_ops->vhost_set_features(dev, features);
662     return r < 0 ? -errno : 0;
663 }
664 
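/* Switch dirty logging on or off: renegotiate features with or without
 * VHOST_F_LOG_ALL and update every vring's flags to match.  On failure,
 * roll back the rings that were already updated.
 */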
665 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
666 {
667     int r, t, i, idx;
668     r = vhost_dev_set_features(dev, enable_log);
669     if (r < 0) {
670         goto err_features;
671     }
672     for (i = 0; i < dev->nvqs; ++i) {
673         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
674         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
675                                      enable_log);
676         if (r < 0) {
677             goto err_vq;
678         }
679     }
680     return 0;
681 err_vq:
682     for (; i >= 0; --i) {
683         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
684         t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
685                                      dev->log_enabled);
686         assert(t >= 0);
687     }
688     t = vhost_dev_set_features(dev, dev->log_enabled);
689     assert(t >= 0);
690 err_features:
691     return r;
692 }
693 
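/* Called when the global (migration) dirty log is started or stopped.
 * Enabling allocates a log sized for the current memory map; disabling
 * releases it.  Devices that are not running only record the new state,
 * which vhost_dev_start() picks up later.
 */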
694 static int vhost_migration_log(MemoryListener *listener, int enable)
695 {
696     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
697                                          memory_listener);
698     int r;
699     if (!!enable == dev->log_enabled) {
700         return 0;
701     }
702     if (!dev->started) {
703         dev->log_enabled = enable;
704         return 0;
705     }
706     if (!enable) {
707         r = vhost_dev_set_log(dev, false);
708         if (r < 0) {
709             return r;
710         }
711         vhost_log_put(dev, false);
712         dev->log = NULL;
713         dev->log_size = 0;
714     } else {
715         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
716         r = vhost_dev_set_log(dev, true);
717         if (r < 0) {
718             return r;
719         }
720     }
721     dev->log_enabled = enable;
722     return 0;
723 }
724 
725 static void vhost_log_global_start(MemoryListener *listener)
726 {
727     int r;
728 
729     r = vhost_migration_log(listener, true);
730     if (r < 0) {
731         abort();
732     }
733 }
734 
735 static void vhost_log_global_stop(MemoryListener *listener)
736 {
737     int r;
738 
739     r = vhost_migration_log(listener, false);
740     if (r < 0) {
741         abort();
742     }
743 }
744 
745 static void vhost_log_start(MemoryListener *listener,
746                             MemoryRegionSection *section,
747                             int old, int new)
748 {
749     /* FIXME: implement */
750 }
751 
752 static void vhost_log_stop(MemoryListener *listener,
753                            MemoryRegionSection *section,
754                            int old, int new)
755 {
756     /* FIXME: implement */
757 }
758 
759 /* The vhost driver natively knows how to handle the vrings of non
760  * cross-endian legacy devices and modern devices. Only legacy devices
761  * exposed to a bi-endian guest may require the vhost driver to use a
762  * specific endianness.
763  */
764 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
765 {
766     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
767         return false;
768     }
769 #ifdef TARGET_IS_BIENDIAN
770 #ifdef HOST_WORDS_BIGENDIAN
771     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
772 #else
773     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
774 #endif
775 #else
776     return false;
777 #endif
778 }
779 
780 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
781                                                    bool is_big_endian,
782                                                    int vhost_vq_index)
783 {
784     struct vhost_vring_state s = {
785         .index = vhost_vq_index,
786         .num = is_big_endian
787     };
788 
789     if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
790         return 0;
791     }
792 
793     if (errno == ENOTTY) {
794         error_report("vhost does not support cross-endian");
795         return -ENOSYS;
796     }
797 
798     return -errno;
799 }
800 
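/* Hand one virtqueue over to the backend: program its size and base index,
 * fix up the ring endianness if needed, map the desc/avail/used rings into
 * QEMU's address space and wire up the kick eventfd.
 */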
801 static int vhost_virtqueue_start(struct vhost_dev *dev,
802                                 struct VirtIODevice *vdev,
803                                 struct vhost_virtqueue *vq,
804                                 unsigned idx)
805 {
806     hwaddr s, l, a;
807     int r;
808     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
809     struct vhost_vring_file file = {
810         .index = vhost_vq_index
811     };
812     struct vhost_vring_state state = {
813         .index = vhost_vq_index
814     };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

818     vq->num = state.num = virtio_queue_get_num(vdev, idx);
819     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
820     if (r) {
821         return -errno;
822     }
823 
824     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
825     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
826     if (r) {
827         return -errno;
828     }
829 
830     if (vhost_needs_vring_endian(vdev)) {
831         r = vhost_virtqueue_set_vring_endian_legacy(dev,
832                                                     virtio_is_big_endian(vdev),
833                                                     vhost_vq_index);
834         if (r) {
835             return -errno;
836         }
837     }
838 
839     s = l = virtio_queue_get_desc_size(vdev, idx);
840     a = virtio_queue_get_desc_addr(vdev, idx);
841     vq->desc = cpu_physical_memory_map(a, &l, 0);
842     if (!vq->desc || l != s) {
843         r = -ENOMEM;
844         goto fail_alloc_desc;
845     }
846     s = l = virtio_queue_get_avail_size(vdev, idx);
847     a = virtio_queue_get_avail_addr(vdev, idx);
848     vq->avail = cpu_physical_memory_map(a, &l, 0);
849     if (!vq->avail || l != s) {
850         r = -ENOMEM;
851         goto fail_alloc_avail;
852     }
853     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
854     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
855     vq->used = cpu_physical_memory_map(a, &l, 1);
856     if (!vq->used || l != s) {
857         r = -ENOMEM;
858         goto fail_alloc_used;
859     }
860 
861     vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
862     vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
863     vq->ring = cpu_physical_memory_map(a, &l, 1);
864     if (!vq->ring || l != s) {
865         r = -ENOMEM;
866         goto fail_alloc_ring;
867     }
868 
869     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
870     if (r < 0) {
871         r = -errno;
872         goto fail_alloc;
873     }
874 
875     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
876     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
877     if (r) {
878         r = -errno;
879         goto fail_kick;
880     }
881 
882     /* Clear and discard previous events if any. */
883     event_notifier_test_and_clear(&vq->masked_notifier);
884 
885     /* Init vring in unmasked state, unless guest_notifier_mask
886      * will do it later.
887      */
888     if (!vdev->use_guest_notifier_mask) {
889         /* TODO: check and handle errors. */
890         vhost_virtqueue_mask(dev, vdev, idx, false);
891     }
892 
893     return 0;
894 
895 fail_kick:
896 fail_alloc:
897     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
898                               0, 0);
899 fail_alloc_ring:
900     cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
901                               0, 0);
902 fail_alloc_used:
903     cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
904                               0, 0);
905 fail_alloc_avail:
906     cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
907                               0, 0);
908 fail_alloc_desc:
909     return r;
910 }
911 
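/* Take a virtqueue back from the backend: read back the last avail index so
 * virtio can resume processing in QEMU, restore native endianness for
 * legacy cross-endian setups, and unmap the rings.
 */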
912 static void vhost_virtqueue_stop(struct vhost_dev *dev,
913                                     struct VirtIODevice *vdev,
914                                     struct vhost_virtqueue *vq,
915                                     unsigned idx)
916 {
917     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
918     struct vhost_vring_state state = {
919         .index = vhost_vq_index,
920     };
921     int r;
922 
923     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
924     if (r < 0) {
925         fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
926         fflush(stderr);
927     }
928     virtio_queue_set_last_avail_idx(vdev, idx, state.num);
929     virtio_queue_invalidate_signalled_used(vdev, idx);
930 
    /* In the cross-endian case, we need to reset the vring endianness back
     * to native, which is what legacy devices expect by default.
     */
934     if (vhost_needs_vring_endian(vdev)) {
935         r = vhost_virtqueue_set_vring_endian_legacy(dev,
936                                                     !virtio_is_big_endian(vdev),
937                                                     vhost_vq_index);
938         if (r < 0) {
939             error_report("failed to reset vring endianness");
940         }
941     }
942 
943     assert (r >= 0);
944     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
945                               0, virtio_queue_get_ring_size(vdev, idx));
946     cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
947                               1, virtio_queue_get_used_size(vdev, idx));
948     cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
949                               0, virtio_queue_get_avail_size(vdev, idx));
950     cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
951                               0, virtio_queue_get_desc_size(vdev, idx));
952 }
953 
954 static void vhost_eventfd_add(MemoryListener *listener,
955                               MemoryRegionSection *section,
956                               bool match_data, uint64_t data, EventNotifier *e)
957 {
958 }
959 
960 static void vhost_eventfd_del(MemoryListener *listener,
961                               MemoryRegionSection *section,
962                               bool match_data, uint64_t data, EventNotifier *e)
963 {
964 }
965 
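/* One-time virtqueue setup: create the masked notifier and install it as
 * the backend's call eventfd, so call events are collected there until the
 * queue is started and unmasked.
 */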
966 static int vhost_virtqueue_init(struct vhost_dev *dev,
967                                 struct vhost_virtqueue *vq, int n)
968 {
969     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
970     struct vhost_vring_file file = {
971         .index = vhost_vq_index,
972     };
973     int r = event_notifier_init(&vq->masked_notifier, 0);
974     if (r < 0) {
975         return r;
976     }
977 
978     file.fd = event_notifier_get_fd(&vq->masked_notifier);
979     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
980     if (r) {
981         r = -errno;
982         goto fail_call;
983     }
984     return 0;
985 fail_call:
986     event_notifier_cleanup(&vq->masked_notifier);
987     return r;
988 }
989 
990 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
991 {
992     event_notifier_cleanup(&vq->masked_notifier);
993 }
994 
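/* Bring up a vhost device: pick the backend, check that it can hold the
 * current memory map, take ownership, query features, set up per-virtqueue
 * notifiers and register the memory listener.  Migration is blocked unless
 * the backend can log dirty pages (VHOST_F_LOG_ALL) and memfd allocation
 * works, since a shared log may be needed.
 */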
995 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
996                    VhostBackendType backend_type)
997 {
998     uint64_t features;
999     int i, r;
1000 
1001     hdev->migration_blocker = NULL;
1002 
1003     if (vhost_set_backend_type(hdev, backend_type) < 0) {
1004         close((uintptr_t)opaque);
1005         return -1;
1006     }
1007 
1008     if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
1009         close((uintptr_t)opaque);
1010         return -errno;
1011     }
1012 
1013     if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        fprintf(stderr, "vhost backend memory slots limit is less"
                " than the current number of present memory slots\n");
1016         close((uintptr_t)opaque);
1017         return -1;
1018     }
1019     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1020 
1021     r = hdev->vhost_ops->vhost_set_owner(hdev);
1022     if (r < 0) {
1023         goto fail;
1024     }
1025 
1026     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1027     if (r < 0) {
1028         goto fail;
1029     }
1030 
1031     for (i = 0; i < hdev->nvqs; ++i) {
1032         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1033         if (r < 0) {
1034             goto fail_vq;
1035         }
1036     }
1037     hdev->features = features;
1038 
1039     hdev->memory_listener = (MemoryListener) {
1040         .begin = vhost_begin,
1041         .commit = vhost_commit,
1042         .region_add = vhost_region_add,
1043         .region_del = vhost_region_del,
1044         .region_nop = vhost_region_nop,
1045         .log_start = vhost_log_start,
1046         .log_stop = vhost_log_stop,
1047         .log_sync = vhost_log_sync,
1048         .log_global_start = vhost_log_global_start,
1049         .log_global_stop = vhost_log_global_stop,
1050         .eventfd_add = vhost_eventfd_add,
1051         .eventfd_del = vhost_eventfd_del,
1052         .priority = 10
1053     };
1054 
1055     if (hdev->migration_blocker == NULL) {
1056         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1057             error_setg(&hdev->migration_blocker,
1058                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1059         } else if (!qemu_memfd_check()) {
1060             error_setg(&hdev->migration_blocker,
1061                        "Migration disabled: failed to allocate shared memory");
1062         }
1063     }
1064 
1065     if (hdev->migration_blocker != NULL) {
1066         migrate_add_blocker(hdev->migration_blocker);
1067     }
1068 
1069     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1070     hdev->n_mem_sections = 0;
1071     hdev->mem_sections = NULL;
1072     hdev->log = NULL;
1073     hdev->log_size = 0;
1074     hdev->log_enabled = false;
1075     hdev->started = false;
1076     hdev->memory_changed = false;
1077     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1078     return 0;
1079 fail_vq:
1080     while (--i >= 0) {
1081         vhost_virtqueue_cleanup(hdev->vqs + i);
1082     }
1083 fail:
1084     r = -errno;
1085     hdev->vhost_ops->vhost_backend_cleanup(hdev);
1086     QLIST_REMOVE(hdev, entry);
1087     return r;
1088 }
1089 
1090 void vhost_dev_cleanup(struct vhost_dev *hdev)
1091 {
1092     int i;
1093     for (i = 0; i < hdev->nvqs; ++i) {
1094         vhost_virtqueue_cleanup(hdev->vqs + i);
1095     }
1096     memory_listener_unregister(&hdev->memory_listener);
1097     if (hdev->migration_blocker) {
1098         migrate_del_blocker(hdev->migration_blocker);
1099         error_free(hdev->migration_blocker);
1100     }
1101     g_free(hdev->mem);
1102     g_free(hdev->mem_sections);
1103     hdev->vhost_ops->vhost_backend_cleanup(hdev);
1104     QLIST_REMOVE(hdev, entry);
1105 }
1106 
1107 /* Stop processing guest IO notifications in qemu.
1108  * Start processing them in vhost in kernel.
1109  */
1110 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1111 {
1112     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1113     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1114     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1115     int i, r, e;
1116     if (!k->set_host_notifier) {
1117         fprintf(stderr, "binding does not support host notifiers\n");
1118         r = -ENOSYS;
1119         goto fail;
1120     }
1121 
1122     for (i = 0; i < hdev->nvqs; ++i) {
1123         r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, true);
1124         if (r < 0) {
1125             fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
1126             goto fail_vq;
1127         }
1128     }
1129 
1130     return 0;
1131 fail_vq:
1132     while (--i >= 0) {
1133         e = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
1134         if (e < 0) {
            fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -e);
1136             fflush(stderr);
1137         }
1138         assert (e >= 0);
1139     }
1140 fail:
1141     return r;
1142 }
1143 
1144 /* Stop processing guest IO notifications in vhost.
1145  * Start processing them in qemu.
1146  * This might actually run the qemu handlers right away,
1147  * so virtio in qemu must be completely setup when this is called.
1148  */
1149 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1150 {
1151     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1152     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1153     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1154     int i, r;
1155 
1156     for (i = 0; i < hdev->nvqs; ++i) {
1157         r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
1158         if (r < 0) {
1159             fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
1160             fflush(stderr);
1161         }
1162         assert (r >= 0);
1163     }
1164 }
1165 
1166 /* Test and clear event pending status.
1167  * Should be called after unmask to avoid losing events.
1168  */
1169 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1170 {
1171     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1172     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1173     return event_notifier_test_and_clear(&vq->masked_notifier);
1174 }
1175 
1176 /* Mask/unmask events from this vq. */
1177 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1178                          bool mask)
1179 {
1180     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1181     int r, index = n - hdev->vq_index;
1182     struct vhost_vring_file file;
1183 
1184     if (mask) {
1185         assert(vdev->use_guest_notifier_mask);
1186         file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1187     } else {
1188         file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1189     }
1190 
1191     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1192     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1193     assert(r >= 0);
1194 }
1195 
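/* Mask the features offered to the guest down to what the backend supports.
 * feature_bits lists the bits the caller cares about, terminated by
 * VHOST_INVALID_FEATURE_BIT.
 */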
1196 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1197                             uint64_t features)
1198 {
1199     const int *bit = feature_bits;
1200     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1201         uint64_t bit_mask = (1ULL << *bit);
1202         if (!(hdev->features & bit_mask)) {
1203             features &= ~bit_mask;
1204         }
1205         bit++;
1206     }
1207     return features;
1208 }
1209 
1210 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1211                         uint64_t features)
1212 {
1213     const int *bit = feature_bits;
1214     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1215         uint64_t bit_mask = (1ULL << *bit);
1216         if (features & bit_mask) {
1217             hdev->acked_features |= bit_mask;
1218         }
1219         bit++;
1220     }
1221 }
1222 
1223 /* Host notifiers must be enabled at this point. */
1224 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1225 {
1226     int i, r;
1227 
1228     hdev->started = true;
1229 
1230     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1231     if (r < 0) {
1232         goto fail_features;
1233     }
1234     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1235     if (r < 0) {
1236         r = -errno;
1237         goto fail_mem;
1238     }
1239     for (i = 0; i < hdev->nvqs; ++i) {
1240         r = vhost_virtqueue_start(hdev,
1241                                   vdev,
1242                                   hdev->vqs + i,
1243                                   hdev->vq_index + i);
1244         if (r < 0) {
1245             goto fail_vq;
1246         }
1247     }
1248 
1249     if (hdev->log_enabled) {
1250         uint64_t log_base;
1251 
1252         hdev->log_size = vhost_get_log_size(hdev);
1253         hdev->log = vhost_log_get(hdev->log_size,
1254                                   vhost_dev_log_is_shared(hdev));
1255         log_base = (uintptr_t)hdev->log->log;
1256         r = hdev->vhost_ops->vhost_set_log_base(hdev,
1257                                                 hdev->log_size ? log_base : 0,
1258                                                 hdev->log);
1259         if (r < 0) {
1260             r = -errno;
1261             goto fail_log;
1262         }
1263     }
1264 
1265     return 0;
1266 fail_log:
1267     vhost_log_put(hdev, false);
1268 fail_vq:
1269     while (--i >= 0) {
1270         vhost_virtqueue_stop(hdev,
1271                              vdev,
1272                              hdev->vqs + i,
1273                              hdev->vq_index + i);
1274     }
1275     i = hdev->nvqs;
1276 fail_mem:
1277 fail_features:
1278 
1279     hdev->started = false;
1280     return r;
1281 }
1282 
1283 /* Host notifiers must be enabled at this point. */
1284 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1285 {
1286     int i;
1287 
1288     for (i = 0; i < hdev->nvqs; ++i) {
1289         vhost_virtqueue_stop(hdev,
1290                              vdev,
1291                              hdev->vqs + i,
1292                              hdev->vq_index + i);
1293     }
1294 
1295     vhost_log_put(hdev, true);
1296     hdev->started = false;
1297     hdev->log = NULL;
1298     hdev->log_size = 0;
1299 }
1300 
1301