xref: /qemu/hw/vfio/helpers.c (revision 4bda8224)
/*
 * Low-level, IOMMU-backend-agnostic helpers used by VFIO devices,
 * related to regions, interrupts and capabilities
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>

#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "monitor/monitor.h"

/*
 * Common VFIO interrupt disable
 */
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

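/*
 * Unmask interrupt 0 of @index (DATA_NONE + ACTION_UNMASK), e.g. to
 * re-enable INTx delivery after the previous interrupt was serviced.
 */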
void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

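/* Mask interrupt 0 of @index so it is not delivered until unmasked. */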
void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

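/*
 * String helpers used to compose human-readable error messages in
 * vfio_set_irq_signaling() below.
 */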
static inline const char *action_to_str(int action)
{
    switch (action) {
    case VFIO_IRQ_SET_ACTION_MASK:
        return "MASK";
    case VFIO_IRQ_SET_ACTION_UNMASK:
        return "UNMASK";
    case VFIO_IRQ_SET_ACTION_TRIGGER:
        return "TRIGGER";
    default:
        return "UNKNOWN ACTION";
    }
}

static const char *index_to_str(VFIODevice *vbasedev, int index)
{
    if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
        return NULL;
    }

    switch (index) {
    case VFIO_PCI_INTX_IRQ_INDEX:
        return "INTX";
    case VFIO_PCI_MSI_IRQ_INDEX:
        return "MSI";
    case VFIO_PCI_MSIX_IRQ_INDEX:
        return "MSIX";
    case VFIO_PCI_ERR_IRQ_INDEX:
        return "ERR";
    case VFIO_PCI_REQ_IRQ_INDEX:
        return "REQ";
    default:
        return NULL;
    }
}

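/*
 * Attach or detach an eventfd for a single interrupt: @fd >= 0 wires
 * the eventfd to interrupt @subindex of @index with the given ACTION_*
 * flag, @fd == -1 tears the signaling down.  Returns 0 or -errno,
 * filling @errp with a descriptive message on failure.
 */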
int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                           int action, int fd, Error **errp)
{
    struct vfio_irq_set *irq_set;
    int argsz, ret = 0;
    const char *name;
    int32_t *pfd;

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
    irq_set->index = index;
    irq_set->start = subindex;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = fd;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
        ret = -errno;
    }
    g_free(irq_set);

    if (!ret) {
        return 0;
    }

    error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");

    name = index_to_str(vbasedev, index);
    if (name) {
        error_prepend(errp, "%s-%d: ", name, subindex);
    } else {
        error_prepend(errp, "index %d-%d: ", index, subindex);
    }
    error_prepend(errp,
                  "Failed to %s %s eventfd signaling for interrupt ",
                  fd < 0 ? "tear down" : "set up", action_to_str(action));
    return ret;
}

/*
 * I/O port and MMIO accessors - beware of endianness: VFIO data is
 * always little endian.
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    case 8:
        buf.qword = cpu_to_le64(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %u bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}

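/*
 * Counterpart of vfio_region_write(): pread() from the region's file
 * offset and convert the result from VFIO's little-endian layout to
 * host byte order.  Returns all ones if the read fails.
 */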
uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    case 8:
        data = le64_to_cpu(buf.qword);
        break;
    default:
        hw_error("vfio: unsupported read size, %u bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same as write above */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}

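/*
 * MemoryRegionOps for region ranges that are not mmap()ed: every guest
 * access traps into QEMU and is forwarded via pread()/pwrite() above.
 */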
const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};

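/*
 * Allocate a dirty bitmap covering @size bytes: one bit per host page,
 * with the bit count rounded up to a multiple of 64 bits.  Returns 0
 * or -ENOMEM.
 */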
int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
{
    vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
    vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
                                         BITS_PER_BYTE;
    vbmap->bitmap = g_try_malloc0(vbmap->size);
    if (!vbmap->bitmap) {
        return -ENOMEM;
    }

    return 0;
}

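/*
 * Walk the capability chain starting at @cap_offset inside the info
 * structure @ptr and return the header matching @id, or NULL.  A next
 * offset of zero (i.e. hdr == ptr) terminates the chain.
 */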
struct vfio_info_cap_header *
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
{
    struct vfio_info_cap_header *hdr;

    for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

struct vfio_info_cap_header *
vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

struct vfio_info_cap_header *
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

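/*
 * If the kernel advertises a sparse mmap capability for this region,
 * record each non-empty mmap()able area in region->mmaps.  Returns
 * -ENODEV when the capability is absent, so the caller can fall back
 * to mapping the whole region.
 */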
static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
                                          struct vfio_region_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_sparse_mmap *sparse;
    int i, j;

    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
    if (!hdr) {
        return -ENODEV;
    }

    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);

    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                         region->nr, sparse->nr_areas);

    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);

    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
        if (sparse->areas[i].size) {
            trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
                                            sparse->areas[i].offset +
                                            sparse->areas[i].size - 1);
            region->mmaps[j].offset = sparse->areas[i].offset;
            region->mmaps[j].size = sparse->areas[i].size;
            j++;
        }
    }

    region->nr_mmaps = j;
    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));

    return 0;
}

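/*
 * Query the kernel for region @index and initialize @region from the
 * result: flags, size, file offset, plus a MemoryRegion and the set of
 * mmap()able sub-ranges when the region supports mmap.
 */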
int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
{
    struct vfio_region_info *info;
    int ret;

    ret = vfio_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    region->vbasedev = vbasedev;
    region->flags = info->flags;
    region->size = info->size;
    region->fd_offset = info->offset;
    region->nr = index;

    if (region->size) {
        region->mem = g_new0(MemoryRegion, 1);
        memory_region_init_io(region->mem, obj, &vfio_region_ops,
                              region, name, region->size);

        if (!vbasedev->no_mmap &&
            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {

            ret = vfio_setup_region_sparse_mmaps(region, info);

            if (ret) {
                region->nr_mmaps = 1;
                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                region->mmaps[0].offset = 0;
                region->mmaps[0].size = region->size;
            }
        }
    }

    g_free(info);

    trace_vfio_region_setup(vbasedev->name, index, name,
                            region->flags, region->fd_offset, region->size);
    return 0;
}

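/* Remove and munmap() a single mmap()ed sub-range of @region. */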
static void vfio_subregion_unmap(VFIORegion *region, int index)
{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
    region->mmaps[index].mmap = NULL;
}

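/*
 * mmap() every recorded sub-range of @region with protection derived
 * from the region's READ/WRITE flags and expose each mapping to the
 * guest as a RAM device MemoryRegion.  On failure, already-established
 * mappings are rolled back and -errno is returned.
 */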
int vfio_region_mmap(VFIORegion *region)
{
    int i, prot = 0;
    char *name;

    if (!region->mem) {
        return 0;
    }

    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;

    for (i = 0; i < region->nr_mmaps; i++) {
        region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
                                     MAP_SHARED, region->vbasedev->fd,
                                     region->fd_offset +
                                     region->mmaps[i].offset);
        if (region->mmaps[i].mmap == MAP_FAILED) {
            int ret = -errno;

            trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
                                         region->fd_offset +
                                         region->mmaps[i].offset,
                                         region->fd_offset +
                                         region->mmaps[i].offset +
                                         region->mmaps[i].size - 1, ret);

            region->mmaps[i].mmap = NULL;

            for (i--; i >= 0; i--) {
                vfio_subregion_unmap(region, i);
            }

            return ret;
        }

        name = g_strdup_printf("%s mmaps[%d]",
                               memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
    }

    return 0;
}

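/* Tear down all currently established mmap()s of @region. */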
void vfio_region_unmap(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            vfio_subregion_unmap(region, i);
        }
    }
}

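/*
 * Detach the mmap MemoryRegions from the address space without
 * unmapping them; the memory itself is released later by
 * vfio_region_finalize().
 */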
void vfio_region_exit(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
        }
    }

    trace_vfio_region_exit(region->vbasedev->name, region->nr);
}

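/*
 * Release everything vfio_region_setup() and vfio_region_mmap()
 * allocated: the mappings, their MemoryRegions, and the region's own
 * MemoryRegion, then reset the structure to a clean state.
 */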
void vfio_region_finalize(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
        }
    }

    object_unparent(OBJECT(region->mem));

    g_free(region->mem);
    g_free(region->mmaps);

    trace_vfio_region_finalize(region->vbasedev->name, region->nr);

    region->mem = NULL;
    region->mmaps = NULL;
    region->nr_mmaps = 0;
    region->size = 0;
    region->flags = 0;
    region->nr = 0;
}

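/*
 * Toggle the mmap MemoryRegions on or off; while disabled, accesses
 * fall through to the slow pread()/pwrite() path of vfio_region_ops.
 */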
void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
        }
    }

    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
                                        enabled);
}

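/*
 * Fetch vfio_region_info for region @index, growing the buffer and
 * retrying whenever the kernel reports an argsz larger than what was
 * supplied (i.e. capability data follows the fixed header).
 */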
int vfio_get_region_info(VFIODevice *vbasedev, int index,
                         struct vfio_region_info **info)
{
    size_t argsz = sizeof(struct vfio_region_info);

    *info = g_malloc0(argsz);

    (*info)->index = index;
retry:
    (*info)->argsz = argsz;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);

        goto retry;
    }

    return 0;
}

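/*
 * Scan all regions of the device for a device-specific region matching
 * @type/@subtype, as identified by the VFIO_REGION_INFO_CAP_TYPE
 * capability.  On success, *info is owned by the caller.
 */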
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
                             uint32_t subtype, struct vfio_region_info **info)
{
    int i;

    for (i = 0; i < vbasedev->num_regions; i++) {
        struct vfio_info_cap_header *hdr;
        struct vfio_region_info_cap_type *cap_type;

        if (vfio_get_region_info(vbasedev, i, info)) {
            continue;
        }

        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
        if (!hdr) {
            g_free(*info);
            continue;
        }

        cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);

        trace_vfio_get_dev_region(vbasedev->name, i,
                                  cap_type->type, cap_type->subtype);

        if (cap_type->type == type && cap_type->subtype == subtype) {
            return 0;
        }

        g_free(*info);
    }

    *info = NULL;
    return -ENODEV;
}

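/* Test whether region @region advertises capability @cap_type. */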
bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
    struct vfio_region_info *info = NULL;
    bool ret = false;

    if (!vfio_get_region_info(vbasedev, region, &info)) {
        if (vfio_get_region_info_cap(info, cap_type)) {
            ret = true;
        }
        g_free(info);
    }

    return ret;
}

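/*
 * Resolve vbasedev->name: derive it from the sysfs path when the device
 * is specified by sysfsdev, or synthesize a "VFIO_FD<n>" placeholder
 * when the device was handed to us as a file descriptor (which is only
 * supported with the iommufd backend).
 */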
int vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
{
    struct stat st;

    if (vbasedev->fd < 0) {
        if (stat(vbasedev->sysfsdev, &st) < 0) {
            error_setg_errno(errp, errno, "no such host device");
            error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
            return -errno;
        }
        /* The user may specify a name, e.g. for a VFIO platform device */
        if (!vbasedev->name) {
            vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
        }
    } else {
        if (!vbasedev->iommufd) {
            error_setg(errp, "Use FD passing only with iommufd backend");
            return -EINVAL;
        }
        /*
         * Synthesize a name from the fd so that any function printing
         * vbasedev->name does not break.
         */
        if (!vbasedev->name) {
            vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
        }
    }

    return 0;
}

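/*
 * Resolve @str through the monitor (an fd name or number previously
 * passed to QEMU) and store the resulting descriptor in vbasedev->fd.
 */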
void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
{
    int fd = monitor_fd_param(monitor_cur(), str, errp);

    if (fd < 0) {
        error_prepend(errp, "Could not parse remote object fd %s: ", str);
        return;
    }
    vbasedev->fd = fd;
}

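/* Common initialization of the base device state. */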
void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
                      DeviceState *dev, bool ram_discard)
{
    vbasedev->type = type;
    vbasedev->ops = ops;
    vbasedev->dev = dev;
    vbasedev->fd = -1;

    vbasedev->ram_block_discard_allowed = ram_discard;
}
666