xref: /qemu/hw/vfio/spapr.c (revision b49f4755)
1 /*
2  * DMA memory preregistration
3  *
4  * Authors:
5  *  Alexey Kardashevskiy <aik@ozlabs.ru>
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2.  See
8  * the COPYING file in the top-level directory.
9  */
10 
11 #include "qemu/osdep.h"
12 #include <sys/ioctl.h>
13 #include <linux/vfio.h>
14 #ifdef CONFIG_KVM
15 #include <linux/kvm.h>
16 #endif
17 #include "sysemu/kvm.h"
18 #include "exec/address-spaces.h"
19 
20 #include "hw/vfio/vfio-common.h"
21 #include "hw/hw.h"
22 #include "exec/ram_addr.h"
23 #include "qemu/error-report.h"
24 #include "qapi/error.h"
25 #include "trace.h"
26 
27 typedef struct VFIOSpaprContainer {
28     VFIOContainer container;
29     MemoryListener prereg_listener;
30     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
31 } VFIOSpaprContainer;
32 
33 static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
34 {
35     if (memory_region_is_iommu(section->mr)) {
36         hw_error("Cannot possibly preregister IOMMU memory");
37     }
38 
39     return !memory_region_is_ram(section->mr) ||
40             memory_region_is_ram_device(section->mr);
41 }
42 
43 static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
44 {
45     return memory_region_get_ram_ptr(section->mr) +
46         section->offset_within_region +
47         (gpa - section->offset_within_address_space);
48 }
49 
50 static void vfio_prereg_listener_region_add(MemoryListener *listener,
51                                             MemoryRegionSection *section)
52 {
53     VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
54                                                   prereg_listener);
55     VFIOContainer *container = &scontainer->container;
56     VFIOContainerBase *bcontainer = &container->bcontainer;
57     const hwaddr gpa = section->offset_within_address_space;
58     hwaddr end;
59     int ret;
60     hwaddr page_mask = qemu_real_host_page_mask();
61     struct vfio_iommu_spapr_register_memory reg = {
62         .argsz = sizeof(reg),
63         .flags = 0,
64     };
65 
66     if (vfio_prereg_listener_skipped_section(section)) {
67         trace_vfio_prereg_listener_region_add_skip(
68                 section->offset_within_address_space,
69                 section->offset_within_address_space +
70                 int128_get64(int128_sub(section->size, int128_one())));
71         return;
72     }
73 
74     if (unlikely((section->offset_within_address_space & ~page_mask) ||
75                  (section->offset_within_region & ~page_mask) ||
76                  (int128_get64(section->size) & ~page_mask))) {
77         error_report("%s received unaligned region", __func__);
78         return;
79     }
80 
81     end = section->offset_within_address_space + int128_get64(section->size);
82     if (gpa >= end) {
83         return;
84     }
85 
86     memory_region_ref(section->mr);
87 
88     reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
89     reg.size = end - gpa;
90 
91     ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
92     trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
93     if (ret) {
94         /*
95          * On the initfn path, store the first error in the container so we
96          * can gracefully fail.  Runtime, there's not much we can do other
97          * than throw a hardware error.
98          */
99         if (!bcontainer->initialized) {
100             if (!bcontainer->error) {
101                 error_setg_errno(&bcontainer->error, -ret,
102                                  "Memory registering failed");
103             }
104         } else {
105             hw_error("vfio: Memory registering failed, unable to continue");
106         }
107     }
108 }
109 
110 static void vfio_prereg_listener_region_del(MemoryListener *listener,
111                                             MemoryRegionSection *section)
112 {
113     VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
114                                                   prereg_listener);
115     VFIOContainer *container = &scontainer->container;
116     const hwaddr gpa = section->offset_within_address_space;
117     hwaddr end;
118     int ret;
119     hwaddr page_mask = qemu_real_host_page_mask();
120     struct vfio_iommu_spapr_register_memory reg = {
121         .argsz = sizeof(reg),
122         .flags = 0,
123     };
124 
125     if (vfio_prereg_listener_skipped_section(section)) {
126         trace_vfio_prereg_listener_region_del_skip(
127                 section->offset_within_address_space,
128                 section->offset_within_address_space +
129                 int128_get64(int128_sub(section->size, int128_one())));
130         return;
131     }
132 
133     if (unlikely((section->offset_within_address_space & ~page_mask) ||
134                  (section->offset_within_region & ~page_mask) ||
135                  (int128_get64(section->size) & ~page_mask))) {
136         error_report("%s received unaligned region", __func__);
137         return;
138     }
139 
140     end = section->offset_within_address_space + int128_get64(section->size);
141     if (gpa >= end) {
142         return;
143     }
144 
145     reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
146     reg.size = end - gpa;
147 
148     ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
149     trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
150 }
151 
152 static const MemoryListener vfio_prereg_listener = {
153     .name = "vfio-pre-reg",
154     .region_add = vfio_prereg_listener_region_add,
155     .region_del = vfio_prereg_listener_region_del,
156 };
157 
158 static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
159                               hwaddr max_iova, uint64_t iova_pgsizes)
160 {
161     VFIOHostDMAWindow *hostwin;
162 
163     QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
164         if (ranges_overlap(hostwin->min_iova,
165                            hostwin->max_iova - hostwin->min_iova + 1,
166                            min_iova,
167                            max_iova - min_iova + 1)) {
168             hw_error("%s: Overlapped IOMMU are not enabled", __func__);
169         }
170     }
171 
172     hostwin = g_malloc0(sizeof(*hostwin));
173 
174     hostwin->min_iova = min_iova;
175     hostwin->max_iova = max_iova;
176     hostwin->iova_pgsizes = iova_pgsizes;
177     QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
178 }
179 
180 static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
181                              hwaddr min_iova, hwaddr max_iova)
182 {
183     VFIOHostDMAWindow *hostwin;
184 
185     QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
186         if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
187             QLIST_REMOVE(hostwin, hostwin_next);
188             g_free(hostwin);
189             return 0;
190         }
191     }
192 
193     return -1;
194 }
195 
196 static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
197                                             hwaddr iova, hwaddr end)
198 {
199     VFIOHostDMAWindow *hostwin;
200     bool hostwin_found = false;
201 
202     QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
203         if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
204             hostwin_found = true;
205             break;
206         }
207     }
208 
209     return hostwin_found ? hostwin : NULL;
210 }
211 
212 static int vfio_spapr_remove_window(VFIOContainer *container,
213                                     hwaddr offset_within_address_space)
214 {
215     struct vfio_iommu_spapr_tce_remove remove = {
216         .argsz = sizeof(remove),
217         .start_addr = offset_within_address_space,
218     };
219     int ret;
220 
221     ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
222     if (ret) {
223         error_report("Failed to remove window at %"PRIx64,
224                      (uint64_t)remove.start_addr);
225         return -errno;
226     }
227 
228     trace_vfio_spapr_remove_window(offset_within_address_space);
229 
230     return 0;
231 }
232 
233 static int vfio_spapr_create_window(VFIOContainer *container,
234                                     MemoryRegionSection *section,
235                                     hwaddr *pgsize)
236 {
237     int ret = 0;
238     VFIOContainerBase *bcontainer = &container->bcontainer;
239     IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
240     uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
241     unsigned entries, bits_total, bits_per_level, max_levels;
242     struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
243     long rampagesize = qemu_minrampagesize();
244 
245     /*
246      * The host might not support the guest supported IOMMU page size,
247      * so we will use smaller physical IOMMU pages to back them.
248      */
249     if (pagesize > rampagesize) {
250         pagesize = rampagesize;
251     }
252     pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
253     pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
254     if (!pagesize) {
255         error_report("Host doesn't support page size 0x%"PRIx64
256                      ", the supported mask is 0x%lx",
257                      memory_region_iommu_get_min_page_size(iommu_mr),
258                      bcontainer->pgsizes);
259         return -EINVAL;
260     }
261 
262     /*
263      * FIXME: For VFIO iommu types which have KVM acceleration to
264      * avoid bouncing all map/unmaps through qemu this way, this
265      * would be the right place to wire that up (tell the KVM
266      * device emulation the VFIO iommu handles to use).
267      */
268     create.window_size = int128_get64(section->size);
269     create.page_shift = ctz64(pagesize);
270     /*
271      * SPAPR host supports multilevel TCE tables. We try to guess optimal
272      * levels number and if this fails (for example due to the host memory
273      * fragmentation), we increase levels. The DMA address structure is:
274      * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
275      * where:
276      *   r = reserved (bits >= 55 are reserved in the existing hardware)
277      *   i = IOMMU page offset (64K in this example)
278      *   x = bits to index a TCE which can be split to equal chunks to index
279      *      within the level.
280      * The aim is to split "x" to smaller possible number of levels.
281      */
282     entries = create.window_size >> create.page_shift;
283     /* bits_total is number of "x" needed */
284     bits_total = ctz64(entries * sizeof(uint64_t));
285     /*
286      * bits_per_level is a safe guess of how much we can allocate per level:
287      * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
288      * is usually bigger than that.
289      * Below we look at qemu_real_host_page_size as TCEs are allocated from
290      * system pages.
291      */
292     bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
293     create.levels = bits_total / bits_per_level;
294     if (bits_total % bits_per_level) {
295         ++create.levels;
296     }
297     max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
298     for ( ; create.levels <= max_levels; ++create.levels) {
299         ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
300         if (!ret) {
301             break;
302         }
303     }
304     if (ret) {
305         error_report("Failed to create a window, ret = %d (%m)", ret);
306         return -errno;
307     }
308 
309     if (create.start_addr != section->offset_within_address_space) {
310         vfio_spapr_remove_window(container, create.start_addr);
311 
312         error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
313                      section->offset_within_address_space,
314                      (uint64_t)create.start_addr);
315         return -EINVAL;
316     }
317     trace_vfio_spapr_create_window(create.page_shift,
318                                    create.levels,
319                                    create.window_size,
320                                    create.start_addr);
321     *pgsize = pagesize;
322 
323     return 0;
324 }
325 
326 static int
327 vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
328                                         MemoryRegionSection *section,
329                                         Error **errp)
330 {
331     VFIOContainer *container = container_of(bcontainer, VFIOContainer,
332                                             bcontainer);
333     VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
334                                                   container);
335     VFIOHostDMAWindow *hostwin;
336     hwaddr pgsize = 0;
337     int ret;
338 
339     /*
340      * VFIO_SPAPR_TCE_IOMMU supports a single host window between
341      * [dma32_window_start, dma32_window_size), we need to ensure
342      * the section fall in this range.
343      */
344     if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
345         hwaddr iova, end;
346 
347         iova = section->offset_within_address_space;
348         end = iova + int128_get64(section->size) - 1;
349 
350         if (!vfio_find_hostwin(scontainer, iova, end)) {
351             error_setg(errp, "Container %p can't map guest IOVA region"
352                        " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
353                        iova, end);
354             return -EINVAL;
355         }
356         return 0;
357     }
358 
359     if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
360         return 0;
361     }
362 
363     /* For now intersections are not allowed, we may relax this later */
364     QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
365         if (ranges_overlap(hostwin->min_iova,
366                            hostwin->max_iova - hostwin->min_iova + 1,
367                            section->offset_within_address_space,
368                            int128_get64(section->size))) {
369             error_setg(errp,
370                 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
371                 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
372                 section->offset_within_address_space,
373                 section->offset_within_address_space +
374                     int128_get64(section->size) - 1,
375                 hostwin->min_iova, hostwin->max_iova);
376             return -EINVAL;
377         }
378     }
379 
380     ret = vfio_spapr_create_window(container, section, &pgsize);
381     if (ret) {
382         error_setg_errno(errp, -ret, "Failed to create SPAPR window");
383         return ret;
384     }
385 
386     vfio_host_win_add(scontainer, section->offset_within_address_space,
387                       section->offset_within_address_space +
388                       int128_get64(section->size) - 1, pgsize);
389 #ifdef CONFIG_KVM
390     if (kvm_enabled()) {
391         VFIOGroup *group;
392         IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
393         struct kvm_vfio_spapr_tce param;
394         struct kvm_device_attr attr = {
395             .group = KVM_DEV_VFIO_GROUP,
396             .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
397             .addr = (uint64_t)(unsigned long)&param,
398         };
399 
400         if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
401                                           &param.tablefd)) {
402             QLIST_FOREACH(group, &container->group_list, container_next) {
403                 param.groupfd = group->fd;
404                 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
405                     error_setg_errno(errp, errno,
406                                      "vfio: failed GROUP_SET_SPAPR_TCE for "
407                                      "KVM VFIO device %d and group fd %d",
408                                      param.tablefd, param.groupfd);
409                     return -errno;
410                 }
411                 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
412             }
413         }
414     }
415 #endif
416     return 0;
417 }
418 
419 static void
420 vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
421                                         MemoryRegionSection *section)
422 {
423     VFIOContainer *container = container_of(bcontainer, VFIOContainer,
424                                             bcontainer);
425     VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
426                                                   container);
427 
428     if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
429         return;
430     }
431 
432     vfio_spapr_remove_window(container,
433                              section->offset_within_address_space);
434     if (vfio_host_win_del(scontainer,
435                           section->offset_within_address_space,
436                           section->offset_within_address_space +
437                           int128_get64(section->size) - 1) < 0) {
438         hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
439                  __func__, section->offset_within_address_space);
440     }
441 }
442 
443 static VFIOIOMMUOps vfio_iommu_spapr_ops;
444 
445 static void setup_spapr_ops(VFIOContainerBase *bcontainer)
446 {
447     vfio_iommu_spapr_ops = *bcontainer->ops;
448     vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window;
449     vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window;
450     bcontainer->ops = &vfio_iommu_spapr_ops;
451 }
452 
453 int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
454 {
455     VFIOContainerBase *bcontainer = &container->bcontainer;
456     VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
457                                                   container);
458     struct vfio_iommu_spapr_tce_info info;
459     bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
460     int ret, fd = container->fd;
461 
462     QLIST_INIT(&scontainer->hostwin_list);
463 
464     /*
465      * The host kernel code implementing VFIO_IOMMU_DISABLE is called
466      * when container fd is closed so we do not call it explicitly
467      * in this file.
468      */
469     if (!v2) {
470         ret = ioctl(fd, VFIO_IOMMU_ENABLE);
471         if (ret) {
472             error_setg_errno(errp, errno, "failed to enable container");
473             return -errno;
474         }
475     } else {
476         scontainer->prereg_listener = vfio_prereg_listener;
477 
478         memory_listener_register(&scontainer->prereg_listener,
479                                  &address_space_memory);
480         if (bcontainer->error) {
481             ret = -1;
482             error_propagate_prepend(errp, bcontainer->error,
483                     "RAM memory listener initialization failed: ");
484             goto listener_unregister_exit;
485         }
486     }
487 
488     info.argsz = sizeof(info);
489     ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
490     if (ret) {
491         error_setg_errno(errp, errno,
492                          "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
493         ret = -errno;
494         goto listener_unregister_exit;
495     }
496 
497     if (v2) {
498         bcontainer->pgsizes = info.ddw.pgsizes;
499         /*
500          * There is a default window in just created container.
501          * To make region_add/del simpler, we better remove this
502          * window now and let those iommu_listener callbacks
503          * create/remove them when needed.
504          */
505         ret = vfio_spapr_remove_window(container, info.dma32_window_start);
506         if (ret) {
507             error_setg_errno(errp, -ret,
508                              "failed to remove existing window");
509             goto listener_unregister_exit;
510         }
511     } else {
512         /* The default table uses 4K pages */
513         bcontainer->pgsizes = 0x1000;
514         vfio_host_win_add(scontainer, info.dma32_window_start,
515                           info.dma32_window_start +
516                           info.dma32_window_size - 1,
517                           0x1000);
518     }
519 
520     setup_spapr_ops(bcontainer);
521 
522     return 0;
523 
524 listener_unregister_exit:
525     if (v2) {
526         memory_listener_unregister(&scontainer->prereg_listener);
527     }
528     return ret;
529 }
530 
531 void vfio_spapr_container_deinit(VFIOContainer *container)
532 {
533     VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
534                                                   container);
535     VFIOHostDMAWindow *hostwin, *next;
536 
537     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
538         memory_listener_unregister(&scontainer->prereg_listener);
539     }
540     QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
541                        next) {
542         QLIST_REMOVE(hostwin, hostwin_next);
543         g_free(hostwin);
544     }
545 }
546