xref: /qemu/block/blkio.c (revision 78f314cf)
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 /*
3  * libblkio BlockDriver
4  *
5  * Copyright Red Hat, Inc.
6  *
7  * Author:
8  *   Stefan Hajnoczi <stefanha@redhat.com>
9  */
10 
11 #include "qemu/osdep.h"
12 #include <blkio.h>
13 #include "block/block_int.h"
14 #include "exec/memory.h"
15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
16 #include "qapi/error.h"
17 #include "qemu/error-report.h"
18 #include "qapi/qmp/qdict.h"
19 #include "qemu/module.h"
20 #include "exec/memory.h" /* for ram_block_discard_disable() */
21 
22 #include "block/block-io.h"
23 
/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 *
 * Each name is used twice in this file: as the format/protocol name of a
 * registered BlockDriver (see BLKIO_DRIVER()) and, through
 * bs->drv->protocol_name, as the driver name handed to blkio_create().
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"
33 
/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 *
 * Each BlkioBounceBuf describes one slice carved out of
 * BDRVBlkioState.bounce_pool by blkio_do_alloc_bounce_buffer().
 */
typedef struct BlkioBounceBuf {
    /* Linkage in BDRVBlkioState.bounce_bufs */
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer: start address and length inside the pool */
    struct iovec buf;
} BlkioBounceBuf;
43 
44 typedef struct {
45     /*
46      * libblkio is not thread-safe so this lock protects ->blkio and
47      * ->blkioq.
48      */
49     QemuMutex blkio_lock;
50     struct blkio *blkio;
51     struct blkioq *blkioq; /* make this multi-queue in the future... */
52     int completion_fd;
53 
54     /*
55      * Polling fetches the next completion into this field.
56      *
57      * No lock is necessary since only one thread calls aio_poll() and invokes
58      * fd and poll handlers.
59      */
60     struct blkio_completion poll_completion;
61 
62     /*
63      * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
64      *
65      * Lock ordering: ->bounce_lock before ->blkio_lock.
66      */
67     CoMutex bounce_lock;
68 
69     /* Bounce buffer pool */
70     struct blkio_mem_region bounce_pool;
71 
72     /* Sorted list of allocated bounce buffers */
73     QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;
74 
75     /* Queue for coroutines waiting for bounce buffer space */
76     CoQueue bounce_available;
77 
78     /* The value of the "mem-region-alignment" property */
79     size_t mem_region_alignment;
80 
81     /* Can we skip adding/deleting blkio_mem_regions? */
82     bool needs_mem_regions;
83 
84     /* Are file descriptors necessary for blkio_mem_regions? */
85     bool needs_mem_region_fd;
86 
87     /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
88     bool may_pin_mem_regions;
89 } BDRVBlkioState;
90 
91 /* Called with s->bounce_lock held */
92 static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
93 {
94     /* There can be no allocated bounce buffers during resize */
95     assert(QLIST_EMPTY(&s->bounce_bufs));
96 
97     /* Pad size to reduce frequency of resize calls */
98     bytes += 128 * 1024;
99 
100     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
101         int ret;
102 
103         if (s->bounce_pool.addr) {
104             blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
105             blkio_free_mem_region(s->blkio, &s->bounce_pool);
106             memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
107         }
108 
109         /* Automatically freed when s->blkio is destroyed */
110         ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
111         if (ret < 0) {
112             return ret;
113         }
114 
115         ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
116         if (ret < 0) {
117             blkio_free_mem_region(s->blkio, &s->bounce_pool);
118             memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
119             return ret;
120         }
121     }
122 
123     return 0;
124 }
125 
126 /* Called with s->bounce_lock held */
127 static bool
128 blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
129                              int64_t bytes)
130 {
131     void *addr = s->bounce_pool.addr;
132     BlkioBounceBuf *cur = NULL;
133     BlkioBounceBuf *prev = NULL;
134     ptrdiff_t space;
135 
136     /*
137      * This is just a linear search over the holes between requests. An
138      * efficient allocator would be nice.
139      */
140     QLIST_FOREACH(cur, &s->bounce_bufs, next) {
141         space = cur->buf.iov_base - addr;
142         if (bytes <= space) {
143             QLIST_INSERT_BEFORE(cur, bounce, next);
144             bounce->buf.iov_base = addr;
145             bounce->buf.iov_len = bytes;
146             return true;
147         }
148 
149         addr = cur->buf.iov_base + cur->buf.iov_len;
150         prev = cur;
151     }
152 
153     /* Is there space after the last request? */
154     space = s->bounce_pool.addr + s->bounce_pool.len - addr;
155     if (bytes > space) {
156         return false;
157     }
158     if (prev) {
159         QLIST_INSERT_AFTER(prev, bounce, next);
160     } else {
161         QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
162     }
163     bounce->buf.iov_base = addr;
164     bounce->buf.iov_len = bytes;
165     return true;
166 }
167 
168 static int coroutine_fn
169 blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
170                           int64_t bytes)
171 {
172     /*
173      * Ensure fairness: first time around we join the back of the queue,
174      * subsequently we join the front so we don't lose our place.
175      */
176     CoQueueWaitFlags wait_flags = 0;
177 
178     QEMU_LOCK_GUARD(&s->bounce_lock);
179 
180     /* Ensure fairness: don't even try if other requests are already waiting */
181     if (!qemu_co_queue_empty(&s->bounce_available)) {
182         qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
183                                  wait_flags);
184         wait_flags = CO_QUEUE_WAIT_FRONT;
185     }
186 
187     while (true) {
188         if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
189             /* Kick the next queued request since there may be space */
190             qemu_co_queue_next(&s->bounce_available);
191             return 0;
192         }
193 
194         /*
195          * If there are no in-flight requests then the pool was simply too
196          * small.
197          */
198         if (QLIST_EMPTY(&s->bounce_bufs)) {
199             bool ok;
200             int ret;
201 
202             ret = blkio_resize_bounce_pool(s, bytes);
203             if (ret < 0) {
204                 /* Kick the next queued request since that may fail too */
205                 qemu_co_queue_next(&s->bounce_available);
206                 return ret;
207             }
208 
209             ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
210             assert(ok); /* must have space this time */
211             return 0;
212         }
213 
214         qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
215                                  wait_flags);
216         wait_flags = CO_QUEUE_WAIT_FRONT;
217     }
218 }
219 
220 static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
221                                                   BlkioBounceBuf *bounce)
222 {
223     QEMU_LOCK_GUARD(&s->bounce_lock);
224 
225     QLIST_REMOVE(bounce, next);
226 
227     /* Wake up waiting coroutines since space may now be available */
228     qemu_co_queue_next(&s->bounce_available);
229 }
230 
/*
 * For async to .bdrv_co_*() conversion
 *
 * A pointer to one of these is passed to libblkio as the request's user_data.
 * The completion handler stores the request result in ->ret and wakes
 * ->coroutine.
 */
typedef struct {
    Coroutine *coroutine; /* the coroutine that submitted the request */
    int ret;              /* copied from struct blkio_completion.ret */
} BlkioCoData;
236 
237 static void blkio_completion_fd_read(void *opaque)
238 {
239     BlockDriverState *bs = opaque;
240     BDRVBlkioState *s = bs->opaque;
241     uint64_t val;
242     int ret;
243 
244     /* Polling may have already fetched a completion */
245     if (s->poll_completion.user_data != NULL) {
246         BlkioCoData *cod = s->poll_completion.user_data;
247         cod->ret = s->poll_completion.ret;
248 
249         /* Clear it in case aio_co_wake() enters a nested event loop */
250         s->poll_completion.user_data = NULL;
251 
252         aio_co_wake(cod->coroutine);
253     }
254 
255     /* Reset completion fd status */
256     ret = read(s->completion_fd, &val, sizeof(val));
257 
258     /* Ignore errors, there's nothing we can do */
259     (void)ret;
260 
261     /*
262      * Reading one completion at a time makes nested event loop re-entrancy
263      * simple. Change this loop to get multiple completions in one go if it
264      * becomes a performance bottleneck.
265      */
266     while (true) {
267         struct blkio_completion completion;
268 
269         WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
270             ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
271         }
272         if (ret != 1) {
273             break;
274         }
275 
276         BlkioCoData *cod = completion.user_data;
277         cod->ret = completion.ret;
278         aio_co_wake(cod->coroutine);
279     }
280 }
281 
282 static bool blkio_completion_fd_poll(void *opaque)
283 {
284     BlockDriverState *bs = opaque;
285     BDRVBlkioState *s = bs->opaque;
286     int ret;
287 
288     /* Just in case we already fetched a completion */
289     if (s->poll_completion.user_data != NULL) {
290         return true;
291     }
292 
293     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
294         ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
295     }
296     return ret == 1;
297 }
298 
/*
 * Poll-ready handler registered in blkio_attach_aio_context(). It simply
 * runs the regular fd handler, which also delivers the completion stashed in
 * s->poll_completion by blkio_completion_fd_poll().
 */
static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}
303 
304 static void blkio_attach_aio_context(BlockDriverState *bs,
305                                      AioContext *new_context)
306 {
307     BDRVBlkioState *s = bs->opaque;
308 
309     aio_set_fd_handler(new_context, s->completion_fd,
310                        blkio_completion_fd_read, NULL,
311                        blkio_completion_fd_poll,
312                        blkio_completion_fd_poll_ready, bs);
313 }
314 
315 static void blkio_detach_aio_context(BlockDriverState *bs)
316 {
317     BDRVBlkioState *s = bs->opaque;
318 
319     aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
320                        NULL, NULL, NULL);
321 }
322 
323 /* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
324 static void blkio_submit_io(BlockDriverState *bs)
325 {
326     if (qatomic_read(&bs->io_plugged) == 0) {
327         BDRVBlkioState *s = bs->opaque;
328 
329         blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
330     }
331 }
332 
333 static int coroutine_fn
334 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
335 {
336     BDRVBlkioState *s = bs->opaque;
337     BlkioCoData cod = {
338         .coroutine = qemu_coroutine_self(),
339     };
340 
341     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
342         blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
343         blkio_submit_io(bs);
344     }
345 
346     qemu_coroutine_yield();
347     return cod.ret;
348 }
349 
350 static int coroutine_fn
351 blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
352                 QEMUIOVector *qiov, BdrvRequestFlags flags)
353 {
354     BlkioCoData cod = {
355         .coroutine = qemu_coroutine_self(),
356     };
357     BDRVBlkioState *s = bs->opaque;
358     bool use_bounce_buffer =
359         s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
360     BlkioBounceBuf bounce;
361     struct iovec *iov = qiov->iov;
362     int iovcnt = qiov->niov;
363 
364     if (use_bounce_buffer) {
365         int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
366         if (ret < 0) {
367             return ret;
368         }
369 
370         iov = &bounce.buf;
371         iovcnt = 1;
372     }
373 
374     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
375         blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
376         blkio_submit_io(bs);
377     }
378 
379     qemu_coroutine_yield();
380 
381     if (use_bounce_buffer) {
382         if (cod.ret == 0) {
383             qemu_iovec_from_buf(qiov, 0,
384                                 bounce.buf.iov_base,
385                                 bounce.buf.iov_len);
386         }
387 
388         blkio_free_bounce_buffer(s, &bounce);
389     }
390 
391     return cod.ret;
392 }
393 
394 static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
395         int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
396 {
397     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
398     BlkioCoData cod = {
399         .coroutine = qemu_coroutine_self(),
400     };
401     BDRVBlkioState *s = bs->opaque;
402     bool use_bounce_buffer =
403         s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
404     BlkioBounceBuf bounce;
405     struct iovec *iov = qiov->iov;
406     int iovcnt = qiov->niov;
407 
408     if (use_bounce_buffer) {
409         int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
410         if (ret < 0) {
411             return ret;
412         }
413 
414         qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
415         iov = &bounce.buf;
416         iovcnt = 1;
417     }
418 
419     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
420         blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
421         blkio_submit_io(bs);
422     }
423 
424     qemu_coroutine_yield();
425 
426     if (use_bounce_buffer) {
427         blkio_free_bounce_buffer(s, &bounce);
428     }
429 
430     return cod.ret;
431 }
432 
433 static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
434 {
435     BDRVBlkioState *s = bs->opaque;
436     BlkioCoData cod = {
437         .coroutine = qemu_coroutine_self(),
438     };
439 
440     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
441         blkioq_flush(s->blkioq, &cod, 0);
442         blkio_submit_io(bs);
443     }
444 
445     qemu_coroutine_yield();
446     return cod.ret;
447 }
448 
449 static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
450     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
451 {
452     BDRVBlkioState *s = bs->opaque;
453     BlkioCoData cod = {
454         .coroutine = qemu_coroutine_self(),
455     };
456     uint32_t blkio_flags = 0;
457 
458     if (flags & BDRV_REQ_FUA) {
459         blkio_flags |= BLKIO_REQ_FUA;
460     }
461     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
462         blkio_flags |= BLKIO_REQ_NO_UNMAP;
463     }
464     if (flags & BDRV_REQ_NO_FALLBACK) {
465         blkio_flags |= BLKIO_REQ_NO_FALLBACK;
466     }
467 
468     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
469         blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
470         blkio_submit_io(bs);
471     }
472 
473     qemu_coroutine_yield();
474     return cod.ret;
475 }
476 
477 static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
478 {
479     BDRVBlkioState *s = bs->opaque;
480 
481     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
482         blkio_submit_io(bs);
483     }
484 }
485 
/* Outcome of blkio_mem_region_from_host() */
typedef enum {
    BMRR_OK,   /* *region was filled in */
    BMRR_SKIP, /* no fd backs the memory; the buffer is silently ignored */
    BMRR_FAIL, /* invalid buffer; an error was reported through errp */
} BlkioMemRegionResult;
491 
492 /*
493  * Produce a struct blkio_mem_region for a given address and size.
494  *
495  * This function produces identical results when called multiple times with the
496  * same arguments. This property is necessary because blkio_unmap_mem_region()
497  * must receive the same struct blkio_mem_region field values that were passed
498  * to blkio_map_mem_region().
499  */
500 static BlkioMemRegionResult
501 blkio_mem_region_from_host(BlockDriverState *bs,
502                            void *host, size_t size,
503                            struct blkio_mem_region *region,
504                            Error **errp)
505 {
506     BDRVBlkioState *s = bs->opaque;
507     int fd = -1;
508     ram_addr_t fd_offset = 0;
509 
510     if (((uintptr_t)host | size) % s->mem_region_alignment) {
511         error_setg(errp, "unaligned buf %p with size %zu", host, size);
512         return BMRR_FAIL;
513     }
514 
515     /* Attempt to find the fd for the underlying memory */
516     if (s->needs_mem_region_fd) {
517         RAMBlock *ram_block;
518         RAMBlock *end_block;
519         ram_addr_t offset;
520 
521         /*
522          * bdrv_register_buf() is called with the BQL held so mr lives at least
523          * until this function returns.
524          */
525         ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
526         if (ram_block) {
527             fd = qemu_ram_get_fd(ram_block);
528         }
529         if (fd == -1) {
530             /*
531              * Ideally every RAMBlock would have an fd. pc-bios and other
532              * things don't. Luckily they are usually not I/O buffers and we
533              * can just ignore them.
534              */
535             return BMRR_SKIP;
536         }
537 
538         /* Make sure the fd covers the entire range */
539         end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
540         if (ram_block != end_block) {
541             error_setg(errp, "registered buffer at %p with size %zu extends "
542                        "beyond RAMBlock", host, size);
543             return BMRR_FAIL;
544         }
545     }
546 
547     *region = (struct blkio_mem_region){
548         .addr = host,
549         .len = size,
550         .fd = fd,
551         .fd_offset = fd_offset,
552     };
553     return BMRR_OK;
554 }
555 
556 static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
557                                Error **errp)
558 {
559     BDRVBlkioState *s = bs->opaque;
560     struct blkio_mem_region region;
561     BlkioMemRegionResult region_result;
562     int ret;
563 
564     /*
565      * Mapping memory regions conflicts with RAM discard (virtio-mem) when
566      * there is pinning, so only do it when necessary.
567      */
568     if (!s->needs_mem_regions && s->may_pin_mem_regions) {
569         return true;
570     }
571 
572     region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
573     if (region_result == BMRR_SKIP) {
574         return true;
575     } else if (region_result != BMRR_OK) {
576         return false;
577     }
578 
579     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
580         ret = blkio_map_mem_region(s->blkio, &region);
581     }
582 
583     if (ret < 0) {
584         error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
585                    host, size, blkio_get_error_msg());
586         return false;
587     }
588     return true;
589 }
590 
591 static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
592 {
593     BDRVBlkioState *s = bs->opaque;
594     struct blkio_mem_region region;
595 
596     /* See blkio_register_buf() */
597     if (!s->needs_mem_regions && s->may_pin_mem_regions) {
598         return;
599     }
600 
601     if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
602         return;
603     }
604 
605     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
606         blkio_unmap_mem_region(s->blkio, &region);
607     }
608 }
609 
610 static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
611                                Error **errp)
612 {
613     const char *filename = qdict_get_str(options, "filename");
614     BDRVBlkioState *s = bs->opaque;
615     int ret;
616 
617     ret = blkio_set_str(s->blkio, "path", filename);
618     qdict_del(options, "filename");
619     if (ret < 0) {
620         error_setg_errno(errp, -ret, "failed to set path: %s",
621                          blkio_get_error_msg());
622         return ret;
623     }
624 
625     if (flags & BDRV_O_NOCACHE) {
626         ret = blkio_set_bool(s->blkio, "direct", true);
627         if (ret < 0) {
628             error_setg_errno(errp, -ret, "failed to set direct: %s",
629                              blkio_get_error_msg());
630             return ret;
631         }
632     }
633 
634     return 0;
635 }
636 
637 static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
638                                Error **errp)
639 {
640     const char *path = qdict_get_try_str(options, "path");
641     BDRVBlkioState *s = bs->opaque;
642     int ret;
643 
644     if (!path) {
645         error_setg(errp, "missing 'path' option");
646         return -EINVAL;
647     }
648 
649     ret = blkio_set_str(s->blkio, "path", path);
650     qdict_del(options, "path");
651     if (ret < 0) {
652         error_setg_errno(errp, -ret, "failed to set path: %s",
653                          blkio_get_error_msg());
654         return ret;
655     }
656 
657     if (!(flags & BDRV_O_NOCACHE)) {
658         error_setg(errp, "cache.direct=off is not supported");
659         return -EINVAL;
660     }
661 
662     return 0;
663 }
664 
665 static int blkio_virtio_blk_common_open(BlockDriverState *bs,
666         QDict *options, int flags, Error **errp)
667 {
668     const char *path = qdict_get_try_str(options, "path");
669     BDRVBlkioState *s = bs->opaque;
670     int ret;
671 
672     if (!path) {
673         error_setg(errp, "missing 'path' option");
674         return -EINVAL;
675     }
676 
677     ret = blkio_set_str(s->blkio, "path", path);
678     qdict_del(options, "path");
679     if (ret < 0) {
680         error_setg_errno(errp, -ret, "failed to set path: %s",
681                          blkio_get_error_msg());
682         return ret;
683     }
684 
685     if (!(flags & BDRV_O_NOCACHE)) {
686         error_setg(errp, "cache.direct=off is not supported");
687         return -EINVAL;
688     }
689     return 0;
690 }
691 
692 static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
693                            Error **errp)
694 {
695     const char *blkio_driver = bs->drv->protocol_name;
696     BDRVBlkioState *s = bs->opaque;
697     int ret;
698 
699     ret = blkio_create(blkio_driver, &s->blkio);
700     if (ret < 0) {
701         error_setg_errno(errp, -ret, "blkio_create failed: %s",
702                          blkio_get_error_msg());
703         return ret;
704     }
705 
706     if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
707         ret = blkio_io_uring_open(bs, options, flags, errp);
708     } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
709         ret = blkio_nvme_io_uring(bs, options, flags, errp);
710     } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
711         ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
712     } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
713         ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
714     } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
715         ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
716     } else {
717         g_assert_not_reached();
718     }
719     if (ret < 0) {
720         blkio_destroy(&s->blkio);
721         return ret;
722     }
723 
724     if (!(flags & BDRV_O_RDWR)) {
725         ret = blkio_set_bool(s->blkio, "read-only", true);
726         if (ret < 0) {
727             error_setg_errno(errp, -ret, "failed to set read-only: %s",
728                              blkio_get_error_msg());
729             blkio_destroy(&s->blkio);
730             return ret;
731         }
732     }
733 
734     ret = blkio_connect(s->blkio);
735     if (ret < 0) {
736         error_setg_errno(errp, -ret, "blkio_connect failed: %s",
737                          blkio_get_error_msg());
738         blkio_destroy(&s->blkio);
739         return ret;
740     }
741 
742     ret = blkio_get_bool(s->blkio,
743                          "needs-mem-regions",
744                          &s->needs_mem_regions);
745     if (ret < 0) {
746         error_setg_errno(errp, -ret,
747                          "failed to get needs-mem-regions: %s",
748                          blkio_get_error_msg());
749         blkio_destroy(&s->blkio);
750         return ret;
751     }
752 
753     ret = blkio_get_bool(s->blkio,
754                          "needs-mem-region-fd",
755                          &s->needs_mem_region_fd);
756     if (ret < 0) {
757         error_setg_errno(errp, -ret,
758                          "failed to get needs-mem-region-fd: %s",
759                          blkio_get_error_msg());
760         blkio_destroy(&s->blkio);
761         return ret;
762     }
763 
764     ret = blkio_get_uint64(s->blkio,
765                            "mem-region-alignment",
766                            &s->mem_region_alignment);
767     if (ret < 0) {
768         error_setg_errno(errp, -ret,
769                          "failed to get mem-region-alignment: %s",
770                          blkio_get_error_msg());
771         blkio_destroy(&s->blkio);
772         return ret;
773     }
774 
775     ret = blkio_get_bool(s->blkio,
776                          "may-pin-mem-regions",
777                          &s->may_pin_mem_regions);
778     if (ret < 0) {
779         /* Be conservative (assume pinning) if the property is not supported */
780         s->may_pin_mem_regions = s->needs_mem_regions;
781     }
782 
783     /*
784      * Notify if libblkio drivers pin memory and prevent features like
785      * virtio-mem from working.
786      */
787     if (s->may_pin_mem_regions) {
788         ret = ram_block_discard_disable(true);
789         if (ret < 0) {
790             error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
791             blkio_destroy(&s->blkio);
792             return ret;
793         }
794     }
795 
796     ret = blkio_start(s->blkio);
797     if (ret < 0) {
798         error_setg_errno(errp, -ret, "blkio_start failed: %s",
799                          blkio_get_error_msg());
800         blkio_destroy(&s->blkio);
801         if (s->may_pin_mem_regions) {
802             ram_block_discard_disable(false);
803         }
804         return ret;
805     }
806 
807     bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
808     bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
809                                BDRV_REQ_NO_FALLBACK;
810 
811     qemu_mutex_init(&s->blkio_lock);
812     qemu_co_mutex_init(&s->bounce_lock);
813     qemu_co_queue_init(&s->bounce_available);
814     QLIST_INIT(&s->bounce_bufs);
815     s->blkioq = blkio_get_queue(s->blkio, 0);
816     s->completion_fd = blkioq_get_completion_fd(s->blkioq);
817 
818     blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
819     return 0;
820 }
821 
822 static void blkio_close(BlockDriverState *bs)
823 {
824     BDRVBlkioState *s = bs->opaque;
825 
826     /* There is no destroy() API for s->bounce_lock */
827 
828     qemu_mutex_destroy(&s->blkio_lock);
829     blkio_detach_aio_context(bs);
830     blkio_destroy(&s->blkio);
831 
832     if (s->may_pin_mem_regions) {
833         ram_block_discard_disable(false);
834     }
835 }
836 
837 static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
838 {
839     BDRVBlkioState *s = bs->opaque;
840     uint64_t capacity;
841     int ret;
842 
843     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
844         ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
845     }
846     if (ret < 0) {
847         return -ret;
848     }
849 
850     return capacity;
851 }
852 
853 static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
854                                        bool exact, PreallocMode prealloc,
855                                        BdrvRequestFlags flags, Error **errp)
856 {
857     int64_t current_length;
858 
859     if (prealloc != PREALLOC_MODE_OFF) {
860         error_setg(errp, "Unsupported preallocation mode '%s'",
861                    PreallocMode_str(prealloc));
862         return -ENOTSUP;
863     }
864 
865     current_length = blkio_co_getlength(bs);
866 
867     if (offset > current_length) {
868         error_setg(errp, "Cannot grow device");
869         return -EINVAL;
870     } else if (exact && offset != current_length) {
871         error_setg(errp, "Cannot resize device");
872         return -ENOTSUP;
873     }
874 
875     return 0;
876 }
877 
/*
 * Get-info callback. Nothing is written to @bdi; the function only reports
 * success.
 */
static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}
883 
884 static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
885 {
886     BDRVBlkioState *s = bs->opaque;
887     QEMU_LOCK_GUARD(&s->blkio_lock);
888     int value;
889     int ret;
890 
891     ret = blkio_get_int(s->blkio, "request-alignment", &value);
892     if (ret < 0) {
893         error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
894                          blkio_get_error_msg());
895         return;
896     }
897     bs->bl.request_alignment = value;
898     if (bs->bl.request_alignment < 1 ||
899         bs->bl.request_alignment >= INT_MAX ||
900         !is_power_of_2(bs->bl.request_alignment)) {
901         error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
902                    "must be a power of 2 less than INT_MAX",
903                    bs->bl.request_alignment);
904         return;
905     }
906 
907     ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
908     if (ret < 0) {
909         error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
910                          blkio_get_error_msg());
911         return;
912     }
913     bs->bl.opt_transfer = value;
914     if (bs->bl.opt_transfer > INT_MAX ||
915         (bs->bl.opt_transfer % bs->bl.request_alignment)) {
916         error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
917                    "be a multiple of %" PRIu32, bs->bl.opt_transfer,
918                    bs->bl.request_alignment);
919         return;
920     }
921 
922     ret = blkio_get_int(s->blkio, "max-transfer", &value);
923     if (ret < 0) {
924         error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
925                          blkio_get_error_msg());
926         return;
927     }
928     bs->bl.max_transfer = value;
929     if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
930         (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
931         error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
932                    "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
933                    bs->bl.max_transfer, bs->bl.request_alignment,
934                    bs->bl.opt_transfer);
935         return;
936     }
937 
938     ret = blkio_get_int(s->blkio, "buf-alignment", &value);
939     if (ret < 0) {
940         error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
941                          blkio_get_error_msg());
942         return;
943     }
944     if (value < 1) {
945         error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
946                    "positive", value);
947         return;
948     }
949     bs->bl.min_mem_alignment = value;
950 
951     ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
952     if (ret < 0) {
953         error_setg_errno(errp, -ret,
954                          "failed to get \"optimal-buf-alignment\": %s",
955                          blkio_get_error_msg());
956         return;
957     }
958     if (value < 1) {
959         error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
960                    "must be positive", value);
961         return;
962     }
963     bs->bl.opt_mem_alignment = value;
964 
965     ret = blkio_get_int(s->blkio, "max-segments", &value);
966     if (ret < 0) {
967         error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
968                          blkio_get_error_msg());
969         return;
970     }
971     if (value < 1) {
972         error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
973                    value);
974         return;
975     }
976     bs->bl.max_iov = value;
977 }
978 
979 /*
980  * TODO
981  * Missing libblkio APIs:
982  * - block_status
983  * - co_invalidate_cache
984  *
985  * Out of scope?
986  * - create
987  * - truncate
988  */
989 
/*
 * Initializer for a BlockDriver that is backed by libblkio.
 *
 * @name is used as both the format name and the protocol name; the latter is
 * what blkio_file_open() passes to blkio_create(). Every driver shares the
 * same callbacks. Additional designated initializers may be supplied through
 * the variadic arguments; they are placed after the common fields.
 */
#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name             = name, \
        .protocol_name           = name, \
        .instance_size           = sizeof(BDRVBlkioState), \
        .bdrv_file_open          = blkio_file_open, \
        .bdrv_close              = blkio_close, \
        .bdrv_co_getlength       = blkio_co_getlength, \
        .bdrv_co_truncate        = blkio_truncate, \
        .bdrv_co_get_info        = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard        = blkio_co_pdiscard, \
        .bdrv_co_preadv          = blkio_co_preadv, \
        .bdrv_co_pwritev         = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk   = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
        .bdrv_co_io_unplug       = blkio_co_io_unplug, \
        .bdrv_refresh_limits     = blkio_refresh_limits, \
        .bdrv_register_buf       = blkio_register_buf, \
        .bdrv_unregister_buf     = blkio_unregister_buf, \
        __VA_ARGS__ \
    }
1013 
/*
 * One BlockDriver per supported libblkio driver. All of them are registered
 * in bdrv_blkio_init().
 */

/* io_uring is the only driver that sets .bdrv_needs_filename */
static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

/* The remaining drivers take a "path" option instead of a filename */
static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);
1034 
1035 static void bdrv_blkio_init(void)
1036 {
1037     bdrv_register(&bdrv_io_uring);
1038     bdrv_register(&bdrv_nvme_io_uring);
1039     bdrv_register(&bdrv_virtio_blk_vfio_pci);
1040     bdrv_register(&bdrv_virtio_blk_vhost_user);
1041     bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
1042 }
1043 
1044 block_init(bdrv_blkio_init);
1045