xref: /qemu/block/block-backend.c (revision 13f934e7)
1 /*
2  * QEMU Block backends
3  *
4  * Copyright (C) 2014-2016 Red Hat, Inc.
5  *
6  * Authors:
7  *  Markus Armbruster <armbru@redhat.com>,
8  *
9  * This work is licensed under the terms of the GNU LGPL, version 2.1
10  * or later.  See the COPYING.LIB file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "sysemu/block-backend.h"
15 #include "block/block_int.h"
16 #include "block/blockjob.h"
17 #include "block/throttle-groups.h"
18 #include "hw/qdev-core.h"
19 #include "sysemu/blockdev.h"
20 #include "sysemu/runstate.h"
21 #include "sysemu/replay.h"
22 #include "qapi/error.h"
23 #include "qapi/qapi-events-block.h"
24 #include "qemu/id.h"
25 #include "qemu/main-loop.h"
26 #include "qemu/option.h"
27 #include "trace.h"
28 #include "migration/misc.h"
29 
/* Number of coroutines to reserve per attached device model */
#define COROUTINE_POOL_RESERVATION 64

/*
 * Sentinel value (INT32_MAX), used while an emulated sync operation is in
 * progress.
 */
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Forward declaration: needed by the block_backend_aiocb_info table. */
static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
36 
/*
 * One registered pair of AioContext attach/detach callbacks.
 *
 * Entries live on BlockBackend.aio_notifiers; blk_root_attach() and
 * blk_root_detach() forward each entry to the root node via
 * bdrv_add_aio_context_notifier() / bdrv_remove_aio_context_notifier().
 */
typedef struct BlockBackendAioNotifier {
    void (*attached_aio_context)(AioContext *new_context, void *opaque);
    void (*detach_aio_context)(void *opaque);
    void *opaque;   /* registered together with the two callbacks */
    QLIST_ENTRY(BlockBackendAioNotifier) list;
} BlockBackendAioNotifier;
43 
/*
 * State of one block backend: the object that sits between a user (device
 * model, monitor, ...) and the root node of a block driver graph.
 */
struct BlockBackend {
    char *name;                 /* monitor name; NULL unless monitor_add_blk() */
    int refcnt;                 /* managed by blk_ref() / blk_unref() */
    BdrvChild *root;            /* edge to the attached node, NULL if none */
    AioContext *ctx;            /* context passed to blk_new() */
    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
    BlockBackendPublic public;

    DeviceState *dev;           /* attached device model, if any */
    const BlockDevOps *dev_ops; /* callbacks set by blk_set_dev_ops() */
    void *dev_opaque;           /* argument handed to the dev_ops callbacks */

    /* the block size for which the guest device expects atomicity */
    int guest_block_size;

    /* If the BDS tree is removed, some of its options are stored here (which
     * can be used to restore those options in the new BDS on insert) */
    BlockBackendRootState root_state;

    bool enable_write_cache;

    /* I/O stats (display with "info blockstats"). */
    BlockAcctStats stats;

    BlockdevOnError on_read_error, on_write_error;
    bool iostatus_enabled;
    BlockDeviceIoStatus iostatus;

    /* Permissions requested by / shareable with others; see blk_set_perm() */
    uint64_t perm;
    uint64_t shared_perm;
    /* While set, blk_set_perm() only records perm/shared_perm and does not
     * apply them to the root child. */
    bool disable_perm;

    bool allow_aio_context_change;
    bool allow_write_beyond_eof;

    NotifierList remove_bs_notifiers, insert_bs_notifiers;
    QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;

    int quiesce_counter;
    CoQueue queued_requests;
    bool disable_request_queuing;

    /* VM state change handler registered by blk_root_activate(), or NULL */
    VMChangeStateEntry *vmsh;
    /* Consulted by blk_can_inactivate() as the final fallback */
    bool force_allow_inactivate;

    /* Number of in-flight aio requests.  BlockDriverState also counts
     * in-flight requests but aio requests can exist even when blk->root is
     * NULL, so we cannot rely on its counter for that case.
     * Accessed with atomic ops.
     */
    unsigned int in_flight;
};
98 
/*
 * AIOCB flavour described by block_backend_aiocb_info: the common AIOCB
 * header followed by the owning BlockBackend and a result code.
 */
typedef struct BlockBackendAIOCB {
    BlockAIOCB common;  /* kept as the first member */
    BlockBackend *blk;
    int ret;
} BlockBackendAIOCB;
104 
/* AIOCB type descriptor for BlockBackendAIOCB objects. */
static const AIOCBInfo block_backend_aiocb_info = {
    .get_aio_context = blk_aiocb_get_aio_context,
    .aiocb_size = sizeof(BlockBackendAIOCB),
};
109 
/* Forward declarations of helpers used before their definition. */
static void drive_info_del(DriveInfo *dinfo);
static BlockBackend *bdrv_first_blk(BlockDriverState *bs);

/* All BlockBackends (linked through BlockBackend.link; see blk_all_next()) */
static QTAILQ_HEAD(, BlockBackend) block_backends =
    QTAILQ_HEAD_INITIALIZER(block_backends);

/* All BlockBackends referenced by the monitor and which are iterated through by
 * blk_next() (linked through BlockBackend.monitor_link) */
static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
    QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
121 
122 static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
123                                      int *child_flags, QDict *child_options,
124                                      int parent_flags, QDict *parent_options)
125 {
126     /* We're not supposed to call this function for root nodes */
127     abort();
128 }
/*
 * Forward declarations of the remaining child_root callbacks, so that the
 * child_root table can reference them.
 */

/* Drain callbacks */
static void blk_root_drained_begin(BdrvChild *child);
static bool blk_root_drained_poll(BdrvChild *child);
static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter);

/* Media change / resize notifications */
static void blk_root_change_media(BdrvChild *child, bool load);
static void blk_root_resize(BdrvChild *child);

/* AioContext change callbacks */
static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
                                     GSList **ignore, Error **errp);
static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
                                 GSList **ignore);
140 
141 static char *blk_root_get_parent_desc(BdrvChild *child)
142 {
143     BlockBackend *blk = child->opaque;
144     g_autofree char *dev_id = NULL;
145 
146     if (blk->name) {
147         return g_strdup_printf("block device '%s'", blk->name);
148     }
149 
150     dev_id = blk_get_attached_dev_id(blk);
151     if (*dev_id) {
152         return g_strdup_printf("block device '%s'", dev_id);
153     } else {
154         /* TODO Callback into the BB owner for something more detailed */
155         return g_strdup("an unnamed block device");
156     }
157 }
158 
159 static const char *blk_root_get_name(BdrvChild *child)
160 {
161     return blk_name(child->opaque);
162 }
163 
164 static void blk_vm_state_changed(void *opaque, bool running, RunState state)
165 {
166     Error *local_err = NULL;
167     BlockBackend *blk = opaque;
168 
169     if (state == RUN_STATE_INMIGRATE) {
170         return;
171     }
172 
173     qemu_del_vm_change_state_handler(blk->vmsh);
174     blk->vmsh = NULL;
175     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
176     if (local_err) {
177         error_report_err(local_err);
178     }
179 }
180 
181 /*
182  * Notifies the user of the BlockBackend that migration has completed. qdev
183  * devices can tighten their permissions in response (specifically revoke
184  * shared write permissions that we needed for storage migration).
185  *
186  * If an error is returned, the VM cannot be allowed to be resumed.
187  */
188 static void blk_root_activate(BdrvChild *child, Error **errp)
189 {
190     BlockBackend *blk = child->opaque;
191     Error *local_err = NULL;
192 
193     if (!blk->disable_perm) {
194         return;
195     }
196 
197     blk->disable_perm = false;
198 
199     blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
200     if (local_err) {
201         error_propagate(errp, local_err);
202         blk->disable_perm = true;
203         return;
204     }
205 
206     if (runstate_check(RUN_STATE_INMIGRATE)) {
207         /* Activation can happen when migration process is still active, for
208          * example when nbd_server_add is called during non-shared storage
209          * migration. Defer the shared_perm update to migration completion. */
210         if (!blk->vmsh) {
211             blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
212                                                          blk);
213         }
214         return;
215     }
216 
217     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
218     if (local_err) {
219         error_propagate(errp, local_err);
220         blk->disable_perm = true;
221         return;
222     }
223 }
224 
225 void blk_set_force_allow_inactivate(BlockBackend *blk)
226 {
227     blk->force_allow_inactivate = true;
228 }
229 
230 static bool blk_can_inactivate(BlockBackend *blk)
231 {
232     /* If it is a guest device, inactivate is ok. */
233     if (blk->dev || blk_name(blk)[0]) {
234         return true;
235     }
236 
237     /* Inactivating means no more writes to the image can be done,
238      * even if those writes would be changes invisible to the
239      * guest.  For block job BBs that satisfy this, we can just allow
240      * it.  This is the case for mirror job source, which is required
241      * by libvirt non-shared block migration. */
242     if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
243         return true;
244     }
245 
246     return blk->force_allow_inactivate;
247 }
248 
249 static int blk_root_inactivate(BdrvChild *child)
250 {
251     BlockBackend *blk = child->opaque;
252 
253     if (blk->disable_perm) {
254         return 0;
255     }
256 
257     if (!blk_can_inactivate(blk)) {
258         return -EPERM;
259     }
260 
261     blk->disable_perm = true;
262     if (blk->root) {
263         bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
264     }
265 
266     return 0;
267 }
268 
269 static void blk_root_attach(BdrvChild *child)
270 {
271     BlockBackend *blk = child->opaque;
272     BlockBackendAioNotifier *notifier;
273 
274     trace_blk_root_attach(child, blk, child->bs);
275 
276     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
277         bdrv_add_aio_context_notifier(child->bs,
278                 notifier->attached_aio_context,
279                 notifier->detach_aio_context,
280                 notifier->opaque);
281     }
282 }
283 
284 static void blk_root_detach(BdrvChild *child)
285 {
286     BlockBackend *blk = child->opaque;
287     BlockBackendAioNotifier *notifier;
288 
289     trace_blk_root_detach(child, blk, child->bs);
290 
291     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
292         bdrv_remove_aio_context_notifier(child->bs,
293                 notifier->attached_aio_context,
294                 notifier->detach_aio_context,
295                 notifier->opaque);
296     }
297 }
298 
299 static AioContext *blk_root_get_parent_aio_context(BdrvChild *c)
300 {
301     BlockBackend *blk = c->opaque;
302 
303     return blk_get_aio_context(blk);
304 }
305 
/*
 * Child class used for the edge between a BlockBackend and its root node.
 * The code in this file also compares BdrvChild.klass against &child_root to
 * recognise parents that are BlockBackends.
 */
static const BdrvChildClass child_root = {
    /* Aborts: must never be called for a root */
    .inherit_options    = blk_root_inherit_options,

    .change_media       = blk_root_change_media,
    .resize             = blk_root_resize,
    .get_name           = blk_root_get_name,
    .get_parent_desc    = blk_root_get_parent_desc,

    .drained_begin      = blk_root_drained_begin,
    .drained_poll       = blk_root_drained_poll,
    .drained_end        = blk_root_drained_end,

    .activate           = blk_root_activate,
    .inactivate         = blk_root_inactivate,

    .attach             = blk_root_attach,
    .detach             = blk_root_detach,

    .can_set_aio_ctx    = blk_root_can_set_aio_ctx,
    .set_aio_ctx        = blk_root_set_aio_ctx,

    .get_parent_aio_context = blk_root_get_parent_aio_context,
};
329 
330 /*
331  * Create a new BlockBackend with a reference count of one.
332  *
333  * @perm is a bitmasks of BLK_PERM_* constants which describes the permissions
334  * to request for a block driver node that is attached to this BlockBackend.
335  * @shared_perm is a bitmask which describes which permissions may be granted
336  * to other users of the attached node.
337  * Both sets of permissions can be changed later using blk_set_perm().
338  *
339  * Return the new BlockBackend on success, null on failure.
340  */
341 BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
342 {
343     BlockBackend *blk;
344 
345     blk = g_new0(BlockBackend, 1);
346     blk->refcnt = 1;
347     blk->ctx = ctx;
348     blk->perm = perm;
349     blk->shared_perm = shared_perm;
350     blk_set_enable_write_cache(blk, true);
351 
352     blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
353     blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
354 
355     block_acct_init(&blk->stats);
356 
357     qemu_co_queue_init(&blk->queued_requests);
358     notifier_list_init(&blk->remove_bs_notifiers);
359     notifier_list_init(&blk->insert_bs_notifiers);
360     QLIST_INIT(&blk->aio_notifiers);
361 
362     QTAILQ_INSERT_TAIL(&block_backends, blk, link);
363     return blk;
364 }
365 
366 /*
367  * Create a new BlockBackend connected to an existing BlockDriverState.
368  *
369  * @perm is a bitmasks of BLK_PERM_* constants which describes the
370  * permissions to request for @bs that is attached to this
371  * BlockBackend.  @shared_perm is a bitmask which describes which
372  * permissions may be granted to other users of the attached node.
373  * Both sets of permissions can be changed later using blk_set_perm().
374  *
375  * Return the new BlockBackend on success, null on failure.
376  */
377 BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
378                               uint64_t shared_perm, Error **errp)
379 {
380     BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);
381 
382     if (blk_insert_bs(blk, bs, errp) < 0) {
383         blk_unref(blk);
384         return NULL;
385     }
386     return blk;
387 }
388 
389 /*
390  * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
391  * The new BlockBackend is in the main AioContext.
392  *
393  * Just as with bdrv_open(), after having called this function the reference to
394  * @options belongs to the block layer (even on failure).
395  *
396  * TODO: Remove @filename and @flags; it should be possible to specify a whole
397  * BDS tree just by specifying the @options QDict (or @reference,
398  * alternatively). At the time of adding this function, this is not possible,
399  * though, so callers of this function have to be able to specify @filename and
400  * @flags.
401  */
402 BlockBackend *blk_new_open(const char *filename, const char *reference,
403                            QDict *options, int flags, Error **errp)
404 {
405     BlockBackend *blk;
406     BlockDriverState *bs;
407     uint64_t perm = 0;
408     uint64_t shared = BLK_PERM_ALL;
409 
410     /*
411      * blk_new_open() is mainly used in .bdrv_create implementations and the
412      * tools where sharing isn't a major concern because the BDS stays private
413      * and the file is generally not supposed to be used by a second process,
414      * so we just request permission according to the flags.
415      *
416      * The exceptions are xen_disk and blockdev_init(); in these cases, the
417      * caller of blk_new_open() doesn't make use of the permissions, but they
418      * shouldn't hurt either. We can still share everything here because the
419      * guest devices will add their own blockers if they can't share.
420      */
421     if ((flags & BDRV_O_NO_IO) == 0) {
422         perm |= BLK_PERM_CONSISTENT_READ;
423         if (flags & BDRV_O_RDWR) {
424             perm |= BLK_PERM_WRITE;
425         }
426     }
427     if (flags & BDRV_O_RESIZE) {
428         perm |= BLK_PERM_RESIZE;
429     }
430     if (flags & BDRV_O_NO_SHARE) {
431         shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
432     }
433 
434     blk = blk_new(qemu_get_aio_context(), perm, shared);
435     bs = bdrv_open(filename, reference, options, flags, errp);
436     if (!bs) {
437         blk_unref(blk);
438         return NULL;
439     }
440 
441     blk->root = bdrv_root_attach_child(bs, "root", &child_root,
442                                        BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
443                                        perm, shared, blk, errp);
444     if (!blk->root) {
445         blk_unref(blk);
446         return NULL;
447     }
448 
449     return blk;
450 }
451 
452 static void blk_delete(BlockBackend *blk)
453 {
454     assert(!blk->refcnt);
455     assert(!blk->name);
456     assert(!blk->dev);
457     if (blk->public.throttle_group_member.throttle_state) {
458         blk_io_limits_disable(blk);
459     }
460     if (blk->root) {
461         blk_remove_bs(blk);
462     }
463     if (blk->vmsh) {
464         qemu_del_vm_change_state_handler(blk->vmsh);
465         blk->vmsh = NULL;
466     }
467     assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
468     assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
469     assert(QLIST_EMPTY(&blk->aio_notifiers));
470     QTAILQ_REMOVE(&block_backends, blk, link);
471     drive_info_del(blk->legacy_dinfo);
472     block_acct_cleanup(&blk->stats);
473     g_free(blk);
474 }
475 
476 static void drive_info_del(DriveInfo *dinfo)
477 {
478     if (!dinfo) {
479         return;
480     }
481     qemu_opts_del(dinfo->opts);
482     g_free(dinfo);
483 }
484 
485 int blk_get_refcnt(BlockBackend *blk)
486 {
487     return blk ? blk->refcnt : 0;
488 }
489 
490 /*
491  * Increment @blk's reference count.
492  * @blk must not be null.
493  */
494 void blk_ref(BlockBackend *blk)
495 {
496     assert(blk->refcnt > 0);
497     blk->refcnt++;
498 }
499 
500 /*
501  * Decrement @blk's reference count.
502  * If this drops it to zero, destroy @blk.
503  * For convenience, do nothing if @blk is null.
504  */
505 void blk_unref(BlockBackend *blk)
506 {
507     if (blk) {
508         assert(blk->refcnt > 0);
509         if (blk->refcnt > 1) {
510             blk->refcnt--;
511         } else {
512             blk_drain(blk);
513             /* blk_drain() cannot resurrect blk, nobody held a reference */
514             assert(blk->refcnt == 1);
515             blk->refcnt = 0;
516             blk_delete(blk);
517         }
518     }
519 }
520 
521 /*
522  * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
523  * ones which are hidden (i.e. are not referenced by the monitor).
524  */
525 BlockBackend *blk_all_next(BlockBackend *blk)
526 {
527     return blk ? QTAILQ_NEXT(blk, link)
528                : QTAILQ_FIRST(&block_backends);
529 }
530 
531 void blk_remove_all_bs(void)
532 {
533     BlockBackend *blk = NULL;
534 
535     while ((blk = blk_all_next(blk)) != NULL) {
536         AioContext *ctx = blk_get_aio_context(blk);
537 
538         aio_context_acquire(ctx);
539         if (blk->root) {
540             blk_remove_bs(blk);
541         }
542         aio_context_release(ctx);
543     }
544 }
545 
546 /*
547  * Return the monitor-owned BlockBackend after @blk.
548  * If @blk is null, return the first one.
549  * Else, return @blk's next sibling, which may be null.
550  *
551  * To iterate over all BlockBackends, do
552  * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
553  *     ...
554  * }
555  */
556 BlockBackend *blk_next(BlockBackend *blk)
557 {
558     return blk ? QTAILQ_NEXT(blk, monitor_link)
559                : QTAILQ_FIRST(&monitor_block_backends);
560 }
561 
/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
 * the monitor or attached to a BlockBackend
 *
 * Advances @it and returns the next node, or NULL when the iteration is
 * complete.  The returned node (and, in the first phase, its BlockBackend)
 * is referenced by the iterator; that reference is dropped again by the
 * following bdrv_next() call or by bdrv_next_cleanup(). */
BlockDriverState *bdrv_next(BdrvNextIterator *it)
{
    BlockDriverState *bs, *old_bs;

    /* Must be called from the main loop */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    /* First, return all root nodes of BlockBackends. In order to avoid
     * returning a BDS twice when multiple BBs refer to it, we only return it
     * if the BB is the first one in the parent list of the BDS. */
    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
        BlockBackend *old_blk = it->blk;

        /* Node handed out by the previous call, if any */
        old_bs = old_blk ? blk_bs(old_blk) : NULL;

        /* Skip BBs without a node and BBs that are not their node's first */
        do {
            it->blk = blk_all_next(it->blk);
            bs = it->blk ? blk_bs(it->blk) : NULL;
        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));

        /* Reference the new BB before releasing the previous one */
        if (it->blk) {
            blk_ref(it->blk);
        }
        blk_unref(old_blk);

        if (bs) {
            bdrv_ref(bs);
            bdrv_unref(old_bs);
            return bs;
        }
        /* Ran out of BlockBackends: continue with monitor-owned nodes.
         * old_bs is released at the end of this function. */
        it->phase = BDRV_NEXT_MONITOR_OWNED;
    } else {
        old_bs = it->bs;
    }

    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
     * BDSes that are attached to a BlockBackend here; they have been handled
     * by the above block already */
    do {
        it->bs = bdrv_next_monitor_owned(it->bs);
        bs = it->bs;
    } while (bs && bdrv_has_blk(bs));

    /* Reference the new node before releasing the previous one */
    if (bs) {
        bdrv_ref(bs);
    }
    bdrv_unref(old_bs);

    return bs;
}
614 
615 static void bdrv_next_reset(BdrvNextIterator *it)
616 {
617     *it = (BdrvNextIterator) {
618         .phase = BDRV_NEXT_BACKEND_ROOTS,
619     };
620 }
621 
622 BlockDriverState *bdrv_first(BdrvNextIterator *it)
623 {
624     bdrv_next_reset(it);
625     return bdrv_next(it);
626 }
627 
628 /* Must be called when aborting a bdrv_next() iteration before
629  * bdrv_next() returns NULL */
630 void bdrv_next_cleanup(BdrvNextIterator *it)
631 {
632     /* Must be called from the main loop */
633     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
634 
635     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
636         if (it->blk) {
637             bdrv_unref(blk_bs(it->blk));
638             blk_unref(it->blk);
639         }
640     } else {
641         bdrv_unref(it->bs);
642     }
643 
644     bdrv_next_reset(it);
645 }
646 
647 /*
648  * Add a BlockBackend into the list of backends referenced by the monitor, with
649  * the given @name acting as the handle for the monitor.
650  * Strictly for use by blockdev.c.
651  *
652  * @name must not be null or empty.
653  *
654  * Returns true on success and false on failure. In the latter case, an Error
655  * object is returned through @errp.
656  */
657 bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
658 {
659     assert(!blk->name);
660     assert(name && name[0]);
661 
662     if (!id_wellformed(name)) {
663         error_setg(errp, "Invalid device name");
664         return false;
665     }
666     if (blk_by_name(name)) {
667         error_setg(errp, "Device with id '%s' already exists", name);
668         return false;
669     }
670     if (bdrv_find_node(name)) {
671         error_setg(errp,
672                    "Device name '%s' conflicts with an existing node name",
673                    name);
674         return false;
675     }
676 
677     blk->name = g_strdup(name);
678     QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
679     return true;
680 }
681 
682 /*
683  * Remove a BlockBackend from the list of backends referenced by the monitor.
684  * Strictly for use by blockdev.c.
685  */
686 void monitor_remove_blk(BlockBackend *blk)
687 {
688     if (!blk->name) {
689         return;
690     }
691 
692     QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
693     g_free(blk->name);
694     blk->name = NULL;
695 }
696 
697 /*
698  * Return @blk's name, a non-null string.
699  * Returns an empty string iff @blk is not referenced by the monitor.
700  */
701 const char *blk_name(const BlockBackend *blk)
702 {
703     return blk->name ?: "";
704 }
705 
706 /*
707  * Return the BlockBackend with name @name if it exists, else null.
708  * @name must not be null.
709  */
710 BlockBackend *blk_by_name(const char *name)
711 {
712     BlockBackend *blk = NULL;
713 
714     assert(name);
715     while ((blk = blk_next(blk)) != NULL) {
716         if (!strcmp(name, blk->name)) {
717             return blk;
718         }
719     }
720     return NULL;
721 }
722 
723 /*
724  * Return the BlockDriverState attached to @blk if any, else null.
725  */
726 BlockDriverState *blk_bs(BlockBackend *blk)
727 {
728     return blk->root ? blk->root->bs : NULL;
729 }
730 
731 static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
732 {
733     BdrvChild *child;
734     QLIST_FOREACH(child, &bs->parents, next_parent) {
735         if (child->klass == &child_root) {
736             return child->opaque;
737         }
738     }
739 
740     return NULL;
741 }
742 
743 /*
744  * Returns true if @bs has an associated BlockBackend.
745  */
746 bool bdrv_has_blk(BlockDriverState *bs)
747 {
748     return bdrv_first_blk(bs) != NULL;
749 }
750 
751 /*
752  * Returns true if @bs has only BlockBackends as parents.
753  */
754 bool bdrv_is_root_node(BlockDriverState *bs)
755 {
756     BdrvChild *c;
757 
758     QLIST_FOREACH(c, &bs->parents, next_parent) {
759         if (c->klass != &child_root) {
760             return false;
761         }
762     }
763 
764     return true;
765 }
766 
767 /*
768  * Return @blk's DriveInfo if any, else null.
769  */
770 DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
771 {
772     return blk->legacy_dinfo;
773 }
774 
775 /*
776  * Set @blk's DriveInfo to @dinfo, and return it.
777  * @blk must not have a DriveInfo set already.
778  * No other BlockBackend may have the same DriveInfo set.
779  */
780 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
781 {
782     assert(!blk->legacy_dinfo);
783     return blk->legacy_dinfo = dinfo;
784 }
785 
786 /*
787  * Return the BlockBackend with DriveInfo @dinfo.
788  * It must exist.
789  */
790 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
791 {
792     BlockBackend *blk = NULL;
793 
794     while ((blk = blk_next(blk)) != NULL) {
795         if (blk->legacy_dinfo == dinfo) {
796             return blk;
797         }
798     }
799     abort();
800 }
801 
802 /*
803  * Returns a pointer to the publicly accessible fields of @blk.
804  */
805 BlockBackendPublic *blk_get_public(BlockBackend *blk)
806 {
807     return &blk->public;
808 }
809 
810 /*
811  * Returns a BlockBackend given the associated @public fields.
812  */
813 BlockBackend *blk_by_public(BlockBackendPublic *public)
814 {
815     return container_of(public, BlockBackend, public);
816 }
817 
/*
 * Disassociates the currently associated BlockDriverState from @blk.
 *
 * Runs the remove_bs notifiers first, moves an active throttle group member
 * to the main AioContext, saves the root state, drains @blk and finally
 * releases the root child.  Callers in this file check blk->root before
 * calling.
 */
void blk_remove_bs(BlockBackend *blk)
{
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
    BlockDriverState *bs;
    BdrvChild *root;

    notifier_list_notify(&blk->remove_bs_notifiers, blk);
    if (tgm->throttle_state) {
        /* Switch the throttle member's AioContext inside a drained section */
        bs = blk_bs(blk);
        bdrv_drained_begin(bs);
        throttle_group_detach_aio_context(tgm);
        throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
        bdrv_drained_end(bs);
    }

    blk_update_root_state(blk);

    /* bdrv_root_unref_child() will cause blk->root to become stale and may
     * switch to a completion coroutine later on. Let's drain all I/O here
     * to avoid that and a potential QEMU crash.
     */
    blk_drain(blk);
    /* Clear blk->root before releasing the child */
    root = blk->root;
    blk->root = NULL;
    bdrv_root_unref_child(root);
}
847 
848 /*
849  * Associates a new BlockDriverState with @blk.
850  */
851 int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
852 {
853     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
854     bdrv_ref(bs);
855     blk->root = bdrv_root_attach_child(bs, "root", &child_root,
856                                        BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
857                                        blk->perm, blk->shared_perm,
858                                        blk, errp);
859     if (blk->root == NULL) {
860         return -EPERM;
861     }
862 
863     notifier_list_notify(&blk->insert_bs_notifiers, blk);
864     if (tgm->throttle_state) {
865         throttle_group_detach_aio_context(tgm);
866         throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
867     }
868 
869     return 0;
870 }
871 
872 /*
873  * Sets the permission bitmasks that the user of the BlockBackend needs.
874  */
875 int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
876                  Error **errp)
877 {
878     int ret;
879 
880     if (blk->root && !blk->disable_perm) {
881         ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
882         if (ret < 0) {
883             return ret;
884         }
885     }
886 
887     blk->perm = perm;
888     blk->shared_perm = shared_perm;
889 
890     return 0;
891 }
892 
893 void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
894 {
895     *perm = blk->perm;
896     *shared_perm = blk->shared_perm;
897 }
898 
899 /*
900  * Attach device model @dev to @blk.
901  * Return 0 on success, -EBUSY when a device model is attached already.
902  */
903 int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
904 {
905     if (blk->dev) {
906         return -EBUSY;
907     }
908 
909     /* While migration is still incoming, we don't need to apply the
910      * permissions of guest device BlockBackends. We might still have a block
911      * job or NBD server writing to the image for storage migration. */
912     if (runstate_check(RUN_STATE_INMIGRATE)) {
913         blk->disable_perm = true;
914     }
915 
916     blk_ref(blk);
917     blk->dev = dev;
918     blk_iostatus_reset(blk);
919 
920     return 0;
921 }
922 
923 /*
924  * Detach device model @dev from @blk.
925  * @dev must be currently attached to @blk.
926  */
927 void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
928 {
929     assert(blk->dev == dev);
930     blk->dev = NULL;
931     blk->dev_ops = NULL;
932     blk->dev_opaque = NULL;
933     blk->guest_block_size = 512;
934     blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
935     blk_unref(blk);
936 }
937 
938 /*
939  * Return the device model attached to @blk if any, else null.
940  */
941 DeviceState *blk_get_attached_dev(BlockBackend *blk)
942 {
943     return blk->dev;
944 }
945 
946 /* Return the qdev ID, or if no ID is assigned the QOM path, of the block
947  * device attached to the BlockBackend. */
948 char *blk_get_attached_dev_id(BlockBackend *blk)
949 {
950     DeviceState *dev = blk->dev;
951 
952     if (!dev) {
953         return g_strdup("");
954     } else if (dev->id) {
955         return g_strdup(dev->id);
956     }
957 
958     return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
959 }
960 
961 /*
962  * Return the BlockBackend which has the device model @dev attached if it
963  * exists, else null.
964  *
965  * @dev must not be null.
966  */
967 BlockBackend *blk_by_dev(void *dev)
968 {
969     BlockBackend *blk = NULL;
970 
971     assert(dev != NULL);
972     while ((blk = blk_all_next(blk)) != NULL) {
973         if (blk->dev == dev) {
974             return blk;
975         }
976     }
977     return NULL;
978 }
979 
980 /*
981  * Set @blk's device model callbacks to @ops.
982  * @opaque is the opaque argument to pass to the callbacks.
983  * This is for use by device models.
984  */
985 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
986                      void *opaque)
987 {
988     blk->dev_ops = ops;
989     blk->dev_opaque = opaque;
990 
991     /* Are we currently quiesced? Should we enforce this right now? */
992     if (blk->quiesce_counter && ops->drained_begin) {
993         ops->drained_begin(opaque);
994     }
995 }
996 
997 /*
998  * Notify @blk's attached device model of media change.
999  *
1000  * If @load is true, notify of media load. This action can fail, meaning that
1001  * the medium cannot be loaded. @errp is set then.
1002  *
1003  * If @load is false, notify of media eject. This can never fail.
1004  *
1005  * Also send DEVICE_TRAY_MOVED events as appropriate.
1006  */
1007 void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
1008 {
1009     if (blk->dev_ops && blk->dev_ops->change_media_cb) {
1010         bool tray_was_open, tray_is_open;
1011         Error *local_err = NULL;
1012 
1013         tray_was_open = blk_dev_is_tray_open(blk);
1014         blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
1015         if (local_err) {
1016             assert(load == true);
1017             error_propagate(errp, local_err);
1018             return;
1019         }
1020         tray_is_open = blk_dev_is_tray_open(blk);
1021 
1022         if (tray_was_open != tray_is_open) {
1023             char *id = blk_get_attached_dev_id(blk);
1024             qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
1025             g_free(id);
1026         }
1027     }
1028 }
1029 
1030 static void blk_root_change_media(BdrvChild *child, bool load)
1031 {
1032     blk_dev_change_media_cb(child->opaque, load, NULL);
1033 }
1034 
1035 /*
1036  * Does @blk's attached device model have removable media?
1037  * %true if no device model is attached.
1038  */
1039 bool blk_dev_has_removable_media(BlockBackend *blk)
1040 {
1041     return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
1042 }
1043 
1044 /*
1045  * Does @blk's attached device model have a tray?
1046  */
1047 bool blk_dev_has_tray(BlockBackend *blk)
1048 {
1049     return blk->dev_ops && blk->dev_ops->is_tray_open;
1050 }
1051 
1052 /*
1053  * Notify @blk's attached device model of a media eject request.
1054  * If @force is true, the medium is about to be yanked out forcefully.
1055  */
1056 void blk_dev_eject_request(BlockBackend *blk, bool force)
1057 {
1058     if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
1059         blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
1060     }
1061 }
1062 
1063 /*
1064  * Does @blk's attached device model have a tray, and is it open?
1065  */
1066 bool blk_dev_is_tray_open(BlockBackend *blk)
1067 {
1068     if (blk_dev_has_tray(blk)) {
1069         return blk->dev_ops->is_tray_open(blk->dev_opaque);
1070     }
1071     return false;
1072 }
1073 
1074 /*
1075  * Does @blk's attached device model have the medium locked?
1076  * %false if the device model has no such lock.
1077  */
1078 bool blk_dev_is_medium_locked(BlockBackend *blk)
1079 {
1080     if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
1081         return blk->dev_ops->is_medium_locked(blk->dev_opaque);
1082     }
1083     return false;
1084 }
1085 
1086 /*
1087  * Notify @blk's attached device model of a backend size change.
1088  */
1089 static void blk_root_resize(BdrvChild *child)
1090 {
1091     BlockBackend *blk = child->opaque;
1092 
1093     if (blk->dev_ops && blk->dev_ops->resize_cb) {
1094         blk->dev_ops->resize_cb(blk->dev_opaque);
1095     }
1096 }
1097 
1098 void blk_iostatus_enable(BlockBackend *blk)
1099 {
1100     blk->iostatus_enabled = true;
1101     blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1102 }
1103 
1104 /* The I/O status is only enabled if the drive explicitly
1105  * enables it _and_ the VM is configured to stop on errors */
1106 bool blk_iostatus_is_enabled(const BlockBackend *blk)
1107 {
1108     return (blk->iostatus_enabled &&
1109            (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
1110             blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
1111             blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
1112 }
1113 
1114 BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
1115 {
1116     return blk->iostatus;
1117 }
1118 
1119 void blk_iostatus_disable(BlockBackend *blk)
1120 {
1121     blk->iostatus_enabled = false;
1122 }
1123 
1124 void blk_iostatus_reset(BlockBackend *blk)
1125 {
1126     if (blk_iostatus_is_enabled(blk)) {
1127         blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1128     }
1129 }
1130 
1131 void blk_iostatus_set_err(BlockBackend *blk, int error)
1132 {
1133     assert(blk_iostatus_is_enabled(blk));
1134     if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
1135         blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
1136                                           BLOCK_DEVICE_IO_STATUS_FAILED;
1137     }
1138 }
1139 
1140 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
1141 {
1142     blk->allow_write_beyond_eof = allow;
1143 }
1144 
1145 void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
1146 {
1147     blk->allow_aio_context_change = allow;
1148 }
1149 
1150 void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
1151 {
1152     blk->disable_request_queuing = disable;
1153 }
1154 
1155 static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
1156                                   size_t size)
1157 {
1158     int64_t len;
1159 
1160     if (size > INT_MAX) {
1161         return -EIO;
1162     }
1163 
1164     if (!blk_is_available(blk)) {
1165         return -ENOMEDIUM;
1166     }
1167 
1168     if (offset < 0) {
1169         return -EIO;
1170     }
1171 
1172     if (!blk->allow_write_beyond_eof) {
1173         len = blk_getlength(blk);
1174         if (len < 0) {
1175             return len;
1176         }
1177 
1178         if (offset > len || len - offset < size) {
1179             return -EIO;
1180         }
1181     }
1182 
1183     return 0;
1184 }
1185 
1186 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1187 static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
1188 {
1189     assert(blk->in_flight > 0);
1190 
1191     if (blk->quiesce_counter && !blk->disable_request_queuing) {
1192         blk_dec_in_flight(blk);
1193         qemu_co_queue_wait(&blk->queued_requests, NULL);
1194         blk_inc_in_flight(blk);
1195     }
1196 }
1197 
1198 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1199 static int coroutine_fn
1200 blk_do_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes,
1201               QEMUIOVector *qiov, BdrvRequestFlags flags)
1202 {
1203     int ret;
1204     BlockDriverState *bs;
1205 
1206     blk_wait_while_drained(blk);
1207 
1208     /* Call blk_bs() only after waiting, the graph may have changed */
1209     bs = blk_bs(blk);
1210     trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1211 
1212     ret = blk_check_byte_request(blk, offset, bytes);
1213     if (ret < 0) {
1214         return ret;
1215     }
1216 
1217     bdrv_inc_in_flight(bs);
1218 
1219     /* throttling disk I/O */
1220     if (blk->public.throttle_group_member.throttle_state) {
1221         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1222                 bytes, false);
1223     }
1224 
1225     ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
1226     bdrv_dec_in_flight(bs);
1227     return ret;
1228 }
1229 
1230 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1231                                unsigned int bytes, QEMUIOVector *qiov,
1232                                BdrvRequestFlags flags)
1233 {
1234     int ret;
1235 
1236     blk_inc_in_flight(blk);
1237     ret = blk_do_preadv(blk, offset, bytes, qiov, flags);
1238     blk_dec_in_flight(blk);
1239 
1240     return ret;
1241 }
1242 
1243 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1244 static int coroutine_fn
1245 blk_do_pwritev_part(BlockBackend *blk, int64_t offset, unsigned int bytes,
1246                     QEMUIOVector *qiov, size_t qiov_offset,
1247                     BdrvRequestFlags flags)
1248 {
1249     int ret;
1250     BlockDriverState *bs;
1251 
1252     blk_wait_while_drained(blk);
1253 
1254     /* Call blk_bs() only after waiting, the graph may have changed */
1255     bs = blk_bs(blk);
1256     trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1257 
1258     ret = blk_check_byte_request(blk, offset, bytes);
1259     if (ret < 0) {
1260         return ret;
1261     }
1262 
1263     bdrv_inc_in_flight(bs);
1264     /* throttling disk I/O */
1265     if (blk->public.throttle_group_member.throttle_state) {
1266         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1267                 bytes, true);
1268     }
1269 
1270     if (!blk->enable_write_cache) {
1271         flags |= BDRV_REQ_FUA;
1272     }
1273 
1274     ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
1275                                flags);
1276     bdrv_dec_in_flight(bs);
1277     return ret;
1278 }
1279 
1280 int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
1281                                      unsigned int bytes,
1282                                      QEMUIOVector *qiov, size_t qiov_offset,
1283                                      BdrvRequestFlags flags)
1284 {
1285     int ret;
1286 
1287     blk_inc_in_flight(blk);
1288     ret = blk_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
1289     blk_dec_in_flight(blk);
1290 
1291     return ret;
1292 }
1293 
1294 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1295                                 unsigned int bytes, QEMUIOVector *qiov,
1296                                 BdrvRequestFlags flags)
1297 {
1298     return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
1299 }
1300 
1301 typedef struct BlkRwCo {
1302     BlockBackend *blk;
1303     int64_t offset;
1304     void *iobuf;
1305     int ret;
1306     BdrvRequestFlags flags;
1307 } BlkRwCo;
1308 
1309 static void blk_read_entry(void *opaque)
1310 {
1311     BlkRwCo *rwco = opaque;
1312     QEMUIOVector *qiov = rwco->iobuf;
1313 
1314     rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, qiov->size,
1315                               qiov, rwco->flags);
1316     aio_wait_kick();
1317 }
1318 
1319 static void blk_write_entry(void *opaque)
1320 {
1321     BlkRwCo *rwco = opaque;
1322     QEMUIOVector *qiov = rwco->iobuf;
1323 
1324     rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, qiov->size,
1325                                     qiov, 0, rwco->flags);
1326     aio_wait_kick();
1327 }
1328 
1329 static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
1330                    int64_t bytes, CoroutineEntry co_entry,
1331                    BdrvRequestFlags flags)
1332 {
1333     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1334     BlkRwCo rwco = {
1335         .blk    = blk,
1336         .offset = offset,
1337         .iobuf  = &qiov,
1338         .flags  = flags,
1339         .ret    = NOT_DONE,
1340     };
1341 
1342     blk_inc_in_flight(blk);
1343     if (qemu_in_coroutine()) {
1344         /* Fast-path if already in coroutine context */
1345         co_entry(&rwco);
1346     } else {
1347         Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
1348         bdrv_coroutine_enter(blk_bs(blk), co);
1349         BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
1350     }
1351     blk_dec_in_flight(blk);
1352 
1353     return rwco.ret;
1354 }
1355 
1356 int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1357                       int bytes, BdrvRequestFlags flags)
1358 {
1359     return blk_prw(blk, offset, NULL, bytes, blk_write_entry,
1360                    flags | BDRV_REQ_ZERO_WRITE);
1361 }
1362 
1363 int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1364 {
1365     return bdrv_make_zero(blk->root, flags);
1366 }
1367 
1368 void blk_inc_in_flight(BlockBackend *blk)
1369 {
1370     qatomic_inc(&blk->in_flight);
1371 }
1372 
1373 void blk_dec_in_flight(BlockBackend *blk)
1374 {
1375     qatomic_dec(&blk->in_flight);
1376     aio_wait_kick();
1377 }
1378 
1379 static void error_callback_bh(void *opaque)
1380 {
1381     struct BlockBackendAIOCB *acb = opaque;
1382 
1383     blk_dec_in_flight(acb->blk);
1384     acb->common.cb(acb->common.opaque, acb->ret);
1385     qemu_aio_unref(acb);
1386 }
1387 
1388 BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1389                                   BlockCompletionFunc *cb,
1390                                   void *opaque, int ret)
1391 {
1392     struct BlockBackendAIOCB *acb;
1393 
1394     blk_inc_in_flight(blk);
1395     acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1396     acb->blk = blk;
1397     acb->ret = ret;
1398 
1399     replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1400                                      error_callback_bh, acb);
1401     return &acb->common;
1402 }
1403 
1404 typedef struct BlkAioEmAIOCB {
1405     BlockAIOCB common;
1406     BlkRwCo rwco;
1407     int bytes;
1408     bool has_returned;
1409 } BlkAioEmAIOCB;
1410 
1411 static AioContext *blk_aio_em_aiocb_get_aio_context(BlockAIOCB *acb_)
1412 {
1413     BlkAioEmAIOCB *acb = container_of(acb_, BlkAioEmAIOCB, common);
1414 
1415     return blk_get_aio_context(acb->rwco.blk);
1416 }
1417 
1418 static const AIOCBInfo blk_aio_em_aiocb_info = {
1419     .aiocb_size         = sizeof(BlkAioEmAIOCB),
1420     .get_aio_context    = blk_aio_em_aiocb_get_aio_context,
1421 };
1422 
1423 static void blk_aio_complete(BlkAioEmAIOCB *acb)
1424 {
1425     if (acb->has_returned) {
1426         acb->common.cb(acb->common.opaque, acb->rwco.ret);
1427         blk_dec_in_flight(acb->rwco.blk);
1428         qemu_aio_unref(acb);
1429     }
1430 }
1431 
1432 static void blk_aio_complete_bh(void *opaque)
1433 {
1434     BlkAioEmAIOCB *acb = opaque;
1435     assert(acb->has_returned);
1436     blk_aio_complete(acb);
1437 }
1438 
1439 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
1440                                 void *iobuf, CoroutineEntry co_entry,
1441                                 BdrvRequestFlags flags,
1442                                 BlockCompletionFunc *cb, void *opaque)
1443 {
1444     BlkAioEmAIOCB *acb;
1445     Coroutine *co;
1446 
1447     blk_inc_in_flight(blk);
1448     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1449     acb->rwco = (BlkRwCo) {
1450         .blk    = blk,
1451         .offset = offset,
1452         .iobuf  = iobuf,
1453         .flags  = flags,
1454         .ret    = NOT_DONE,
1455     };
1456     acb->bytes = bytes;
1457     acb->has_returned = false;
1458 
1459     co = qemu_coroutine_create(co_entry, acb);
1460     bdrv_coroutine_enter(blk_bs(blk), co);
1461 
1462     acb->has_returned = true;
1463     if (acb->rwco.ret != NOT_DONE) {
1464         replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1465                                          blk_aio_complete_bh, acb);
1466     }
1467 
1468     return &acb->common;
1469 }
1470 
1471 static void blk_aio_read_entry(void *opaque)
1472 {
1473     BlkAioEmAIOCB *acb = opaque;
1474     BlkRwCo *rwco = &acb->rwco;
1475     QEMUIOVector *qiov = rwco->iobuf;
1476 
1477     assert(qiov->size == acb->bytes);
1478     rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes,
1479                               qiov, rwco->flags);
1480     blk_aio_complete(acb);
1481 }
1482 
1483 static void blk_aio_write_entry(void *opaque)
1484 {
1485     BlkAioEmAIOCB *acb = opaque;
1486     BlkRwCo *rwco = &acb->rwco;
1487     QEMUIOVector *qiov = rwco->iobuf;
1488 
1489     assert(!qiov || qiov->size == acb->bytes);
1490     rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes,
1491                                     qiov, 0, rwco->flags);
1492     blk_aio_complete(acb);
1493 }
1494 
1495 BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1496                                   int count, BdrvRequestFlags flags,
1497                                   BlockCompletionFunc *cb, void *opaque)
1498 {
1499     return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1500                         flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1501 }
1502 
1503 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
1504 {
1505     int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0);
1506     if (ret < 0) {
1507         return ret;
1508     }
1509     return count;
1510 }
1511 
1512 int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
1513                BdrvRequestFlags flags)
1514 {
1515     int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1516                       flags);
1517     if (ret < 0) {
1518         return ret;
1519     }
1520     return count;
1521 }
1522 
1523 int64_t blk_getlength(BlockBackend *blk)
1524 {
1525     if (!blk_is_available(blk)) {
1526         return -ENOMEDIUM;
1527     }
1528 
1529     return bdrv_getlength(blk_bs(blk));
1530 }
1531 
1532 void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
1533 {
1534     if (!blk_bs(blk)) {
1535         *nb_sectors_ptr = 0;
1536     } else {
1537         bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr);
1538     }
1539 }
1540 
1541 int64_t blk_nb_sectors(BlockBackend *blk)
1542 {
1543     if (!blk_is_available(blk)) {
1544         return -ENOMEDIUM;
1545     }
1546 
1547     return bdrv_nb_sectors(blk_bs(blk));
1548 }
1549 
1550 BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1551                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1552                            BlockCompletionFunc *cb, void *opaque)
1553 {
1554     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1555                         blk_aio_read_entry, flags, cb, opaque);
1556 }
1557 
1558 BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1559                             QEMUIOVector *qiov, BdrvRequestFlags flags,
1560                             BlockCompletionFunc *cb, void *opaque)
1561 {
1562     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1563                         blk_aio_write_entry, flags, cb, opaque);
1564 }
1565 
1566 void blk_aio_cancel(BlockAIOCB *acb)
1567 {
1568     bdrv_aio_cancel(acb);
1569 }
1570 
1571 void blk_aio_cancel_async(BlockAIOCB *acb)
1572 {
1573     bdrv_aio_cancel_async(acb);
1574 }
1575 
1576 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1577 static int coroutine_fn
1578 blk_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1579 {
1580     blk_wait_while_drained(blk);
1581 
1582     if (!blk_is_available(blk)) {
1583         return -ENOMEDIUM;
1584     }
1585 
1586     return bdrv_co_ioctl(blk_bs(blk), req, buf);
1587 }
1588 
1589 static void blk_ioctl_entry(void *opaque)
1590 {
1591     BlkRwCo *rwco = opaque;
1592     QEMUIOVector *qiov = rwco->iobuf;
1593 
1594     rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, qiov->iov[0].iov_base);
1595     aio_wait_kick();
1596 }
1597 
1598 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1599 {
1600     return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
1601 }
1602 
1603 static void blk_aio_ioctl_entry(void *opaque)
1604 {
1605     BlkAioEmAIOCB *acb = opaque;
1606     BlkRwCo *rwco = &acb->rwco;
1607 
1608     rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
1609 
1610     blk_aio_complete(acb);
1611 }
1612 
1613 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1614                           BlockCompletionFunc *cb, void *opaque)
1615 {
1616     return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
1617 }
1618 
1619 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1620 static int coroutine_fn
1621 blk_do_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1622 {
1623     int ret;
1624 
1625     blk_wait_while_drained(blk);
1626 
1627     ret = blk_check_byte_request(blk, offset, bytes);
1628     if (ret < 0) {
1629         return ret;
1630     }
1631 
1632     return bdrv_co_pdiscard(blk->root, offset, bytes);
1633 }
1634 
1635 static void blk_aio_pdiscard_entry(void *opaque)
1636 {
1637     BlkAioEmAIOCB *acb = opaque;
1638     BlkRwCo *rwco = &acb->rwco;
1639 
1640     rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1641     blk_aio_complete(acb);
1642 }
1643 
1644 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1645                              int64_t offset, int bytes,
1646                              BlockCompletionFunc *cb, void *opaque)
1647 {
1648     return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1649                         cb, opaque);
1650 }
1651 
1652 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1653 {
1654     int ret;
1655 
1656     blk_inc_in_flight(blk);
1657     ret = blk_do_pdiscard(blk, offset, bytes);
1658     blk_dec_in_flight(blk);
1659 
1660     return ret;
1661 }
1662 
1663 static void blk_pdiscard_entry(void *opaque)
1664 {
1665     BlkRwCo *rwco = opaque;
1666     QEMUIOVector *qiov = rwco->iobuf;
1667 
1668     rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, qiov->size);
1669     aio_wait_kick();
1670 }
1671 
1672 int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1673 {
1674     return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0);
1675 }
1676 
1677 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1678 static int coroutine_fn blk_do_flush(BlockBackend *blk)
1679 {
1680     blk_wait_while_drained(blk);
1681 
1682     if (!blk_is_available(blk)) {
1683         return -ENOMEDIUM;
1684     }
1685 
1686     return bdrv_co_flush(blk_bs(blk));
1687 }
1688 
1689 static void blk_aio_flush_entry(void *opaque)
1690 {
1691     BlkAioEmAIOCB *acb = opaque;
1692     BlkRwCo *rwco = &acb->rwco;
1693 
1694     rwco->ret = blk_do_flush(rwco->blk);
1695     blk_aio_complete(acb);
1696 }
1697 
1698 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1699                           BlockCompletionFunc *cb, void *opaque)
1700 {
1701     return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1702 }
1703 
1704 int coroutine_fn blk_co_flush(BlockBackend *blk)
1705 {
1706     int ret;
1707 
1708     blk_inc_in_flight(blk);
1709     ret = blk_do_flush(blk);
1710     blk_dec_in_flight(blk);
1711 
1712     return ret;
1713 }
1714 
1715 static void blk_flush_entry(void *opaque)
1716 {
1717     BlkRwCo *rwco = opaque;
1718     rwco->ret = blk_do_flush(rwco->blk);
1719     aio_wait_kick();
1720 }
1721 
1722 int blk_flush(BlockBackend *blk)
1723 {
1724     return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
1725 }
1726 
1727 void blk_drain(BlockBackend *blk)
1728 {
1729     BlockDriverState *bs = blk_bs(blk);
1730 
1731     if (bs) {
1732         bdrv_drained_begin(bs);
1733     }
1734 
1735     /* We may have -ENOMEDIUM completions in flight */
1736     AIO_WAIT_WHILE(blk_get_aio_context(blk),
1737                    qatomic_mb_read(&blk->in_flight) > 0);
1738 
1739     if (bs) {
1740         bdrv_drained_end(bs);
1741     }
1742 }
1743 
1744 void blk_drain_all(void)
1745 {
1746     BlockBackend *blk = NULL;
1747 
1748     bdrv_drain_all_begin();
1749 
1750     while ((blk = blk_all_next(blk)) != NULL) {
1751         AioContext *ctx = blk_get_aio_context(blk);
1752 
1753         aio_context_acquire(ctx);
1754 
1755         /* We may have -ENOMEDIUM completions in flight */
1756         AIO_WAIT_WHILE(ctx, qatomic_mb_read(&blk->in_flight) > 0);
1757 
1758         aio_context_release(ctx);
1759     }
1760 
1761     bdrv_drain_all_end();
1762 }
1763 
1764 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
1765                       BlockdevOnError on_write_error)
1766 {
1767     blk->on_read_error = on_read_error;
1768     blk->on_write_error = on_write_error;
1769 }
1770 
1771 BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
1772 {
1773     return is_read ? blk->on_read_error : blk->on_write_error;
1774 }
1775 
1776 BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
1777                                       int error)
1778 {
1779     BlockdevOnError on_err = blk_get_on_error(blk, is_read);
1780 
1781     switch (on_err) {
1782     case BLOCKDEV_ON_ERROR_ENOSPC:
1783         return (error == ENOSPC) ?
1784                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
1785     case BLOCKDEV_ON_ERROR_STOP:
1786         return BLOCK_ERROR_ACTION_STOP;
1787     case BLOCKDEV_ON_ERROR_REPORT:
1788         return BLOCK_ERROR_ACTION_REPORT;
1789     case BLOCKDEV_ON_ERROR_IGNORE:
1790         return BLOCK_ERROR_ACTION_IGNORE;
1791     case BLOCKDEV_ON_ERROR_AUTO:
1792     default:
1793         abort();
1794     }
1795 }
1796 
1797 static void send_qmp_error_event(BlockBackend *blk,
1798                                  BlockErrorAction action,
1799                                  bool is_read, int error)
1800 {
1801     IoOperationType optype;
1802     BlockDriverState *bs = blk_bs(blk);
1803 
1804     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
1805     qapi_event_send_block_io_error(blk_name(blk), !!bs,
1806                                    bs ? bdrv_get_node_name(bs) : NULL, optype,
1807                                    action, blk_iostatus_is_enabled(blk),
1808                                    error == ENOSPC, strerror(error));
1809 }
1810 
1811 /* This is done by device models because, while the block layer knows
1812  * about the error, it does not know whether an operation comes from
1813  * the device or the block layer (from a job, for example).
1814  */
1815 void blk_error_action(BlockBackend *blk, BlockErrorAction action,
1816                       bool is_read, int error)
1817 {
1818     assert(error >= 0);
1819 
1820     if (action == BLOCK_ERROR_ACTION_STOP) {
1821         /* First set the iostatus, so that "info block" returns an iostatus
1822          * that matches the events raised so far (an additional error iostatus
1823          * is fine, but not a lost one).
1824          */
1825         blk_iostatus_set_err(blk, error);
1826 
1827         /* Then raise the request to stop the VM and the event.
1828          * qemu_system_vmstop_request_prepare has two effects.  First,
1829          * it ensures that the STOP event always comes after the
1830          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
1831          * can observe the STOP event and do a "cont" before the STOP
1832          * event is issued, the VM will not stop.  In this case, vm_start()
1833          * also ensures that the STOP/RESUME pair of events is emitted.
1834          */
1835         qemu_system_vmstop_request_prepare();
1836         send_qmp_error_event(blk, action, is_read, error);
1837         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
1838     } else {
1839         send_qmp_error_event(blk, action, is_read, error);
1840     }
1841 }
1842 
1843 /*
1844  * Returns true if the BlockBackend can support taking write permissions
1845  * (because its root node is not read-only).
1846  */
1847 bool blk_supports_write_perm(BlockBackend *blk)
1848 {
1849     BlockDriverState *bs = blk_bs(blk);
1850 
1851     if (bs) {
1852         return !bdrv_is_read_only(bs);
1853     } else {
1854         return blk->root_state.open_flags & BDRV_O_RDWR;
1855     }
1856 }
1857 
1858 /*
1859  * Returns true if the BlockBackend can be written to in its current
1860  * configuration (i.e. if write permission have been requested)
1861  */
1862 bool blk_is_writable(BlockBackend *blk)
1863 {
1864     return blk->perm & BLK_PERM_WRITE;
1865 }
1866 
1867 bool blk_is_sg(BlockBackend *blk)
1868 {
1869     BlockDriverState *bs = blk_bs(blk);
1870 
1871     if (!bs) {
1872         return false;
1873     }
1874 
1875     return bdrv_is_sg(bs);
1876 }
1877 
1878 bool blk_enable_write_cache(BlockBackend *blk)
1879 {
1880     return blk->enable_write_cache;
1881 }
1882 
1883 void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
1884 {
1885     blk->enable_write_cache = wce;
1886 }
1887 
1888 void blk_invalidate_cache(BlockBackend *blk, Error **errp)
1889 {
1890     BlockDriverState *bs = blk_bs(blk);
1891 
1892     if (!bs) {
1893         error_setg(errp, "Device '%s' has no medium", blk->name);
1894         return;
1895     }
1896 
1897     bdrv_invalidate_cache(bs, errp);
1898 }
1899 
1900 bool blk_is_inserted(BlockBackend *blk)
1901 {
1902     BlockDriverState *bs = blk_bs(blk);
1903 
1904     return bs && bdrv_is_inserted(bs);
1905 }
1906 
1907 bool blk_is_available(BlockBackend *blk)
1908 {
1909     return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
1910 }
1911 
1912 void blk_lock_medium(BlockBackend *blk, bool locked)
1913 {
1914     BlockDriverState *bs = blk_bs(blk);
1915 
1916     if (bs) {
1917         bdrv_lock_medium(bs, locked);
1918     }
1919 }
1920 
1921 void blk_eject(BlockBackend *blk, bool eject_flag)
1922 {
1923     BlockDriverState *bs = blk_bs(blk);
1924     char *id;
1925 
1926     if (bs) {
1927         bdrv_eject(bs, eject_flag);
1928     }
1929 
1930     /* Whether or not we ejected on the backend,
1931      * the frontend experienced a tray event. */
1932     id = blk_get_attached_dev_id(blk);
1933     qapi_event_send_device_tray_moved(blk_name(blk), id,
1934                                       eject_flag);
1935     g_free(id);
1936 }
1937 
1938 int blk_get_flags(BlockBackend *blk)
1939 {
1940     BlockDriverState *bs = blk_bs(blk);
1941 
1942     if (bs) {
1943         return bdrv_get_flags(bs);
1944     } else {
1945         return blk->root_state.open_flags;
1946     }
1947 }
1948 
1949 /* Returns the minimum request alignment, in bytes; guaranteed nonzero */
1950 uint32_t blk_get_request_alignment(BlockBackend *blk)
1951 {
1952     BlockDriverState *bs = blk_bs(blk);
1953     return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
1954 }
1955 
1956 /* Returns the maximum hardware transfer length, in bytes; guaranteed nonzero */
1957 uint64_t blk_get_max_hw_transfer(BlockBackend *blk)
1958 {
1959     BlockDriverState *bs = blk_bs(blk);
1960     uint64_t max = INT_MAX;
1961 
1962     if (bs) {
1963         max = MIN_NON_ZERO(max, bs->bl.max_hw_transfer);
1964         max = MIN_NON_ZERO(max, bs->bl.max_transfer);
1965     }
1966     return ROUND_DOWN(max, blk_get_request_alignment(blk));
1967 }
1968 
1969 /* Returns the maximum transfer length, in bytes; guaranteed nonzero */
1970 uint32_t blk_get_max_transfer(BlockBackend *blk)
1971 {
1972     BlockDriverState *bs = blk_bs(blk);
1973     uint32_t max = INT_MAX;
1974 
1975     if (bs) {
1976         max = MIN_NON_ZERO(max, bs->bl.max_transfer);
1977     }
1978     return ROUND_DOWN(max, blk_get_request_alignment(blk));
1979 }
1980 
1981 int blk_get_max_iov(BlockBackend *blk)
1982 {
1983     return blk->root->bs->bl.max_iov;
1984 }
1985 
1986 void blk_set_guest_block_size(BlockBackend *blk, int align)
1987 {
1988     blk->guest_block_size = align;
1989 }
1990 
1991 void *blk_try_blockalign(BlockBackend *blk, size_t size)
1992 {
1993     return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
1994 }
1995 
1996 void *blk_blockalign(BlockBackend *blk, size_t size)
1997 {
1998     return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
1999 }
2000 
2001 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
2002 {
2003     BlockDriverState *bs = blk_bs(blk);
2004 
2005     if (!bs) {
2006         return false;
2007     }
2008 
2009     return bdrv_op_is_blocked(bs, op, errp);
2010 }
2011 
2012 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
2013 {
2014     BlockDriverState *bs = blk_bs(blk);
2015 
2016     if (bs) {
2017         bdrv_op_unblock(bs, op, reason);
2018     }
2019 }
2020 
2021 void blk_op_block_all(BlockBackend *blk, Error *reason)
2022 {
2023     BlockDriverState *bs = blk_bs(blk);
2024 
2025     if (bs) {
2026         bdrv_op_block_all(bs, reason);
2027     }
2028 }
2029 
2030 void blk_op_unblock_all(BlockBackend *blk, Error *reason)
2031 {
2032     BlockDriverState *bs = blk_bs(blk);
2033 
2034     if (bs) {
2035         bdrv_op_unblock_all(bs, reason);
2036     }
2037 }
2038 
2039 AioContext *blk_get_aio_context(BlockBackend *blk)
2040 {
2041     BlockDriverState *bs = blk_bs(blk);
2042 
2043     if (bs) {
2044         AioContext *ctx = bdrv_get_aio_context(blk_bs(blk));
2045         assert(ctx == blk->ctx);
2046     }
2047 
2048     return blk->ctx;
2049 }
2050 
2051 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
2052 {
2053     BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
2054     return blk_get_aio_context(blk_acb->blk);
2055 }
2056 
2057 static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
2058                                   bool update_root_node, Error **errp)
2059 {
2060     BlockDriverState *bs = blk_bs(blk);
2061     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2062     int ret;
2063 
2064     if (bs) {
2065         if (update_root_node) {
2066             ret = bdrv_child_try_set_aio_context(bs, new_context, blk->root,
2067                                                  errp);
2068             if (ret < 0) {
2069                 return ret;
2070             }
2071         }
2072         if (tgm->throttle_state) {
2073             bdrv_drained_begin(bs);
2074             throttle_group_detach_aio_context(tgm);
2075             throttle_group_attach_aio_context(tgm, new_context);
2076             bdrv_drained_end(bs);
2077         }
2078     }
2079 
2080     blk->ctx = new_context;
2081     return 0;
2082 }
2083 
2084 int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
2085                         Error **errp)
2086 {
2087     return blk_do_set_aio_context(blk, new_context, true, errp);
2088 }
2089 
2090 static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2091                                      GSList **ignore, Error **errp)
2092 {
2093     BlockBackend *blk = child->opaque;
2094 
2095     if (blk->allow_aio_context_change) {
2096         return true;
2097     }
2098 
2099     /* Only manually created BlockBackends that are not attached to anything
2100      * can change their AioContext without updating their user. */
2101     if (!blk->name || blk->dev) {
2102         /* TODO Add BB name/QOM path */
2103         error_setg(errp, "Cannot change iothread of active block backend");
2104         return false;
2105     }
2106 
2107     return true;
2108 }
2109 
2110 static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2111                                  GSList **ignore)
2112 {
2113     BlockBackend *blk = child->opaque;
2114     blk_do_set_aio_context(blk, ctx, false, &error_abort);
2115 }
2116 
2117 void blk_add_aio_context_notifier(BlockBackend *blk,
2118         void (*attached_aio_context)(AioContext *new_context, void *opaque),
2119         void (*detach_aio_context)(void *opaque), void *opaque)
2120 {
2121     BlockBackendAioNotifier *notifier;
2122     BlockDriverState *bs = blk_bs(blk);
2123 
2124     notifier = g_new(BlockBackendAioNotifier, 1);
2125     notifier->attached_aio_context = attached_aio_context;
2126     notifier->detach_aio_context = detach_aio_context;
2127     notifier->opaque = opaque;
2128     QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
2129 
2130     if (bs) {
2131         bdrv_add_aio_context_notifier(bs, attached_aio_context,
2132                                       detach_aio_context, opaque);
2133     }
2134 }
2135 
2136 void blk_remove_aio_context_notifier(BlockBackend *blk,
2137                                      void (*attached_aio_context)(AioContext *,
2138                                                                   void *),
2139                                      void (*detach_aio_context)(void *),
2140                                      void *opaque)
2141 {
2142     BlockBackendAioNotifier *notifier;
2143     BlockDriverState *bs = blk_bs(blk);
2144 
2145     if (bs) {
2146         bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2147                                          detach_aio_context, opaque);
2148     }
2149 
2150     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2151         if (notifier->attached_aio_context == attached_aio_context &&
2152             notifier->detach_aio_context == detach_aio_context &&
2153             notifier->opaque == opaque) {
2154             QLIST_REMOVE(notifier, list);
2155             g_free(notifier);
2156             return;
2157         }
2158     }
2159 
2160     abort();
2161 }
2162 
2163 void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2164 {
2165     notifier_list_add(&blk->remove_bs_notifiers, notify);
2166 }
2167 
2168 void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2169 {
2170     notifier_list_add(&blk->insert_bs_notifiers, notify);
2171 }
2172 
2173 void blk_io_plug(BlockBackend *blk)
2174 {
2175     BlockDriverState *bs = blk_bs(blk);
2176 
2177     if (bs) {
2178         bdrv_io_plug(bs);
2179     }
2180 }
2181 
2182 void blk_io_unplug(BlockBackend *blk)
2183 {
2184     BlockDriverState *bs = blk_bs(blk);
2185 
2186     if (bs) {
2187         bdrv_io_unplug(bs);
2188     }
2189 }
2190 
2191 BlockAcctStats *blk_get_stats(BlockBackend *blk)
2192 {
2193     return &blk->stats;
2194 }
2195 
2196 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2197                   BlockCompletionFunc *cb, void *opaque)
2198 {
2199     return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2200 }
2201 
2202 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2203                                       int bytes, BdrvRequestFlags flags)
2204 {
2205     return blk_co_pwritev(blk, offset, bytes, NULL,
2206                           flags | BDRV_REQ_ZERO_WRITE);
2207 }
2208 
2209 int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
2210                           int count)
2211 {
2212     return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
2213                    BDRV_REQ_WRITE_COMPRESSED);
2214 }
2215 
2216 int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
2217                  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
2218 {
2219     if (!blk_is_available(blk)) {
2220         error_setg(errp, "No medium inserted");
2221         return -ENOMEDIUM;
2222     }
2223 
2224     return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp);
2225 }
2226 
2227 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2228                      int64_t pos, int size)
2229 {
2230     int ret;
2231 
2232     if (!blk_is_available(blk)) {
2233         return -ENOMEDIUM;
2234     }
2235 
2236     ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2237     if (ret < 0) {
2238         return ret;
2239     }
2240 
2241     if (ret == size && !blk->enable_write_cache) {
2242         ret = bdrv_flush(blk_bs(blk));
2243     }
2244 
2245     return ret < 0 ? ret : size;
2246 }
2247 
2248 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2249 {
2250     if (!blk_is_available(blk)) {
2251         return -ENOMEDIUM;
2252     }
2253 
2254     return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2255 }
2256 
2257 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2258 {
2259     if (!blk_is_available(blk)) {
2260         return -ENOMEDIUM;
2261     }
2262 
2263     return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2264 }
2265 
2266 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2267 {
2268     if (!blk_is_available(blk)) {
2269         return -ENOMEDIUM;
2270     }
2271 
2272     return bdrv_probe_geometry(blk_bs(blk), geo);
2273 }
2274 
2275 /*
2276  * Updates the BlockBackendRootState object with data from the currently
2277  * attached BlockDriverState.
2278  */
2279 void blk_update_root_state(BlockBackend *blk)
2280 {
2281     assert(blk->root);
2282 
2283     blk->root_state.open_flags    = blk->root->bs->open_flags;
2284     blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2285 }
2286 
2287 /*
2288  * Returns the detect-zeroes setting to be used for bdrv_open() of a
2289  * BlockDriverState which is supposed to inherit the root state.
2290  */
2291 bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2292 {
2293     return blk->root_state.detect_zeroes;
2294 }
2295 
2296 /*
2297  * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2298  * supposed to inherit the root state.
2299  */
2300 int blk_get_open_flags_from_root_state(BlockBackend *blk)
2301 {
2302     return blk->root_state.open_flags;
2303 }
2304 
2305 BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2306 {
2307     return &blk->root_state;
2308 }
2309 
2310 int blk_commit_all(void)
2311 {
2312     BlockBackend *blk = NULL;
2313 
2314     while ((blk = blk_all_next(blk)) != NULL) {
2315         AioContext *aio_context = blk_get_aio_context(blk);
2316         BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
2317 
2318         aio_context_acquire(aio_context);
2319         if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
2320             int ret;
2321 
2322             ret = bdrv_commit(unfiltered_bs);
2323             if (ret < 0) {
2324                 aio_context_release(aio_context);
2325                 return ret;
2326             }
2327         }
2328         aio_context_release(aio_context);
2329     }
2330     return 0;
2331 }
2332 
2333 
2334 /* throttling disk I/O limits */
2335 void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2336 {
2337     throttle_group_config(&blk->public.throttle_group_member, cfg);
2338 }
2339 
2340 void blk_io_limits_disable(BlockBackend *blk)
2341 {
2342     BlockDriverState *bs = blk_bs(blk);
2343     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2344     assert(tgm->throttle_state);
2345     if (bs) {
2346         bdrv_drained_begin(bs);
2347     }
2348     throttle_group_unregister_tgm(tgm);
2349     if (bs) {
2350         bdrv_drained_end(bs);
2351     }
2352 }
2353 
2354 /* should be called before blk_set_io_limits if a limit is set */
2355 void blk_io_limits_enable(BlockBackend *blk, const char *group)
2356 {
2357     assert(!blk->public.throttle_group_member.throttle_state);
2358     throttle_group_register_tgm(&blk->public.throttle_group_member,
2359                                 group, blk_get_aio_context(blk));
2360 }
2361 
2362 void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2363 {
2364     /* this BB is not part of any group */
2365     if (!blk->public.throttle_group_member.throttle_state) {
2366         return;
2367     }
2368 
2369     /* this BB is a part of the same group than the one we want */
2370     if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2371                 group)) {
2372         return;
2373     }
2374 
2375     /* need to change the group this bs belong to */
2376     blk_io_limits_disable(blk);
2377     blk_io_limits_enable(blk, group);
2378 }
2379 
2380 static void blk_root_drained_begin(BdrvChild *child)
2381 {
2382     BlockBackend *blk = child->opaque;
2383     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2384 
2385     if (++blk->quiesce_counter == 1) {
2386         if (blk->dev_ops && blk->dev_ops->drained_begin) {
2387             blk->dev_ops->drained_begin(blk->dev_opaque);
2388         }
2389     }
2390 
2391     /* Note that blk->root may not be accessible here yet if we are just
2392      * attaching to a BlockDriverState that is drained. Use child instead. */
2393 
2394     if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) {
2395         throttle_group_restart_tgm(tgm);
2396     }
2397 }
2398 
2399 static bool blk_root_drained_poll(BdrvChild *child)
2400 {
2401     BlockBackend *blk = child->opaque;
2402     bool busy = false;
2403     assert(blk->quiesce_counter);
2404 
2405     if (blk->dev_ops && blk->dev_ops->drained_poll) {
2406         busy = blk->dev_ops->drained_poll(blk->dev_opaque);
2407     }
2408     return busy || !!blk->in_flight;
2409 }
2410 
2411 static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
2412 {
2413     BlockBackend *blk = child->opaque;
2414     assert(blk->quiesce_counter);
2415 
2416     assert(blk->public.throttle_group_member.io_limits_disabled);
2417     qatomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2418 
2419     if (--blk->quiesce_counter == 0) {
2420         if (blk->dev_ops && blk->dev_ops->drained_end) {
2421             blk->dev_ops->drained_end(blk->dev_opaque);
2422         }
2423         while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
2424             /* Resume all queued requests */
2425         }
2426     }
2427 }
2428 
2429 void blk_register_buf(BlockBackend *blk, void *host, size_t size)
2430 {
2431     bdrv_register_buf(blk_bs(blk), host, size);
2432 }
2433 
2434 void blk_unregister_buf(BlockBackend *blk, void *host)
2435 {
2436     bdrv_unregister_buf(blk_bs(blk), host);
2437 }
2438 
2439 int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2440                                    BlockBackend *blk_out, int64_t off_out,
2441                                    int bytes, BdrvRequestFlags read_flags,
2442                                    BdrvRequestFlags write_flags)
2443 {
2444     int r;
2445     r = blk_check_byte_request(blk_in, off_in, bytes);
2446     if (r) {
2447         return r;
2448     }
2449     r = blk_check_byte_request(blk_out, off_out, bytes);
2450     if (r) {
2451         return r;
2452     }
2453     return bdrv_co_copy_range(blk_in->root, off_in,
2454                               blk_out->root, off_out,
2455                               bytes, read_flags, write_flags);
2456 }
2457 
2458 const BdrvChild *blk_root(BlockBackend *blk)
2459 {
2460     return blk->root;
2461 }
2462 
2463 int blk_make_empty(BlockBackend *blk, Error **errp)
2464 {
2465     if (!blk_is_available(blk)) {
2466         error_setg(errp, "No medium inserted");
2467         return -ENOMEDIUM;
2468     }
2469 
2470     return bdrv_make_empty(blk->root, errp);
2471 }
2472