xref: /qemu/block.c (revision 727385c4)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  * Copyright (c) 2020 Virtuozzo International GmbH.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "block/trace.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "block/fuse.h"
31 #include "block/nbd.h"
32 #include "block/qdict.h"
33 #include "qemu/error-report.h"
34 #include "block/module_block.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/module.h"
37 #include "qapi/error.h"
38 #include "qapi/qmp/qdict.h"
39 #include "qapi/qmp/qjson.h"
40 #include "qapi/qmp/qnull.h"
41 #include "qapi/qmp/qstring.h"
42 #include "qapi/qobject-output-visitor.h"
43 #include "qapi/qapi-visit-block-core.h"
44 #include "sysemu/block-backend.h"
45 #include "qemu/notify.h"
46 #include "qemu/option.h"
47 #include "qemu/coroutine.h"
48 #include "block/qapi.h"
49 #include "qemu/timer.h"
50 #include "qemu/cutils.h"
51 #include "qemu/id.h"
52 #include "qemu/range.h"
53 #include "qemu/rcu.h"
54 #include "block/coroutines.h"
55 
56 #ifdef CONFIG_BSD
57 #include <sys/ioctl.h>
58 #include <sys/queue.h>
59 #if defined(HAVE_SYS_DISK_H)
60 #include <sys/disk.h>
61 #endif
62 #endif
63 
64 #ifdef _WIN32
65 #include <windows.h>
66 #endif
67 
/*
 * Placeholder result: bdrv_create() stores it in CreateCo.ret and keeps
 * polling until the creation coroutine overwrites it with the real result.
 */
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/*
 * List of BlockDriverStates; presumably the nodes that are part of the block
 * graph (not populated in this part of the file -- TODO confirm).
 */
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

/* Every BlockDriverState allocated by bdrv_new() is appended to this list */
static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(all_bdrv_states);

/* Block drivers that were registered through bdrv_register() */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* Forward declarations of static helpers */
static BlockDriverState *bdrv_open_inherit(const char *filename,
                                           const char *reference,
                                           QDict *options, int flags,
                                           BlockDriverState *parent,
                                           const BdrvChildClass *child_class,
                                           BdrvChildRole child_role,
                                           Error **errp);

static bool bdrv_recurse_has_child(BlockDriverState *bs,
                                   BlockDriverState *child);

static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs);
static void bdrv_remove_file_or_backing_child(BlockDriverState *bs,
                                              BdrvChild *child,
                                              Transaction *tran);
static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
                                            Transaction *tran);

static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue,
                               Transaction *change_child_tran, Error **errp);
static void bdrv_reopen_commit(BDRVReopenState *reopen_state);
static void bdrv_reopen_abort(BDRVReopenState *reopen_state);

/*
 * If non-zero, use only whitelisted block drivers.
 * Reported to callers by bdrv_uses_whitelist().
 */
static int use_bdrv_whitelist;
106 
107 #ifdef _WIN32
/*
 * Return 1 if @filename begins with an ASCII letter immediately followed by
 * ':' (e.g. "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    int is_lower = (letter >= 'a' && letter <= 'z');
    int is_upper = (letter >= 'A' && letter <= 'Z');

    if (!is_lower && !is_upper) {
        /* Also covers the empty string, so filename[1] is never read then */
        return 0;
    }

    return filename[1] == ':';
}
114 
115 int is_windows_drive(const char *filename)
116 {
117     if (is_windows_drive_prefix(filename) &&
118         filename[2] == '\0')
119         return 1;
120     if (strstart(filename, "\\\\.\\", NULL) ||
121         strstart(filename, "//./", NULL))
122         return 1;
123     return 0;
124 }
125 #endif
126 
127 size_t bdrv_opt_mem_align(BlockDriverState *bs)
128 {
129     if (!bs || !bs->drv) {
130         /* page size or 4k (hdd sector size) should be on the safe side */
131         return MAX(4096, qemu_real_host_page_size);
132     }
133 
134     return bs->bl.opt_mem_alignment;
135 }
136 
137 size_t bdrv_min_mem_align(BlockDriverState *bs)
138 {
139     if (!bs || !bs->drv) {
140         /* page size or 4k (hdd sector size) should be on the safe side */
141         return MAX(4096, qemu_real_host_page_size);
142     }
143 
144     return bs->bl.min_mem_alignment;
145 }
146 
/*
 * Check whether @path starts with "<protocol>:", i.e. whether a ':' shows up
 * before the first directory separator.  On Windows, drive specifications
 * are never treated as protocols.
 */
int path_has_protocol(const char *path)
{
    const char *stop_chars;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    stop_chars = ":/\\";
#else
    stop_chars = ":/";
#endif

    /* Index of the first stop character, or of the terminating NUL */
    return path[strcspn(path, stop_chars)] == ':';
}
164 
/*
 * Return 1 if @path is absolute, 0 otherwise.  A leading '/' always counts;
 * on Windows a leading '\' and drive specifications count as well.
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (path[0] == '\\') {
        return 1;
    }
#endif
    return path[0] == '/';
}
177 
/*
 * Build a path for @filename relative to @base_path.
 *
 * If @filename is absolute, a duplicate of it is returned.  Otherwise the
 * result is the leading part of @base_path -- up to and including its last
 * directory separator, or its "<protocol>:" prefix, whichever reaches
 * further -- followed by @filename.  URLs are supported.
 *
 * The returned string is allocated with the GLib allocator.
 */
char *path_combine(const char *base_path, const char *filename)
{
    const char *after_protocol = base_path;
    const char *after_dir = base_path;
    const char *last_sep;
    const char *cut;
    size_t filename_len;
    char *combined;
    int prefix_len;

    if (path_is_absolute(filename)) {
        return g_strdup(filename);
    }

    if (path_has_protocol(base_path)) {
        const char *colon = strchr(base_path, ':');

        if (colon) {
            after_protocol = colon + 1;
        }
    }

    last_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *last_backslash = strrchr(base_path, '\\');

        if (!last_sep || last_backslash > last_sep) {
            last_sep = last_backslash;
        }
    }
#endif
    if (last_sep) {
        after_dir = last_sep + 1;
    }

    /* Keep whichever prefix of base_path is longer */
    cut = after_dir > after_protocol ? after_dir : after_protocol;
    prefix_len = cut - base_path;

    filename_len = strlen(filename);
    combined = g_malloc(prefix_len + filename_len + 1);
    memcpy(combined, base_path, prefix_len);
    /* Copies the terminating NUL as well */
    memcpy(combined + prefix_len, filename, filename_len + 1);

    return combined;
}
226 
227 /*
228  * Helper function for bdrv_parse_filename() implementations to remove optional
229  * protocol prefixes (especially "file:") from a filename and for putting the
230  * stripped filename into the options QDict if there is such a prefix.
231  */
232 void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
233                                       QDict *options)
234 {
235     if (strstart(filename, prefix, &filename)) {
236         /* Stripping the explicit protocol prefix may result in a protocol
237          * prefix being (wrongly) detected (if the filename contains a colon) */
238         if (path_has_protocol(filename)) {
239             GString *fat_filename;
240 
241             /* This means there is some colon before the first slash; therefore,
242              * this cannot be an absolute path */
243             assert(!path_is_absolute(filename));
244 
245             /* And we can thus fix the protocol detection issue by prefixing it
246              * by "./" */
247             fat_filename = g_string_new("./");
248             g_string_append(fat_filename, filename);
249 
250             assert(!path_has_protocol(fat_filename->str));
251 
252             qdict_put(options, "filename",
253                       qstring_from_gstring(fat_filename));
254         } else {
255             /* If no protocol prefix was detected, we can use the shortened
256              * filename as-is */
257             qdict_put_str(options, "filename", filename);
258         }
259     }
260 }
261 
262 
263 /* Returns whether the image file is opened as read-only. Note that this can
264  * return false and writing to the image file is still not possible because the
265  * image is inactivated. */
266 bool bdrv_is_read_only(BlockDriverState *bs)
267 {
268     return !(bs->open_flags & BDRV_O_RDWR);
269 }
270 
271 int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
272                            bool ignore_allow_rdw, Error **errp)
273 {
274     /* Do not set read_only if copy_on_read is enabled */
275     if (bs->copy_on_read && read_only) {
276         error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled",
277                    bdrv_get_device_or_node_name(bs));
278         return -EINVAL;
279     }
280 
281     /* Do not clear read_only if it is prohibited */
282     if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR) &&
283         !ignore_allow_rdw)
284     {
285         error_setg(errp, "Node '%s' is read only",
286                    bdrv_get_device_or_node_name(bs));
287         return -EPERM;
288     }
289 
290     return 0;
291 }
292 
293 /*
294  * Called by a driver that can only provide a read-only image.
295  *
296  * Returns 0 if the node is already read-only or it could switch the node to
297  * read-only because BDRV_O_AUTO_RDONLY is set.
298  *
299  * Returns -EACCES if the node is read-write and BDRV_O_AUTO_RDONLY is not set
300  * or bdrv_can_set_read_only() forbids making the node read-only. If @errmsg
301  * is not NULL, it is used as the error message for the Error object.
302  */
303 int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
304                               Error **errp)
305 {
306     int ret = 0;
307 
308     if (!(bs->open_flags & BDRV_O_RDWR)) {
309         return 0;
310     }
311     if (!(bs->open_flags & BDRV_O_AUTO_RDONLY)) {
312         goto fail;
313     }
314 
315     ret = bdrv_can_set_read_only(bs, true, false, NULL);
316     if (ret < 0) {
317         goto fail;
318     }
319 
320     bs->open_flags &= ~BDRV_O_RDWR;
321 
322     return 0;
323 
324 fail:
325     error_setg(errp, "%s", errmsg ?: "Image is read-only");
326     return -EACCES;
327 }
328 
329 /*
330  * If @backing is empty, this function returns NULL without setting
331  * @errp.  In all other cases, NULL will only be returned with @errp
332  * set.
333  *
334  * Therefore, a return value of NULL without @errp set means that
335  * there is no backing file; if @errp is set, there is one but its
336  * absolute filename cannot be generated.
337  */
338 char *bdrv_get_full_backing_filename_from_filename(const char *backed,
339                                                    const char *backing,
340                                                    Error **errp)
341 {
342     if (backing[0] == '\0') {
343         return NULL;
344     } else if (path_has_protocol(backing) || path_is_absolute(backing)) {
345         return g_strdup(backing);
346     } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
347         error_setg(errp, "Cannot use relative backing file names for '%s'",
348                    backed);
349         return NULL;
350     } else {
351         return path_combine(backed, backing);
352     }
353 }
354 
355 /*
356  * If @filename is empty or NULL, this function returns NULL without
357  * setting @errp.  In all other cases, NULL will only be returned with
358  * @errp set.
359  */
360 static char *bdrv_make_absolute_filename(BlockDriverState *relative_to,
361                                          const char *filename, Error **errp)
362 {
363     char *dir, *full_name;
364 
365     if (!filename || filename[0] == '\0') {
366         return NULL;
367     } else if (path_has_protocol(filename) || path_is_absolute(filename)) {
368         return g_strdup(filename);
369     }
370 
371     dir = bdrv_dirname(relative_to, errp);
372     if (!dir) {
373         return NULL;
374     }
375 
376     full_name = g_strconcat(dir, filename, NULL);
377     g_free(dir);
378     return full_name;
379 }
380 
381 char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
382 {
383     return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
384 }
385 
386 void bdrv_register(BlockDriver *bdrv)
387 {
388     assert(bdrv->format_name);
389     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
390 }
391 
392 BlockDriverState *bdrv_new(void)
393 {
394     BlockDriverState *bs;
395     int i;
396 
397     bs = g_new0(BlockDriverState, 1);
398     QLIST_INIT(&bs->dirty_bitmaps);
399     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
400         QLIST_INIT(&bs->op_blockers[i]);
401     }
402     qemu_co_mutex_init(&bs->reqs_lock);
403     qemu_mutex_init(&bs->dirty_bitmap_mutex);
404     bs->refcnt = 1;
405     bs->aio_context = qemu_get_aio_context();
406 
407     qemu_co_queue_init(&bs->flush_queue);
408 
409     qemu_co_mutex_init(&bs->bsc_modify_lock);
410     bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
411 
412     for (i = 0; i < bdrv_drain_all_count; i++) {
413         bdrv_drained_begin(bs);
414     }
415 
416     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
417 
418     return bs;
419 }
420 
421 static BlockDriver *bdrv_do_find_format(const char *format_name)
422 {
423     BlockDriver *drv1;
424 
425     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
426         if (!strcmp(drv1->format_name, format_name)) {
427             return drv1;
428         }
429     }
430 
431     return NULL;
432 }
433 
434 BlockDriver *bdrv_find_format(const char *format_name)
435 {
436     BlockDriver *drv1;
437     int i;
438 
439     drv1 = bdrv_do_find_format(format_name);
440     if (drv1) {
441         return drv1;
442     }
443 
444     /* The driver isn't registered, maybe we need to load a module */
445     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
446         if (!strcmp(block_driver_modules[i].format_name, format_name)) {
447             block_module_load_one(block_driver_modules[i].library_name);
448             break;
449         }
450     }
451 
452     return bdrv_do_find_format(format_name);
453 }
454 
455 static int bdrv_format_is_whitelisted(const char *format_name, bool read_only)
456 {
457     static const char *whitelist_rw[] = {
458         CONFIG_BDRV_RW_WHITELIST
459         NULL
460     };
461     static const char *whitelist_ro[] = {
462         CONFIG_BDRV_RO_WHITELIST
463         NULL
464     };
465     const char **p;
466 
467     if (!whitelist_rw[0] && !whitelist_ro[0]) {
468         return 1;               /* no whitelist, anything goes */
469     }
470 
471     for (p = whitelist_rw; *p; p++) {
472         if (!strcmp(format_name, *p)) {
473             return 1;
474         }
475     }
476     if (read_only) {
477         for (p = whitelist_ro; *p; p++) {
478             if (!strcmp(format_name, *p)) {
479                 return 1;
480             }
481         }
482     }
483     return 0;
484 }
485 
486 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
487 {
488     return bdrv_format_is_whitelisted(drv->format_name, read_only);
489 }
490 
491 bool bdrv_uses_whitelist(void)
492 {
493     return use_bdrv_whitelist;
494 }
495 
/*
 * Arguments and result slot shared between bdrv_create() and the coroutine
 * entry point bdrv_create_co_entry().
 */
typedef struct CreateCo {
    BlockDriver *drv;   /* driver whose bdrv_co_create_opts() is invoked */
    char *filename;     /* copy of the file name, freed by bdrv_create() */
    QemuOpts *opts;     /* creation options handed to the driver */
    int ret;            /* NOT_DONE until the coroutine stores the result */
    Error *err;         /* error propagated from the driver, if any */
} CreateCo;
503 
504 static void coroutine_fn bdrv_create_co_entry(void *opaque)
505 {
506     Error *local_err = NULL;
507     int ret;
508 
509     CreateCo *cco = opaque;
510     assert(cco->drv);
511 
512     ret = cco->drv->bdrv_co_create_opts(cco->drv,
513                                         cco->filename, cco->opts, &local_err);
514     error_propagate(&cco->err, local_err);
515     cco->ret = ret;
516 }
517 
518 int bdrv_create(BlockDriver *drv, const char* filename,
519                 QemuOpts *opts, Error **errp)
520 {
521     int ret;
522 
523     Coroutine *co;
524     CreateCo cco = {
525         .drv = drv,
526         .filename = g_strdup(filename),
527         .opts = opts,
528         .ret = NOT_DONE,
529         .err = NULL,
530     };
531 
532     if (!drv->bdrv_co_create_opts) {
533         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
534         ret = -ENOTSUP;
535         goto out;
536     }
537 
538     if (qemu_in_coroutine()) {
539         /* Fast-path if already in coroutine context */
540         bdrv_create_co_entry(&cco);
541     } else {
542         co = qemu_coroutine_create(bdrv_create_co_entry, &cco);
543         qemu_coroutine_enter(co);
544         while (cco.ret == NOT_DONE) {
545             aio_poll(qemu_get_aio_context(), true);
546         }
547     }
548 
549     ret = cco.ret;
550     if (ret < 0) {
551         if (cco.err) {
552             error_propagate(errp, cco.err);
553         } else {
554             error_setg_errno(errp, -ret, "Could not create image");
555         }
556     }
557 
558 out:
559     g_free(cco.filename);
560     return ret;
561 }
562 
563 /**
564  * Helper function for bdrv_create_file_fallback(): Resize @blk to at
565  * least the given @minimum_size.
566  *
567  * On success, return @blk's actual length.
568  * Otherwise, return -errno.
569  */
570 static int64_t create_file_fallback_truncate(BlockBackend *blk,
571                                              int64_t minimum_size, Error **errp)
572 {
573     Error *local_err = NULL;
574     int64_t size;
575     int ret;
576 
577     ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
578                        &local_err);
579     if (ret < 0 && ret != -ENOTSUP) {
580         error_propagate(errp, local_err);
581         return ret;
582     }
583 
584     size = blk_getlength(blk);
585     if (size < 0) {
586         error_free(local_err);
587         error_setg_errno(errp, -size,
588                          "Failed to inquire the new image file's length");
589         return size;
590     }
591 
592     if (size < minimum_size) {
593         /* Need to grow the image, but we failed to do that */
594         error_propagate(errp, local_err);
595         return -ENOTSUP;
596     }
597 
598     error_free(local_err);
599     local_err = NULL;
600 
601     return size;
602 }
603 
604 /**
605  * Helper function for bdrv_create_file_fallback(): Zero the first
606  * sector to remove any potentially pre-existing image header.
607  */
608 static int create_file_fallback_zero_first_sector(BlockBackend *blk,
609                                                   int64_t current_size,
610                                                   Error **errp)
611 {
612     int64_t bytes_to_clear;
613     int ret;
614 
615     bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
616     if (bytes_to_clear) {
617         ret = blk_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
618         if (ret < 0) {
619             error_setg_errno(errp, -ret,
620                              "Failed to clear the new image's first sector");
621             return ret;
622         }
623     }
624 
625     return 0;
626 }
627 
628 /**
629  * Simple implementation of bdrv_co_create_opts for protocol drivers
630  * which only support creation via opening a file
631  * (usually existing raw storage device)
632  */
633 int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
634                                             const char *filename,
635                                             QemuOpts *opts,
636                                             Error **errp)
637 {
638     BlockBackend *blk;
639     QDict *options;
640     int64_t size = 0;
641     char *buf = NULL;
642     PreallocMode prealloc;
643     Error *local_err = NULL;
644     int ret;
645 
646     size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
647     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
648     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
649                                PREALLOC_MODE_OFF, &local_err);
650     g_free(buf);
651     if (local_err) {
652         error_propagate(errp, local_err);
653         return -EINVAL;
654     }
655 
656     if (prealloc != PREALLOC_MODE_OFF) {
657         error_setg(errp, "Unsupported preallocation mode '%s'",
658                    PreallocMode_str(prealloc));
659         return -ENOTSUP;
660     }
661 
662     options = qdict_new();
663     qdict_put_str(options, "driver", drv->format_name);
664 
665     blk = blk_new_open(filename, NULL, options,
666                        BDRV_O_RDWR | BDRV_O_RESIZE, errp);
667     if (!blk) {
668         error_prepend(errp, "Protocol driver '%s' does not support image "
669                       "creation, and opening the image failed: ",
670                       drv->format_name);
671         return -EINVAL;
672     }
673 
674     size = create_file_fallback_truncate(blk, size, errp);
675     if (size < 0) {
676         ret = size;
677         goto out;
678     }
679 
680     ret = create_file_fallback_zero_first_sector(blk, size, errp);
681     if (ret < 0) {
682         goto out;
683     }
684 
685     ret = 0;
686 out:
687     blk_unref(blk);
688     return ret;
689 }
690 
691 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
692 {
693     QemuOpts *protocol_opts;
694     BlockDriver *drv;
695     QDict *qdict;
696     int ret;
697 
698     drv = bdrv_find_protocol(filename, true, errp);
699     if (drv == NULL) {
700         return -ENOENT;
701     }
702 
703     if (!drv->create_opts) {
704         error_setg(errp, "Driver '%s' does not support image creation",
705                    drv->format_name);
706         return -ENOTSUP;
707     }
708 
709     /*
710      * 'opts' contains a QemuOptsList with a combination of format and protocol
711      * default values.
712      *
713      * The format properly removes its options, but the default values remain
714      * in 'opts->list'.  So if the protocol has options with the same name
715      * (e.g. rbd has 'cluster_size' as qcow2), it will see the default values
716      * of the format, since for overlapping options, the format wins.
717      *
718      * To avoid this issue, lets convert QemuOpts to QDict, in this way we take
719      * only the set options, and then convert it back to QemuOpts, using the
720      * create_opts of the protocol. So the new QemuOpts, will contain only the
721      * protocol defaults.
722      */
723     qdict = qemu_opts_to_qdict(opts, NULL);
724     protocol_opts = qemu_opts_from_qdict(drv->create_opts, qdict, errp);
725     if (protocol_opts == NULL) {
726         ret = -EINVAL;
727         goto out;
728     }
729 
730     ret = bdrv_create(drv, filename, protocol_opts, errp);
731 out:
732     qemu_opts_del(protocol_opts);
733     qobject_unref(qdict);
734     return ret;
735 }
736 
737 int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
738 {
739     Error *local_err = NULL;
740     int ret;
741 
742     assert(bs != NULL);
743 
744     if (!bs->drv) {
745         error_setg(errp, "Block node '%s' is not opened", bs->filename);
746         return -ENOMEDIUM;
747     }
748 
749     if (!bs->drv->bdrv_co_delete_file) {
750         error_setg(errp, "Driver '%s' does not support image deletion",
751                    bs->drv->format_name);
752         return -ENOTSUP;
753     }
754 
755     ret = bs->drv->bdrv_co_delete_file(bs, &local_err);
756     if (ret < 0) {
757         error_propagate(errp, local_err);
758     }
759 
760     return ret;
761 }
762 
763 void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs)
764 {
765     Error *local_err = NULL;
766     int ret;
767 
768     if (!bs) {
769         return;
770     }
771 
772     ret = bdrv_co_delete_file(bs, &local_err);
773     /*
774      * ENOTSUP will happen if the block driver doesn't support
775      * the 'bdrv_co_delete_file' interface. This is a predictable
776      * scenario and shouldn't be reported back to the user.
777      */
778     if (ret == -ENOTSUP) {
779         error_free(local_err);
780     } else if (ret < 0) {
781         error_report_err(local_err);
782     }
783 }
784 
785 /**
786  * Try to get @bs's logical and physical block size.
787  * On success, store them in @bsz struct and return 0.
788  * On failure return -errno.
789  * @bs must not be empty.
790  */
791 int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
792 {
793     BlockDriver *drv = bs->drv;
794     BlockDriverState *filtered = bdrv_filter_bs(bs);
795 
796     if (drv && drv->bdrv_probe_blocksizes) {
797         return drv->bdrv_probe_blocksizes(bs, bsz);
798     } else if (filtered) {
799         return bdrv_probe_blocksizes(filtered, bsz);
800     }
801 
802     return -ENOTSUP;
803 }
804 
805 /**
806  * Try to get @bs's geometry (cyls, heads, sectors).
807  * On success, store them in @geo struct and return 0.
808  * On failure return -errno.
809  * @bs must not be empty.
810  */
811 int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
812 {
813     BlockDriver *drv = bs->drv;
814     BlockDriverState *filtered = bdrv_filter_bs(bs);
815 
816     if (drv && drv->bdrv_probe_geometry) {
817         return drv->bdrv_probe_geometry(bs, geo);
818     } else if (filtered) {
819         return bdrv_probe_geometry(filtered, geo);
820     }
821 
822     return -ENOTSUP;
823 }
824 
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename receives the name of the new file and must have room for @size
 * bytes.  On POSIX hosts the file is created in $TMPDIR, or in /var/tmp if
 * TMPDIR is not set; on Windows @size must be at least MAX_PATH.
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows, the
 * negated GetLastError() code).  -EOVERFLOW means the name does not fit
 * into @filename.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int saved_errno;
    const char *tmpdir;

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /*
         * Remember why close() failed before cleaning up: unlink() may
         * overwrite errno, and the caller must see close()'s error.
         */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
860 
861 /*
862  * Detect host devices. By convention, /dev/cdrom[N] is always
863  * recognized as a host CDROM.
864  */
865 static BlockDriver *find_hdev_driver(const char *filename)
866 {
867     int score_max = 0, score;
868     BlockDriver *drv = NULL, *d;
869 
870     QLIST_FOREACH(d, &bdrv_drivers, list) {
871         if (d->bdrv_probe_device) {
872             score = d->bdrv_probe_device(filename);
873             if (score > score_max) {
874                 score_max = score;
875                 drv = d;
876             }
877         }
878     }
879 
880     return drv;
881 }
882 
883 static BlockDriver *bdrv_do_find_protocol(const char *protocol)
884 {
885     BlockDriver *drv1;
886 
887     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
888         if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
889             return drv1;
890         }
891     }
892 
893     return NULL;
894 }
895 
896 BlockDriver *bdrv_find_protocol(const char *filename,
897                                 bool allow_protocol_prefix,
898                                 Error **errp)
899 {
900     BlockDriver *drv1;
901     char protocol[128];
902     int len;
903     const char *p;
904     int i;
905 
906     /* TODO Drivers without bdrv_file_open must be specified explicitly */
907 
908     /*
909      * XXX(hch): we really should not let host device detection
910      * override an explicit protocol specification, but moving this
911      * later breaks access to device names with colons in them.
912      * Thanks to the brain-dead persistent naming schemes on udev-
913      * based Linux systems those actually are quite common.
914      */
915     drv1 = find_hdev_driver(filename);
916     if (drv1) {
917         return drv1;
918     }
919 
920     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
921         return &bdrv_file;
922     }
923 
924     p = strchr(filename, ':');
925     assert(p != NULL);
926     len = p - filename;
927     if (len > sizeof(protocol) - 1)
928         len = sizeof(protocol) - 1;
929     memcpy(protocol, filename, len);
930     protocol[len] = '\0';
931 
932     drv1 = bdrv_do_find_protocol(protocol);
933     if (drv1) {
934         return drv1;
935     }
936 
937     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
938         if (block_driver_modules[i].protocol_name &&
939             !strcmp(block_driver_modules[i].protocol_name, protocol)) {
940             block_module_load_one(block_driver_modules[i].library_name);
941             break;
942         }
943     }
944 
945     drv1 = bdrv_do_find_protocol(protocol);
946     if (!drv1) {
947         error_setg(errp, "Unknown protocol '%s'", protocol);
948     }
949     return drv1;
950 }
951 
952 /*
953  * Guess image format by probing its contents.
954  * This is not a good idea when your image is raw (CVE-2008-2004), but
955  * we do it anyway for backward compatibility.
956  *
957  * @buf         contains the image's first @buf_size bytes.
958  * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
959  *              but can be smaller if the image file is smaller)
960  * @filename    is its filename.
961  *
962  * For all block drivers, call the bdrv_probe() method to get its
963  * probing score.
964  * Return the first block driver with the highest probing score.
965  */
966 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
967                             const char *filename)
968 {
969     int score_max = 0, score;
970     BlockDriver *drv = NULL, *d;
971 
972     QLIST_FOREACH(d, &bdrv_drivers, list) {
973         if (d->bdrv_probe) {
974             score = d->bdrv_probe(buf, buf_size, filename);
975             if (score > score_max) {
976                 score_max = score;
977                 drv = d;
978             }
979         }
980     }
981 
982     return drv;
983 }
984 
985 static int find_image_format(BlockBackend *file, const char *filename,
986                              BlockDriver **pdrv, Error **errp)
987 {
988     BlockDriver *drv;
989     uint8_t buf[BLOCK_PROBE_BUF_SIZE];
990     int ret = 0;
991 
992     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
993     if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) {
994         *pdrv = &bdrv_raw;
995         return ret;
996     }
997 
998     ret = blk_pread(file, 0, buf, sizeof(buf));
999     if (ret < 0) {
1000         error_setg_errno(errp, -ret, "Could not read image for determining its "
1001                          "format");
1002         *pdrv = NULL;
1003         return ret;
1004     }
1005 
1006     drv = bdrv_probe_all(buf, ret, filename);
1007     if (!drv) {
1008         error_setg(errp, "Could not determine image format: No compatible "
1009                    "driver found");
1010         ret = -ENOENT;
1011     }
1012     *pdrv = drv;
1013     return ret;
1014 }
1015 
1016 /**
1017  * Set the current 'total_sectors' value
1018  * Return 0 on success, -errno on error.
1019  */
1020 int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
1021 {
1022     BlockDriver *drv = bs->drv;
1023 
1024     if (!drv) {
1025         return -ENOMEDIUM;
1026     }
1027 
1028     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
1029     if (bdrv_is_sg(bs))
1030         return 0;
1031 
1032     /* query actual device if possible, otherwise just trust the hint */
1033     if (drv->bdrv_getlength) {
1034         int64_t length = drv->bdrv_getlength(bs);
1035         if (length < 0) {
1036             return length;
1037         }
1038         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
1039     }
1040 
1041     bs->total_sectors = hint;
1042 
1043     if (bs->total_sectors * BDRV_SECTOR_SIZE > BDRV_MAX_LENGTH) {
1044         return -EFBIG;
1045     }
1046 
1047     return 0;
1048 }
1049 
1050 /**
1051  * Combines a QDict of new block driver @options with any missing options taken
1052  * from @old_options, so that leaving out an option defaults to its old value.
1053  */
1054 static void bdrv_join_options(BlockDriverState *bs, QDict *options,
1055                               QDict *old_options)
1056 {
1057     if (bs->drv && bs->drv->bdrv_join_options) {
1058         bs->drv->bdrv_join_options(options, old_options);
1059     } else {
1060         qdict_join(options, old_options, false);
1061     }
1062 }
1063 
1064 static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
1065                                                             int open_flags,
1066                                                             Error **errp)
1067 {
1068     Error *local_err = NULL;
1069     char *value = qemu_opt_get_del(opts, "detect-zeroes");
1070     BlockdevDetectZeroesOptions detect_zeroes =
1071         qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
1072                         BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
1073     g_free(value);
1074     if (local_err) {
1075         error_propagate(errp, local_err);
1076         return detect_zeroes;
1077     }
1078 
1079     if (detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
1080         !(open_flags & BDRV_O_UNMAP))
1081     {
1082         error_setg(errp, "setting detect-zeroes to unmap is not allowed "
1083                    "without setting discard operation to unmap");
1084     }
1085 
1086     return detect_zeroes;
1087 }
1088 
1089 /**
1090  * Set open flags for aio engine
1091  *
1092  * Return 0 on success, -1 if the engine specified is invalid
1093  */
1094 int bdrv_parse_aio(const char *mode, int *flags)
1095 {
1096     if (!strcmp(mode, "threads")) {
1097         /* do nothing, default */
1098     } else if (!strcmp(mode, "native")) {
1099         *flags |= BDRV_O_NATIVE_AIO;
1100 #ifdef CONFIG_LINUX_IO_URING
1101     } else if (!strcmp(mode, "io_uring")) {
1102         *flags |= BDRV_O_IO_URING;
1103 #endif
1104     } else {
1105         return -1;
1106     }
1107 
1108     return 0;
1109 }
1110 
1111 /**
1112  * Set open flags for a given discard mode
1113  *
1114  * Return 0 on success, -1 if the discard mode was invalid.
1115  */
1116 int bdrv_parse_discard_flags(const char *mode, int *flags)
1117 {
1118     *flags &= ~BDRV_O_UNMAP;
1119 
1120     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
1121         /* do nothing */
1122     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
1123         *flags |= BDRV_O_UNMAP;
1124     } else {
1125         return -1;
1126     }
1127 
1128     return 0;
1129 }
1130 
1131 /**
1132  * Set open flags for a given cache mode
1133  *
1134  * Return 0 on success, -1 if the cache mode was invalid.
1135  */
1136 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
1137 {
1138     *flags &= ~BDRV_O_CACHE_MASK;
1139 
1140     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
1141         *writethrough = false;
1142         *flags |= BDRV_O_NOCACHE;
1143     } else if (!strcmp(mode, "directsync")) {
1144         *writethrough = true;
1145         *flags |= BDRV_O_NOCACHE;
1146     } else if (!strcmp(mode, "writeback")) {
1147         *writethrough = false;
1148     } else if (!strcmp(mode, "unsafe")) {
1149         *writethrough = false;
1150         *flags |= BDRV_O_NO_FLUSH;
1151     } else if (!strcmp(mode, "writethrough")) {
1152         *writethrough = true;
1153     } else {
1154         return -1;
1155     }
1156 
1157     return 0;
1158 }
1159 
1160 static char *bdrv_child_get_parent_desc(BdrvChild *c)
1161 {
1162     BlockDriverState *parent = c->opaque;
1163     return g_strdup_printf("node '%s'", bdrv_get_node_name(parent));
1164 }
1165 
1166 static void bdrv_child_cb_drained_begin(BdrvChild *child)
1167 {
1168     BlockDriverState *bs = child->opaque;
1169     bdrv_do_drained_begin_quiesce(bs, NULL, false);
1170 }
1171 
1172 static bool bdrv_child_cb_drained_poll(BdrvChild *child)
1173 {
1174     BlockDriverState *bs = child->opaque;
1175     return bdrv_drain_poll(bs, false, NULL, false);
1176 }
1177 
1178 static void bdrv_child_cb_drained_end(BdrvChild *child,
1179                                       int *drained_end_counter)
1180 {
1181     BlockDriverState *bs = child->opaque;
1182     bdrv_drained_end_no_poll(bs, drained_end_counter);
1183 }
1184 
1185 static int bdrv_child_cb_inactivate(BdrvChild *child)
1186 {
1187     BlockDriverState *bs = child->opaque;
1188     assert(bs->open_flags & BDRV_O_INACTIVE);
1189     return 0;
1190 }
1191 
1192 static bool bdrv_child_cb_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
1193                                           GSList **ignore, Error **errp)
1194 {
1195     BlockDriverState *bs = child->opaque;
1196     return bdrv_can_set_aio_context(bs, ctx, ignore, errp);
1197 }
1198 
1199 static void bdrv_child_cb_set_aio_ctx(BdrvChild *child, AioContext *ctx,
1200                                       GSList **ignore)
1201 {
1202     BlockDriverState *bs = child->opaque;
1203     return bdrv_set_aio_context_ignore(bs, ctx, ignore);
1204 }
1205 
1206 /*
1207  * Returns the options and flags that a temporary snapshot should get, based on
1208  * the originally requested flags (the originally requested image will have
1209  * flags like a backing file)
1210  */
1211 static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
1212                                        int parent_flags, QDict *parent_options)
1213 {
1214     *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
1215 
1216     /* For temporary files, unconditional cache=unsafe is fine */
1217     qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
1218     qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
1219 
1220     /* Copy the read-only and discard options from the parent */
1221     qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1222     qdict_copy_default(child_options, parent_options, BDRV_OPT_DISCARD);
1223 
1224     /* aio=native doesn't work for cache.direct=off, so disable it for the
1225      * temporary snapshot */
1226     *child_flags &= ~BDRV_O_NATIVE_AIO;
1227 }
1228 
1229 static void bdrv_backing_attach(BdrvChild *c)
1230 {
1231     BlockDriverState *parent = c->opaque;
1232     BlockDriverState *backing_hd = c->bs;
1233 
1234     assert(!parent->backing_blocker);
1235     error_setg(&parent->backing_blocker,
1236                "node is used as backing hd of '%s'",
1237                bdrv_get_device_or_node_name(parent));
1238 
1239     bdrv_refresh_filename(backing_hd);
1240 
1241     parent->open_flags &= ~BDRV_O_NO_BACKING;
1242 
1243     bdrv_op_block_all(backing_hd, parent->backing_blocker);
1244     /* Otherwise we won't be able to commit or stream */
1245     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
1246                     parent->backing_blocker);
1247     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
1248                     parent->backing_blocker);
1249     /*
1250      * We do backup in 3 ways:
1251      * 1. drive backup
1252      *    The target bs is new opened, and the source is top BDS
1253      * 2. blockdev backup
1254      *    Both the source and the target are top BDSes.
1255      * 3. internal backup(used for block replication)
1256      *    Both the source and the target are backing file
1257      *
1258      * In case 1 and 2, neither the source nor the target is the backing file.
1259      * In case 3, we will block the top BDS, so there is only one block job
1260      * for the top BDS and its backing chain.
1261      */
1262     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
1263                     parent->backing_blocker);
1264     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
1265                     parent->backing_blocker);
1266 }
1267 
1268 static void bdrv_backing_detach(BdrvChild *c)
1269 {
1270     BlockDriverState *parent = c->opaque;
1271 
1272     assert(parent->backing_blocker);
1273     bdrv_op_unblock_all(c->bs, parent->backing_blocker);
1274     error_free(parent->backing_blocker);
1275     parent->backing_blocker = NULL;
1276 }
1277 
1278 static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
1279                                         const char *filename, Error **errp)
1280 {
1281     BlockDriverState *parent = c->opaque;
1282     bool read_only = bdrv_is_read_only(parent);
1283     int ret;
1284 
1285     if (read_only) {
1286         ret = bdrv_reopen_set_read_only(parent, false, errp);
1287         if (ret < 0) {
1288             return ret;
1289         }
1290     }
1291 
1292     ret = bdrv_change_backing_file(parent, filename,
1293                                    base->drv ? base->drv->format_name : "",
1294                                    false);
1295     if (ret < 0) {
1296         error_setg_errno(errp, -ret, "Could not update backing file link");
1297     }
1298 
1299     if (read_only) {
1300         bdrv_reopen_set_read_only(parent, true, NULL);
1301     }
1302 
1303     return ret;
1304 }
1305 
1306 /*
1307  * Returns the options and flags that a generic child of a BDS should
1308  * get, based on the given options and flags for the parent BDS.
1309  */
1310 static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
1311                                    int *child_flags, QDict *child_options,
1312                                    int parent_flags, QDict *parent_options)
1313 {
1314     int flags = parent_flags;
1315 
1316     /*
1317      * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
1318      * Generally, the question to answer is: Should this child be
1319      * format-probed by default?
1320      */
1321 
1322     /*
1323      * Pure and non-filtered data children of non-format nodes should
1324      * be probed by default (even when the node itself has BDRV_O_PROTOCOL
1325      * set).  This only affects a very limited set of drivers (namely
1326      * quorum and blkverify when this comment was written).
1327      * Force-clear BDRV_O_PROTOCOL then.
1328      */
1329     if (!parent_is_format &&
1330         (role & BDRV_CHILD_DATA) &&
1331         !(role & (BDRV_CHILD_METADATA | BDRV_CHILD_FILTERED)))
1332     {
1333         flags &= ~BDRV_O_PROTOCOL;
1334     }
1335 
1336     /*
1337      * All children of format nodes (except for COW children) and all
1338      * metadata children in general should never be format-probed.
1339      * Force-set BDRV_O_PROTOCOL then.
1340      */
1341     if ((parent_is_format && !(role & BDRV_CHILD_COW)) ||
1342         (role & BDRV_CHILD_METADATA))
1343     {
1344         flags |= BDRV_O_PROTOCOL;
1345     }
1346 
1347     /*
1348      * If the cache mode isn't explicitly set, inherit direct and no-flush from
1349      * the parent.
1350      */
1351     qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_DIRECT);
1352     qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_NO_FLUSH);
1353     qdict_copy_default(child_options, parent_options, BDRV_OPT_FORCE_SHARE);
1354 
1355     if (role & BDRV_CHILD_COW) {
1356         /* backing files are opened read-only by default */
1357         qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "on");
1358         qdict_set_default_str(child_options, BDRV_OPT_AUTO_READ_ONLY, "off");
1359     } else {
1360         /* Inherit the read-only option from the parent if it's not set */
1361         qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1362         qdict_copy_default(child_options, parent_options,
1363                            BDRV_OPT_AUTO_READ_ONLY);
1364     }
1365 
1366     /*
1367      * bdrv_co_pdiscard() respects unmap policy for the parent, so we
1368      * can default to enable it on lower layers regardless of the
1369      * parent option.
1370      */
1371     qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");
1372 
1373     /* Clear flags that only apply to the top layer */
1374     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
1375 
1376     if (role & BDRV_CHILD_METADATA) {
1377         flags &= ~BDRV_O_NO_IO;
1378     }
1379     if (role & BDRV_CHILD_COW) {
1380         flags &= ~BDRV_O_TEMPORARY;
1381     }
1382 
1383     *child_flags = flags;
1384 }
1385 
1386 static void bdrv_child_cb_attach(BdrvChild *child)
1387 {
1388     BlockDriverState *bs = child->opaque;
1389 
1390     if (child->role & BDRV_CHILD_COW) {
1391         bdrv_backing_attach(child);
1392     }
1393 
1394     bdrv_apply_subtree_drain(child, bs);
1395 }
1396 
1397 static void bdrv_child_cb_detach(BdrvChild *child)
1398 {
1399     BlockDriverState *bs = child->opaque;
1400 
1401     if (child->role & BDRV_CHILD_COW) {
1402         bdrv_backing_detach(child);
1403     }
1404 
1405     bdrv_unapply_subtree_drain(child, bs);
1406 }
1407 
1408 static int bdrv_child_cb_update_filename(BdrvChild *c, BlockDriverState *base,
1409                                          const char *filename, Error **errp)
1410 {
1411     if (c->role & BDRV_CHILD_COW) {
1412         return bdrv_backing_update_filename(c, base, filename, errp);
1413     }
1414     return 0;
1415 }
1416 
1417 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
1418 {
1419     BlockDriverState *bs = c->opaque;
1420 
1421     return bdrv_get_aio_context(bs);
1422 }
1423 
/*
 * Child class for children whose parent is a block node: every callback
 * below reads the parent from child->opaque as a BlockDriverState.
 */
const BdrvChildClass child_of_bds = {
    .parent_is_bds   = true,
    .get_parent_desc = bdrv_child_get_parent_desc,
    .inherit_options = bdrv_inherited_options,
    /* drain requests are forwarded to the parent node */
    .drained_begin   = bdrv_child_cb_drained_begin,
    .drained_poll    = bdrv_child_cb_drained_poll,
    .drained_end     = bdrv_child_cb_drained_end,
    /* attach/detach do extra bookkeeping for COW (backing) children */
    .attach          = bdrv_child_cb_attach,
    .detach          = bdrv_child_cb_detach,
    .inactivate      = bdrv_child_cb_inactivate,
    .can_set_aio_ctx = bdrv_child_cb_can_set_aio_ctx,
    .set_aio_ctx     = bdrv_child_cb_set_aio_ctx,
    .update_filename = bdrv_child_cb_update_filename,
    .get_parent_aio_context = child_of_bds_get_parent_aio_context,
};
1439 
1440 AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
1441 {
1442     return c->klass->get_parent_aio_context(c);
1443 }
1444 
1445 static int bdrv_open_flags(BlockDriverState *bs, int flags)
1446 {
1447     int open_flags = flags;
1448 
1449     /*
1450      * Clear flags that are internal to the block layer before opening the
1451      * image.
1452      */
1453     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
1454 
1455     return open_flags;
1456 }
1457 
1458 static void update_flags_from_options(int *flags, QemuOpts *opts)
1459 {
1460     *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY);
1461 
1462     if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) {
1463         *flags |= BDRV_O_NO_FLUSH;
1464     }
1465 
1466     if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) {
1467         *flags |= BDRV_O_NOCACHE;
1468     }
1469 
1470     if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) {
1471         *flags |= BDRV_O_RDWR;
1472     }
1473 
1474     if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) {
1475         *flags |= BDRV_O_AUTO_RDONLY;
1476     }
1477 }
1478 
1479 static void update_options_from_flags(QDict *options, int flags)
1480 {
1481     if (!qdict_haskey(options, BDRV_OPT_CACHE_DIRECT)) {
1482         qdict_put_bool(options, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE);
1483     }
1484     if (!qdict_haskey(options, BDRV_OPT_CACHE_NO_FLUSH)) {
1485         qdict_put_bool(options, BDRV_OPT_CACHE_NO_FLUSH,
1486                        flags & BDRV_O_NO_FLUSH);
1487     }
1488     if (!qdict_haskey(options, BDRV_OPT_READ_ONLY)) {
1489         qdict_put_bool(options, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR));
1490     }
1491     if (!qdict_haskey(options, BDRV_OPT_AUTO_READ_ONLY)) {
1492         qdict_put_bool(options, BDRV_OPT_AUTO_READ_ONLY,
1493                        flags & BDRV_O_AUTO_RDONLY);
1494     }
1495 }
1496 
1497 static void bdrv_assign_node_name(BlockDriverState *bs,
1498                                   const char *node_name,
1499                                   Error **errp)
1500 {
1501     char *gen_node_name = NULL;
1502 
1503     if (!node_name) {
1504         node_name = gen_node_name = id_generate(ID_BLOCK);
1505     } else if (!id_wellformed(node_name)) {
1506         /*
1507          * Check for empty string or invalid characters, but not if it is
1508          * generated (generated names use characters not available to the user)
1509          */
1510         error_setg(errp, "Invalid node-name: '%s'", node_name);
1511         return;
1512     }
1513 
1514     /* takes care of avoiding namespaces collisions */
1515     if (blk_by_name(node_name)) {
1516         error_setg(errp, "node-name=%s is conflicting with a device id",
1517                    node_name);
1518         goto out;
1519     }
1520 
1521     /* takes care of avoiding duplicates node names */
1522     if (bdrv_find_node(node_name)) {
1523         error_setg(errp, "Duplicate nodes with node-name='%s'", node_name);
1524         goto out;
1525     }
1526 
1527     /* Make sure that the node name isn't truncated */
1528     if (strlen(node_name) >= sizeof(bs->node_name)) {
1529         error_setg(errp, "Node name too long");
1530         goto out;
1531     }
1532 
1533     /* copy node name into the bs and insert it into the graph list */
1534     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
1535     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
1536 out:
1537     g_free(gen_node_name);
1538 }
1539 
1540 static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
1541                             const char *node_name, QDict *options,
1542                             int open_flags, Error **errp)
1543 {
1544     Error *local_err = NULL;
1545     int i, ret;
1546 
1547     bdrv_assign_node_name(bs, node_name, &local_err);
1548     if (local_err) {
1549         error_propagate(errp, local_err);
1550         return -EINVAL;
1551     }
1552 
1553     bs->drv = drv;
1554     bs->opaque = g_malloc0(drv->instance_size);
1555 
1556     if (drv->bdrv_file_open) {
1557         assert(!drv->bdrv_needs_filename || bs->filename[0]);
1558         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
1559     } else if (drv->bdrv_open) {
1560         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
1561     } else {
1562         ret = 0;
1563     }
1564 
1565     if (ret < 0) {
1566         if (local_err) {
1567             error_propagate(errp, local_err);
1568         } else if (bs->filename[0]) {
1569             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
1570         } else {
1571             error_setg_errno(errp, -ret, "Could not open image");
1572         }
1573         goto open_failed;
1574     }
1575 
1576     ret = refresh_total_sectors(bs, bs->total_sectors);
1577     if (ret < 0) {
1578         error_setg_errno(errp, -ret, "Could not refresh total sector count");
1579         return ret;
1580     }
1581 
1582     bdrv_refresh_limits(bs, NULL, &local_err);
1583     if (local_err) {
1584         error_propagate(errp, local_err);
1585         return -EINVAL;
1586     }
1587 
1588     assert(bdrv_opt_mem_align(bs) != 0);
1589     assert(bdrv_min_mem_align(bs) != 0);
1590     assert(is_power_of_2(bs->bl.request_alignment));
1591 
1592     for (i = 0; i < bs->quiesce_counter; i++) {
1593         if (drv->bdrv_co_drain_begin) {
1594             drv->bdrv_co_drain_begin(bs);
1595         }
1596     }
1597 
1598     return 0;
1599 open_failed:
1600     bs->drv = NULL;
1601     if (bs->file != NULL) {
1602         bdrv_unref_child(bs, bs->file);
1603         bs->file = NULL;
1604     }
1605     g_free(bs->opaque);
1606     bs->opaque = NULL;
1607     return ret;
1608 }
1609 
1610 /*
1611  * Create and open a block node.
1612  *
1613  * @options is a QDict of options to pass to the block drivers, or NULL for an
1614  * empty set of options. The reference to the QDict belongs to the block layer
1615  * after the call (even on failure), so if the caller intends to reuse the
1616  * dictionary, it needs to use qobject_ref() before calling bdrv_open.
1617  */
1618 BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv,
1619                                             const char *node_name,
1620                                             QDict *options, int flags,
1621                                             Error **errp)
1622 {
1623     BlockDriverState *bs;
1624     int ret;
1625 
1626     bs = bdrv_new();
1627     bs->open_flags = flags;
1628     bs->options = options ?: qdict_new();
1629     bs->explicit_options = qdict_clone_shallow(bs->options);
1630     bs->opaque = NULL;
1631 
1632     update_options_from_flags(bs->options, flags);
1633 
1634     ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
1635     if (ret < 0) {
1636         qobject_unref(bs->explicit_options);
1637         bs->explicit_options = NULL;
1638         qobject_unref(bs->options);
1639         bs->options = NULL;
1640         bdrv_unref(bs);
1641         return NULL;
1642     }
1643 
1644     return bs;
1645 }
1646 
1647 /* Create and open a block node. */
1648 BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
1649                                        int flags, Error **errp)
1650 {
1651     return bdrv_new_open_driver_opts(drv, node_name, NULL, flags, errp);
1652 }
1653 
/*
 * Options handled by the generic block layer rather than by an individual
 * driver.  bdrv_open_common() creates a QemuOpts from this list and lets it
 * absorb the matching entries of the options QDict.
 */
QemuOptsList bdrv_runtime_opts = {
    .name = "bdrv_common",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
    .desc = {
        {
            .name = "node-name",
            .type = QEMU_OPT_STRING,
            .help = "Node name of the block device node",
        },
        {
            .name = "driver",
            .type = QEMU_OPT_STRING,
            .help = "Block driver to use for the node",
        },
        {
            .name = BDRV_OPT_CACHE_DIRECT,
            .type = QEMU_OPT_BOOL,
            .help = "Bypass software writeback cache on the host",
        },
        {
            .name = BDRV_OPT_CACHE_NO_FLUSH,
            .type = QEMU_OPT_BOOL,
            .help = "Ignore flush requests",
        },
        {
            .name = BDRV_OPT_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node is opened in read-only mode",
        },
        {
            .name = BDRV_OPT_AUTO_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node can become read-only if opening read-write fails",
        },
        {
            .name = "detect-zeroes",
            .type = QEMU_OPT_STRING,
            .help = "try to optimize zero writes (off, on, unmap)",
        },
        {
            .name = BDRV_OPT_DISCARD,
            .type = QEMU_OPT_STRING,
            .help = "discard operation (ignore/off, unmap/on)",
        },
        {
            .name = BDRV_OPT_FORCE_SHARE,
            .type = QEMU_OPT_BOOL,
            .help = "always accept other writers (default: off)",
        },
        { /* end of list */ }
    },
};
1706 
/*
 * Minimal set of image creation options: the virtual disk size and a
 * preallocation mode, for which only "off" is listed as an allowed value.
 */
QemuOptsList bdrv_create_opts_simple = {
    .name = "simple-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off)"
        },
        { /* end of list */ }
    }
};
1724 
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 *
 * @bs       node being opened; it must not have a file child yet.
 * @file     if non-NULL, the filename of the node is taken from the block
 *           node behind this BlockBackend; otherwise it is taken from the
 *           "filename" entry of @options, if there is one.
 * @options  must not be NULL, must be a different QDict than bs->options
 *           and must name a known format in its "driver" entry (all of
 *           this is asserted).
 *
 * Returns 0 on success, a negative errno value with @errp set on failure.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
                            QDict *options, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *driver_name = NULL;
    const char *node_name = NULL;
    const char *discard;
    QemuOpts *opts;
    BlockDriver *drv;
    Error *local_err = NULL;
    bool ro;

    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    /* Let the generic options be absorbed from @options into @opts */
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        ret = -EINVAL;
        goto fail_opts;
    }

    /* Cache and read-only bits of bs->open_flags now follow the options */
    update_flags_from_options(&bs->open_flags, opts);

    driver_name = qemu_opt_get(opts, "driver");
    drv = bdrv_find_format(driver_name);
    assert(drv != NULL);

    bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);

    if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
        error_setg(errp,
                   BDRV_OPT_FORCE_SHARE
                   "=on can only be used with read-only images");
        ret = -EINVAL;
        goto fail_opts;
    }

    if (file != NULL) {
        bdrv_refresh_filename(blk_bs(file));
        filename = blk_bs(file)->filename;
    } else {
        /*
         * Caution: while qdict_get_try_str() is fine, getting
         * non-string types would require more care.  When @options
         * come from -blockdev or blockdev_add, its members are typed
         * according to the QAPI schema, but when they come from
         * -drive, they're all QString.
         */
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail_opts;
    }

    trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
                           drv->format_name);

    /* Sampled once here; not re-read if the whitelist check changes @bs */
    ro = bdrv_is_read_only(bs);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, ro)) {
        /*
         * A driver that is only whitelisted for read-only use may still be
         * usable if bdrv_apply_auto_read_only() succeeds.
         */
        if (!ro && bdrv_is_whitelisted(drv, true)) {
            ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
        } else {
            ret = -ENOTSUP;
        }
        if (ret < 0) {
            error_setg(errp,
                       !ro && bdrv_is_whitelisted(drv, true)
                       ? "Driver '%s' can only be used for read-only devices"
                       : "Driver '%s' is not whitelisted",
                       drv->format_name);
            goto fail_opts;
        }
    }

    /* bdrv_new() and bdrv_close() make it so */
    assert(qatomic_read(&bs->copy_on_read) == 0);

    if (bs->open_flags & BDRV_O_COPY_ON_READ) {
        if (!ro) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            ret = -EINVAL;
            goto fail_opts;
        }
    }

    /* An explicit discard option overrides BDRV_O_UNMAP in the flags */
    discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
    if (discard != NULL) {
        if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
            error_setg(errp, "Invalid discard option");
            ret = -EINVAL;
            goto fail_opts;
        }
    }

    /* Needs the final BDRV_O_UNMAP state, so it comes after discard */
    bs->detect_zeroes =
        bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail_opts;
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    /* Open the image, either directly or using a protocol */
    open_flags = bdrv_open_flags(bs, bs->open_flags);
    node_name = qemu_opt_get(opts, "node-name");

    /* A driver with bdrv_file_open() must not be given a @file */
    assert(!drv->bdrv_file_open || file == NULL);
    ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
    if (ret < 0) {
        goto fail_opts;
    }

    qemu_opts_del(opts);
    return 0;

fail_opts:
    /* Every exit path releases @opts */
    qemu_opts_del(opts);
    return ret;
}
1864 
1865 static QDict *parse_json_filename(const char *filename, Error **errp)
1866 {
1867     QObject *options_obj;
1868     QDict *options;
1869     int ret;
1870 
1871     ret = strstart(filename, "json:", &filename);
1872     assert(ret);
1873 
1874     options_obj = qobject_from_json(filename, errp);
1875     if (!options_obj) {
1876         error_prepend(errp, "Could not parse the JSON options: ");
1877         return NULL;
1878     }
1879 
1880     options = qobject_to(QDict, options_obj);
1881     if (!options) {
1882         qobject_unref(options_obj);
1883         error_setg(errp, "Invalid JSON object given");
1884         return NULL;
1885     }
1886 
1887     qdict_flatten(options);
1888 
1889     return options;
1890 }
1891 
1892 static void parse_json_protocol(QDict *options, const char **pfilename,
1893                                 Error **errp)
1894 {
1895     QDict *json_options;
1896     Error *local_err = NULL;
1897 
1898     /* Parse json: pseudo-protocol */
1899     if (!*pfilename || !g_str_has_prefix(*pfilename, "json:")) {
1900         return;
1901     }
1902 
1903     json_options = parse_json_filename(*pfilename, &local_err);
1904     if (local_err) {
1905         error_propagate(errp, local_err);
1906         return;
1907     }
1908 
1909     /* Options given in the filename have lower priority than options
1910      * specified directly */
1911     qdict_join(options, json_options, false);
1912     qobject_unref(json_options);
1913     *pfilename = NULL;
1914 }
1915 
1916 /*
1917  * Fills in default options for opening images and converts the legacy
1918  * filename/flags pair to option QDict entries.
1919  * The BDRV_O_PROTOCOL flag in *flags will be set or cleared accordingly if a
1920  * block driver has been specified explicitly.
1921  */
1922 static int bdrv_fill_options(QDict **options, const char *filename,
1923                              int *flags, Error **errp)
1924 {
1925     const char *drvname;
1926     bool protocol = *flags & BDRV_O_PROTOCOL;
1927     bool parse_filename = false;
1928     BlockDriver *drv = NULL;
1929     Error *local_err = NULL;
1930 
1931     /*
1932      * Caution: while qdict_get_try_str() is fine, getting non-string
1933      * types would require more care.  When @options come from
1934      * -blockdev or blockdev_add, its members are typed according to
1935      * the QAPI schema, but when they come from -drive, they're all
1936      * QString.
1937      */
1938     drvname = qdict_get_try_str(*options, "driver");
1939     if (drvname) {
1940         drv = bdrv_find_format(drvname);
1941         if (!drv) {
1942             error_setg(errp, "Unknown driver '%s'", drvname);
1943             return -ENOENT;
1944         }
1945         /* If the user has explicitly specified the driver, this choice should
1946          * override the BDRV_O_PROTOCOL flag */
1947         protocol = drv->bdrv_file_open;
1948     }
1949 
1950     if (protocol) {
1951         *flags |= BDRV_O_PROTOCOL;
1952     } else {
1953         *flags &= ~BDRV_O_PROTOCOL;
1954     }
1955 
1956     /* Translate cache options from flags into options */
1957     update_options_from_flags(*options, *flags);
1958 
1959     /* Fetch the file name from the options QDict if necessary */
1960     if (protocol && filename) {
1961         if (!qdict_haskey(*options, "filename")) {
1962             qdict_put_str(*options, "filename", filename);
1963             parse_filename = true;
1964         } else {
1965             error_setg(errp, "Can't specify 'file' and 'filename' options at "
1966                              "the same time");
1967             return -EINVAL;
1968         }
1969     }
1970 
1971     /* Find the right block driver */
1972     /* See cautionary note on accessing @options above */
1973     filename = qdict_get_try_str(*options, "filename");
1974 
1975     if (!drvname && protocol) {
1976         if (filename) {
1977             drv = bdrv_find_protocol(filename, parse_filename, errp);
1978             if (!drv) {
1979                 return -EINVAL;
1980             }
1981 
1982             drvname = drv->format_name;
1983             qdict_put_str(*options, "driver", drvname);
1984         } else {
1985             error_setg(errp, "Must specify either driver or file");
1986             return -EINVAL;
1987         }
1988     }
1989 
1990     assert(drv || !protocol);
1991 
1992     /* Driver-specific filename parsing */
1993     if (drv && drv->bdrv_parse_filename && parse_filename) {
1994         drv->bdrv_parse_filename(filename, *options, &local_err);
1995         if (local_err) {
1996             error_propagate(errp, local_err);
1997             return -EINVAL;
1998         }
1999 
2000         if (!drv->bdrv_needs_filename) {
2001             qdict_del(*options, "filename");
2002         }
2003     }
2004 
2005     return 0;
2006 }
2007 
2008 typedef struct BlockReopenQueueEntry {
2009      bool prepared;
2010      bool perms_checked;
2011      BDRVReopenState state;
2012      QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
2013 } BlockReopenQueueEntry;
2014 
2015 /*
2016  * Return the flags that @bs will have after the reopens in @q have
2017  * successfully completed. If @q is NULL (or @bs is not contained in @q),
2018  * return the current flags.
2019  */
2020 static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
2021 {
2022     BlockReopenQueueEntry *entry;
2023 
2024     if (q != NULL) {
2025         QTAILQ_FOREACH(entry, q, entry) {
2026             if (entry->state.bs == bs) {
2027                 return entry->state.flags;
2028             }
2029         }
2030     }
2031 
2032     return bs->open_flags;
2033 }
2034 
2035 /* Returns whether the image file can be written to after the reopen queue @q
2036  * has been successfully applied, or right now if @q is NULL. */
2037 static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
2038                                           BlockReopenQueue *q)
2039 {
2040     int flags = bdrv_reopen_get_flags(q, bs);
2041 
2042     return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
2043 }
2044 
2045 /*
2046  * Return whether the BDS can be written to.  This is not necessarily
2047  * the same as !bdrv_is_read_only(bs), as inactivated images may not
2048  * be written to but do not count as read-only images.
2049  */
2050 bool bdrv_is_writable(BlockDriverState *bs)
2051 {
2052     return bdrv_is_writable_after_reopen(bs, NULL);
2053 }
2054 
2055 static char *bdrv_child_user_desc(BdrvChild *c)
2056 {
2057     return c->klass->get_parent_desc(c);
2058 }
2059 
2060 /*
2061  * Check that @a allows everything that @b needs. @a and @b must reference same
2062  * child node.
2063  */
2064 static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
2065 {
2066     const char *child_bs_name;
2067     g_autofree char *a_user = NULL;
2068     g_autofree char *b_user = NULL;
2069     g_autofree char *perms = NULL;
2070 
2071     assert(a->bs);
2072     assert(a->bs == b->bs);
2073 
2074     if ((b->perm & a->shared_perm) == b->perm) {
2075         return true;
2076     }
2077 
2078     child_bs_name = bdrv_get_node_name(b->bs);
2079     a_user = bdrv_child_user_desc(a);
2080     b_user = bdrv_child_user_desc(b);
2081     perms = bdrv_perm_names(b->perm & ~a->shared_perm);
2082 
2083     error_setg(errp, "Permission conflict on node '%s': permissions '%s' are "
2084                "both required by %s (uses node '%s' as '%s' child) and "
2085                "unshared by %s (uses node '%s' as '%s' child).",
2086                child_bs_name, perms,
2087                b_user, child_bs_name, b->name,
2088                a_user, child_bs_name, a->name);
2089 
2090     return false;
2091 }
2092 
2093 static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
2094 {
2095     BdrvChild *a, *b;
2096 
2097     /*
2098      * During the loop we'll look at each pair twice. That's correct because
2099      * bdrv_a_allow_b() is asymmetric and we should check each pair in both
2100      * directions.
2101      */
2102     QLIST_FOREACH(a, &bs->parents, next_parent) {
2103         QLIST_FOREACH(b, &bs->parents, next_parent) {
2104             if (a == b) {
2105                 continue;
2106             }
2107 
2108             if (!bdrv_a_allow_b(a, b, errp)) {
2109                 return true;
2110             }
2111         }
2112     }
2113 
2114     return false;
2115 }
2116 
2117 static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
2118                             BdrvChild *c, BdrvChildRole role,
2119                             BlockReopenQueue *reopen_queue,
2120                             uint64_t parent_perm, uint64_t parent_shared,
2121                             uint64_t *nperm, uint64_t *nshared)
2122 {
2123     assert(bs->drv && bs->drv->bdrv_child_perm);
2124     bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
2125                              parent_perm, parent_shared,
2126                              nperm, nshared);
2127     /* TODO Take force_share from reopen_queue */
2128     if (child_bs && child_bs->force_share) {
2129         *nshared = BLK_PERM_ALL;
2130     }
2131 }
2132 
2133 /*
2134  * Adds the whole subtree of @bs (including @bs itself) to the @list (except for
2135  * nodes that are already in the @list, of course) so that final list is
2136  * topologically sorted. Return the result (GSList @list object is updated, so
2137  * don't use old reference after function call).
2138  *
2139  * On function start @list must be already topologically sorted and for any node
2140  * in the @list the whole subtree of the node must be in the @list as well. The
2141  * simplest way to satisfy this criteria: use only result of
2142  * bdrv_topological_dfs() or NULL as @list parameter.
2143  */
2144 static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
2145                                     BlockDriverState *bs)
2146 {
2147     BdrvChild *child;
2148     g_autoptr(GHashTable) local_found = NULL;
2149 
2150     if (!found) {
2151         assert(!list);
2152         found = local_found = g_hash_table_new(NULL, NULL);
2153     }
2154 
2155     if (g_hash_table_contains(found, bs)) {
2156         return list;
2157     }
2158     g_hash_table_add(found, bs);
2159 
2160     QLIST_FOREACH(child, &bs->children, next) {
2161         list = bdrv_topological_dfs(list, found, child->bs);
2162     }
2163 
2164     return g_slist_prepend(list, bs);
2165 }
2166 
2167 typedef struct BdrvChildSetPermState {
2168     BdrvChild *child;
2169     uint64_t old_perm;
2170     uint64_t old_shared_perm;
2171 } BdrvChildSetPermState;
2172 
2173 static void bdrv_child_set_perm_abort(void *opaque)
2174 {
2175     BdrvChildSetPermState *s = opaque;
2176 
2177     s->child->perm = s->old_perm;
2178     s->child->shared_perm = s->old_shared_perm;
2179 }
2180 
2181 static TransactionActionDrv bdrv_child_set_pem_drv = {
2182     .abort = bdrv_child_set_perm_abort,
2183     .clean = g_free,
2184 };
2185 
2186 static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
2187                                 uint64_t shared, Transaction *tran)
2188 {
2189     BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
2190 
2191     *s = (BdrvChildSetPermState) {
2192         .child = c,
2193         .old_perm = c->perm,
2194         .old_shared_perm = c->shared_perm,
2195     };
2196 
2197     c->perm = perm;
2198     c->shared_perm = shared;
2199 
2200     tran_add(tran, &bdrv_child_set_pem_drv, s);
2201 }
2202 
2203 static void bdrv_drv_set_perm_commit(void *opaque)
2204 {
2205     BlockDriverState *bs = opaque;
2206     uint64_t cumulative_perms, cumulative_shared_perms;
2207 
2208     if (bs->drv->bdrv_set_perm) {
2209         bdrv_get_cumulative_perm(bs, &cumulative_perms,
2210                                  &cumulative_shared_perms);
2211         bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
2212     }
2213 }
2214 
2215 static void bdrv_drv_set_perm_abort(void *opaque)
2216 {
2217     BlockDriverState *bs = opaque;
2218 
2219     if (bs->drv->bdrv_abort_perm_update) {
2220         bs->drv->bdrv_abort_perm_update(bs);
2221     }
2222 }
2223 
2224 TransactionActionDrv bdrv_drv_set_perm_drv = {
2225     .abort = bdrv_drv_set_perm_abort,
2226     .commit = bdrv_drv_set_perm_commit,
2227 };
2228 
2229 static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
2230                              uint64_t shared_perm, Transaction *tran,
2231                              Error **errp)
2232 {
2233     if (!bs->drv) {
2234         return 0;
2235     }
2236 
2237     if (bs->drv->bdrv_check_perm) {
2238         int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
2239         if (ret < 0) {
2240             return ret;
2241         }
2242     }
2243 
2244     if (tran) {
2245         tran_add(tran, &bdrv_drv_set_perm_drv, bs);
2246     }
2247 
2248     return 0;
2249 }
2250 
2251 typedef struct BdrvReplaceChildState {
2252     BdrvChild *child;
2253     BlockDriverState *old_bs;
2254 } BdrvReplaceChildState;
2255 
2256 static void bdrv_replace_child_commit(void *opaque)
2257 {
2258     BdrvReplaceChildState *s = opaque;
2259 
2260     bdrv_unref(s->old_bs);
2261 }
2262 
2263 static void bdrv_replace_child_abort(void *opaque)
2264 {
2265     BdrvReplaceChildState *s = opaque;
2266     BlockDriverState *new_bs = s->child->bs;
2267 
2268     /* old_bs reference is transparently moved from @s to @s->child */
2269     bdrv_replace_child_noperm(s->child, s->old_bs);
2270     bdrv_unref(new_bs);
2271 }
2272 
2273 static TransactionActionDrv bdrv_replace_child_drv = {
2274     .commit = bdrv_replace_child_commit,
2275     .abort = bdrv_replace_child_abort,
2276     .clean = g_free,
2277 };
2278 
2279 /*
2280  * bdrv_replace_child_tran
2281  *
2282  * Note: real unref of old_bs is done only on commit.
2283  *
2284  * The function doesn't update permissions, caller is responsible for this.
2285  */
2286 static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
2287                                     Transaction *tran)
2288 {
2289     BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
2290     *s = (BdrvReplaceChildState) {
2291         .child = child,
2292         .old_bs = child->bs,
2293     };
2294     tran_add(tran, &bdrv_replace_child_drv, s);
2295 
2296     if (new_bs) {
2297         bdrv_ref(new_bs);
2298     }
2299     bdrv_replace_child_noperm(child, new_bs);
2300     /* old_bs reference is transparently moved from @child to @s */
2301 }
2302 
2303 /*
2304  * Refresh permissions in @bs subtree. The function is intended to be called
2305  * after some graph modification that was done without permission update.
2306  */
2307 static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
2308                                   Transaction *tran, Error **errp)
2309 {
2310     BlockDriver *drv = bs->drv;
2311     BdrvChild *c;
2312     int ret;
2313     uint64_t cumulative_perms, cumulative_shared_perms;
2314 
2315     bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
2316 
2317     /* Write permissions never work with read-only images */
2318     if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2319         !bdrv_is_writable_after_reopen(bs, q))
2320     {
2321         if (!bdrv_is_writable_after_reopen(bs, NULL)) {
2322             error_setg(errp, "Block node is read-only");
2323         } else {
2324             error_setg(errp, "Read-only block node '%s' cannot support "
2325                        "read-write users", bdrv_get_node_name(bs));
2326         }
2327 
2328         return -EPERM;
2329     }
2330 
2331     /*
2332      * Unaligned requests will automatically be aligned to bl.request_alignment
2333      * and without RESIZE we can't extend requests to write to space beyond the
2334      * end of the image, so it's required that the image size is aligned.
2335      */
2336     if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2337         !(cumulative_perms & BLK_PERM_RESIZE))
2338     {
2339         if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) {
2340             error_setg(errp, "Cannot get 'write' permission without 'resize': "
2341                              "Image size is not a multiple of request "
2342                              "alignment");
2343             return -EPERM;
2344         }
2345     }
2346 
2347     /* Check this node */
2348     if (!drv) {
2349         return 0;
2350     }
2351 
2352     ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, tran,
2353                             errp);
2354     if (ret < 0) {
2355         return ret;
2356     }
2357 
2358     /* Drivers that never have children can omit .bdrv_child_perm() */
2359     if (!drv->bdrv_child_perm) {
2360         assert(QLIST_EMPTY(&bs->children));
2361         return 0;
2362     }
2363 
2364     /* Check all children */
2365     QLIST_FOREACH(c, &bs->children, next) {
2366         uint64_t cur_perm, cur_shared;
2367 
2368         bdrv_child_perm(bs, c->bs, c, c->role, q,
2369                         cumulative_perms, cumulative_shared_perms,
2370                         &cur_perm, &cur_shared);
2371         bdrv_child_set_perm(c, cur_perm, cur_shared, tran);
2372     }
2373 
2374     return 0;
2375 }
2376 
2377 static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
2378                                    Transaction *tran, Error **errp)
2379 {
2380     int ret;
2381     BlockDriverState *bs;
2382 
2383     for ( ; list; list = list->next) {
2384         bs = list->data;
2385 
2386         if (bdrv_parent_perms_conflict(bs, errp)) {
2387             return -EINVAL;
2388         }
2389 
2390         ret = bdrv_node_refresh_perm(bs, q, tran, errp);
2391         if (ret < 0) {
2392             return ret;
2393         }
2394     }
2395 
2396     return 0;
2397 }
2398 
2399 void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
2400                               uint64_t *shared_perm)
2401 {
2402     BdrvChild *c;
2403     uint64_t cumulative_perms = 0;
2404     uint64_t cumulative_shared_perms = BLK_PERM_ALL;
2405 
2406     QLIST_FOREACH(c, &bs->parents, next_parent) {
2407         cumulative_perms |= c->perm;
2408         cumulative_shared_perms &= c->shared_perm;
2409     }
2410 
2411     *perm = cumulative_perms;
2412     *shared_perm = cumulative_shared_perms;
2413 }
2414 
2415 char *bdrv_perm_names(uint64_t perm)
2416 {
2417     struct perm_name {
2418         uint64_t perm;
2419         const char *name;
2420     } permissions[] = {
2421         { BLK_PERM_CONSISTENT_READ, "consistent read" },
2422         { BLK_PERM_WRITE,           "write" },
2423         { BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
2424         { BLK_PERM_RESIZE,          "resize" },
2425         { BLK_PERM_GRAPH_MOD,       "change children" },
2426         { 0, NULL }
2427     };
2428 
2429     GString *result = g_string_sized_new(30);
2430     struct perm_name *p;
2431 
2432     for (p = permissions; p->name; p++) {
2433         if (perm & p->perm) {
2434             if (result->len > 0) {
2435                 g_string_append(result, ", ");
2436             }
2437             g_string_append(result, p->name);
2438         }
2439     }
2440 
2441     return g_string_free(result, FALSE);
2442 }
2443 
2444 
2445 static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp)
2446 {
2447     int ret;
2448     Transaction *tran = tran_new();
2449     g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
2450 
2451     ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
2452     tran_finalize(tran, ret);
2453 
2454     return ret;
2455 }
2456 
2457 int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
2458                             Error **errp)
2459 {
2460     Error *local_err = NULL;
2461     Transaction *tran = tran_new();
2462     int ret;
2463 
2464     bdrv_child_set_perm(c, perm, shared, tran);
2465 
2466     ret = bdrv_refresh_perms(c->bs, &local_err);
2467 
2468     tran_finalize(tran, ret);
2469 
2470     if (ret < 0) {
2471         if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
2472             /* tighten permissions */
2473             error_propagate(errp, local_err);
2474         } else {
2475             /*
2476              * Our caller may intend to only loosen restrictions and
2477              * does not expect this function to fail.  Errors are not
2478              * fatal in such a case, so we can just hide them from our
2479              * caller.
2480              */
2481             error_free(local_err);
2482             ret = 0;
2483         }
2484     }
2485 
2486     return ret;
2487 }
2488 
2489 int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
2490 {
2491     uint64_t parent_perms, parent_shared;
2492     uint64_t perms, shared;
2493 
2494     bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
2495     bdrv_child_perm(bs, c->bs, c, c->role, NULL,
2496                     parent_perms, parent_shared, &perms, &shared);
2497 
2498     return bdrv_child_try_set_perm(c, perms, shared, errp);
2499 }
2500 
2501 /*
2502  * Default implementation for .bdrv_child_perm() for block filters:
2503  * Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED, and RESIZE to the
2504  * filtered child.
2505  */
2506 static void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
2507                                       BdrvChildRole role,
2508                                       BlockReopenQueue *reopen_queue,
2509                                       uint64_t perm, uint64_t shared,
2510                                       uint64_t *nperm, uint64_t *nshared)
2511 {
2512     *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
2513     *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
2514 }
2515 
2516 static void bdrv_default_perms_for_cow(BlockDriverState *bs, BdrvChild *c,
2517                                        BdrvChildRole role,
2518                                        BlockReopenQueue *reopen_queue,
2519                                        uint64_t perm, uint64_t shared,
2520                                        uint64_t *nperm, uint64_t *nshared)
2521 {
2522     assert(role & BDRV_CHILD_COW);
2523 
2524     /*
2525      * We want consistent read from backing files if the parent needs it.
2526      * No other operations are performed on backing files.
2527      */
2528     perm &= BLK_PERM_CONSISTENT_READ;
2529 
2530     /*
2531      * If the parent can deal with changing data, we're okay with a
2532      * writable and resizable backing file.
2533      * TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too?
2534      */
2535     if (shared & BLK_PERM_WRITE) {
2536         shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2537     } else {
2538         shared = 0;
2539     }
2540 
2541     shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_GRAPH_MOD |
2542               BLK_PERM_WRITE_UNCHANGED;
2543 
2544     if (bs->open_flags & BDRV_O_INACTIVE) {
2545         shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2546     }
2547 
2548     *nperm = perm;
2549     *nshared = shared;
2550 }
2551 
2552 static void bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
2553                                            BdrvChildRole role,
2554                                            BlockReopenQueue *reopen_queue,
2555                                            uint64_t perm, uint64_t shared,
2556                                            uint64_t *nperm, uint64_t *nshared)
2557 {
2558     int flags;
2559 
2560     assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
2561 
2562     flags = bdrv_reopen_get_flags(reopen_queue, bs);
2563 
2564     /*
2565      * Apart from the modifications below, the same permissions are
2566      * forwarded and left alone as for filters
2567      */
2568     bdrv_filter_default_perms(bs, c, role, reopen_queue,
2569                               perm, shared, &perm, &shared);
2570 
2571     if (role & BDRV_CHILD_METADATA) {
2572         /* Format drivers may touch metadata even if the guest doesn't write */
2573         if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
2574             perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2575         }
2576 
2577         /*
2578          * bs->file always needs to be consistent because of the
2579          * metadata. We can never allow other users to resize or write
2580          * to it.
2581          */
2582         if (!(flags & BDRV_O_NO_IO)) {
2583             perm |= BLK_PERM_CONSISTENT_READ;
2584         }
2585         shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
2586     }
2587 
2588     if (role & BDRV_CHILD_DATA) {
2589         /*
2590          * Technically, everything in this block is a subset of the
2591          * BDRV_CHILD_METADATA path taken above, and so this could
2592          * be an "else if" branch.  However, that is not obvious, and
2593          * this function is not performance critical, therefore we let
2594          * this be an independent "if".
2595          */
2596 
2597         /*
2598          * We cannot allow other users to resize the file because the
2599          * format driver might have some assumptions about the size
2600          * (e.g. because it is stored in metadata, or because the file
2601          * is split into fixed-size data files).
2602          */
2603         shared &= ~BLK_PERM_RESIZE;
2604 
2605         /*
2606          * WRITE_UNCHANGED often cannot be performed as such on the
2607          * data file.  For example, the qcow2 driver may still need to
2608          * write copied clusters on copy-on-read.
2609          */
2610         if (perm & BLK_PERM_WRITE_UNCHANGED) {
2611             perm |= BLK_PERM_WRITE;
2612         }
2613 
2614         /*
2615          * If the data file is written to, the format driver may
2616          * expect to be able to resize it by writing beyond the EOF.
2617          */
2618         if (perm & BLK_PERM_WRITE) {
2619             perm |= BLK_PERM_RESIZE;
2620         }
2621     }
2622 
2623     if (bs->open_flags & BDRV_O_INACTIVE) {
2624         shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2625     }
2626 
2627     *nperm = perm;
2628     *nshared = shared;
2629 }
2630 
2631 void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
2632                         BdrvChildRole role, BlockReopenQueue *reopen_queue,
2633                         uint64_t perm, uint64_t shared,
2634                         uint64_t *nperm, uint64_t *nshared)
2635 {
2636     if (role & BDRV_CHILD_FILTERED) {
2637         assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
2638                          BDRV_CHILD_COW)));
2639         bdrv_filter_default_perms(bs, c, role, reopen_queue,
2640                                   perm, shared, nperm, nshared);
2641     } else if (role & BDRV_CHILD_COW) {
2642         assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA)));
2643         bdrv_default_perms_for_cow(bs, c, role, reopen_queue,
2644                                    perm, shared, nperm, nshared);
2645     } else if (role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA)) {
2646         bdrv_default_perms_for_storage(bs, c, role, reopen_queue,
2647                                        perm, shared, nperm, nshared);
2648     } else {
2649         g_assert_not_reached();
2650     }
2651 }
2652 
2653 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
2654 {
2655     static const uint64_t permissions[] = {
2656         [BLOCK_PERMISSION_CONSISTENT_READ]  = BLK_PERM_CONSISTENT_READ,
2657         [BLOCK_PERMISSION_WRITE]            = BLK_PERM_WRITE,
2658         [BLOCK_PERMISSION_WRITE_UNCHANGED]  = BLK_PERM_WRITE_UNCHANGED,
2659         [BLOCK_PERMISSION_RESIZE]           = BLK_PERM_RESIZE,
2660         [BLOCK_PERMISSION_GRAPH_MOD]        = BLK_PERM_GRAPH_MOD,
2661     };
2662 
2663     QEMU_BUILD_BUG_ON(ARRAY_SIZE(permissions) != BLOCK_PERMISSION__MAX);
2664     QEMU_BUILD_BUG_ON(1UL << ARRAY_SIZE(permissions) != BLK_PERM_ALL + 1);
2665 
2666     assert(qapi_perm < BLOCK_PERMISSION__MAX);
2667 
2668     return permissions[qapi_perm];
2669 }
2670 
/*
 * Make @child point at @new_bs instead of its current node.  Either node
 * may be NULL, but they must differ, and @child must not be frozen (both
 * asserted).  If both nodes are non-NULL they must share an AioContext.
 *
 * Only the graph link and the parent's drained state are handled here:
 * @child is moved between the nodes' parent lists, the child class'
 * detach/attach callbacks are invoked, and drained_begin/drained_end are
 * issued so the parent's quiesce state matches the new node.  No reference
 * counts are changed and no permissions are updated by this function.
 */
static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs)
{
    BlockDriverState *old_bs = child->bs;
    int new_bs_quiesce_counter;
    int drain_saldo;

    assert(!child->frozen);
    assert(old_bs != new_bs);

    if (old_bs && new_bs) {
        assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
    }

    /*
     * drain_saldo is the number of drained sections the parent still has to
     * enter (positive) or leave (negative) to match the new node.
     */
    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
    drain_saldo = new_bs_quiesce_counter - child->parent_quiesce_counter;

    /*
     * If the new child node is drained but the old one was not, flush
     * all outstanding requests to the old child node.
     */
    while (drain_saldo > 0 && child->klass->drained_begin) {
        bdrv_parent_drained_begin_single(child, true);
        drain_saldo--;
    }

    if (old_bs) {
        /* Detach first so that the recursive drain sections coming from @child
         * are already gone and we only end the drain sections that came from
         * elsewhere. */
        if (child->klass->detach) {
            child->klass->detach(child);
        }
        QLIST_REMOVE(child, next_parent);
    }

    child->bs = new_bs;

    if (new_bs) {
        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);

        /*
         * Detaching the old node may have led to the new node's
         * quiesce_counter having been decreased.  Not a problem, we
         * just need to recognize this here and then invoke
         * drained_end appropriately more often.
         */
        assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
        drain_saldo += new_bs->quiesce_counter - new_bs_quiesce_counter;

        /* Attach only after starting new drained sections, so that recursive
         * drain sections coming from @child don't get an extra .drained_begin
         * callback. */
        if (child->klass->attach) {
            child->klass->attach(child);
        }
    }

    /*
     * If the old child node was drained but the new one is not, allow
     * requests to come in only after the new node has been attached.
     */
    while (drain_saldo < 0 && child->klass->drained_end) {
        bdrv_parent_drained_end_single(child);
        drain_saldo++;
    }
}
2738 
2739 static void bdrv_child_free(void *opaque)
2740 {
2741     BdrvChild *c = opaque;
2742 
2743     g_free(c->name);
2744     g_free(c);
2745 }
2746 
2747 static void bdrv_remove_empty_child(BdrvChild *child)
2748 {
2749     assert(!child->bs);
2750     QLIST_SAFE_REMOVE(child, next);
2751     bdrv_child_free(child);
2752 }
2753 
2754 typedef struct BdrvAttachChildCommonState {
2755     BdrvChild **child;
2756     AioContext *old_parent_ctx;
2757     AioContext *old_child_ctx;
2758 } BdrvAttachChildCommonState;
2759 
/*
 * Transaction abort handler that undoes an attach: the edge is detached
 * from its node, the node and the parent are moved back to the AioContexts
 * recorded in the saved state if they differ, one reference on the node is
 * dropped, the now empty edge is freed and the caller's BdrvChild pointer
 * is reset to NULL.
 */
static void bdrv_attach_child_common_abort(void *opaque)
{
    BdrvAttachChildCommonState *s = opaque;
    BdrvChild *child = *s->child;
    /* Remember the node: child->bs becomes NULL in the next step */
    BlockDriverState *bs = child->bs;

    bdrv_replace_child_noperm(child, NULL);

    /* Move the node back; a failure here aborts via error_abort */
    if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
        bdrv_try_set_aio_context(bs, s->old_child_ctx, &error_abort);
    }

    if (bdrv_child_get_parent_aio_context(child) != s->old_parent_ctx) {
        GSList *ignore = g_slist_prepend(NULL, child);

        /* Check first (aborting on failure), then actually switch */
        child->klass->can_set_aio_ctx(child, s->old_parent_ctx, &ignore,
                                      &error_abort);
        /* Start the second call from a fresh ignore list */
        g_slist_free(ignore);
        ignore = g_slist_prepend(NULL, child);
        child->klass->set_aio_ctx(child, s->old_parent_ctx, &ignore);

        g_slist_free(ignore);
    }

    bdrv_unref(bs);
    bdrv_remove_empty_child(child);
    *s->child = NULL;
}
2788 
2789 static TransactionActionDrv bdrv_attach_child_common_drv = {
2790     .abort = bdrv_attach_child_common_abort,
2791     .clean = g_free,
2792 };
2793 
2794 /*
2795  * Common part of attaching bdrv child to bs or to blk or to job
2796  *
2797  * Resulting new child is returned through @child.
2798  * At start *@child must be NULL.
2799  * @child is saved to a new entry of @tran, so that *@child could be reverted to
2800  * NULL on abort(). So referenced variable must live at least until transaction
2801  * end.
2802  *
2803  * Function doesn't update permissions, caller is responsible for this.
2804  */
2805 static int bdrv_attach_child_common(BlockDriverState *child_bs,
2806                                     const char *child_name,
2807                                     const BdrvChildClass *child_class,
2808                                     BdrvChildRole child_role,
2809                                     uint64_t perm, uint64_t shared_perm,
2810                                     void *opaque, BdrvChild **child,
2811                                     Transaction *tran, Error **errp)
2812 {
2813     BdrvChild *new_child;
2814     AioContext *parent_ctx;
2815     AioContext *child_ctx = bdrv_get_aio_context(child_bs);
2816 
2817     assert(child);
2818     assert(*child == NULL);
2819     assert(child_class->get_parent_desc);
2820 
2821     new_child = g_new(BdrvChild, 1);
2822     *new_child = (BdrvChild) {
2823         .bs             = NULL,
2824         .name           = g_strdup(child_name),
2825         .klass          = child_class,
2826         .role           = child_role,
2827         .perm           = perm,
2828         .shared_perm    = shared_perm,
2829         .opaque         = opaque,
2830     };
2831 
2832     /*
2833      * If the AioContexts don't match, first try to move the subtree of
2834      * child_bs into the AioContext of the new parent. If this doesn't work,
2835      * try moving the parent into the AioContext of child_bs instead.
2836      */
2837     parent_ctx = bdrv_child_get_parent_aio_context(new_child);
2838     if (child_ctx != parent_ctx) {
2839         Error *local_err = NULL;
2840         int ret = bdrv_try_set_aio_context(child_bs, parent_ctx, &local_err);
2841 
2842         if (ret < 0 && child_class->can_set_aio_ctx) {
2843             GSList *ignore = g_slist_prepend(NULL, new_child);
2844             if (child_class->can_set_aio_ctx(new_child, child_ctx, &ignore,
2845                                              NULL))
2846             {
2847                 error_free(local_err);
2848                 ret = 0;
2849                 g_slist_free(ignore);
2850                 ignore = g_slist_prepend(NULL, new_child);
2851                 child_class->set_aio_ctx(new_child, child_ctx, &ignore);
2852             }
2853             g_slist_free(ignore);
2854         }
2855 
2856         if (ret < 0) {
2857             error_propagate(errp, local_err);
2858             bdrv_remove_empty_child(new_child);
2859             return ret;
2860         }
2861     }
2862 
2863     bdrv_ref(child_bs);
2864     bdrv_replace_child_noperm(new_child, child_bs);
2865 
2866     *child = new_child;
2867 
2868     BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
2869     *s = (BdrvAttachChildCommonState) {
2870         .child = child,
2871         .old_parent_ctx = parent_ctx,
2872         .old_child_ctx = child_ctx,
2873     };
2874     tran_add(tran, &bdrv_attach_child_common_drv, s);
2875 
2876     return 0;
2877 }
2878 
2879 /*
2880  * Variable referenced by @child must live at least until transaction end.
2881  * (see bdrv_attach_child_common() doc for details)
2882  *
2883  * Function doesn't update permissions, caller is responsible for this.
2884  */
2885 static int bdrv_attach_child_noperm(BlockDriverState *parent_bs,
2886                                     BlockDriverState *child_bs,
2887                                     const char *child_name,
2888                                     const BdrvChildClass *child_class,
2889                                     BdrvChildRole child_role,
2890                                     BdrvChild **child,
2891                                     Transaction *tran,
2892                                     Error **errp)
2893 {
2894     int ret;
2895     uint64_t perm, shared_perm;
2896 
2897     assert(parent_bs->drv);
2898 
2899     if (bdrv_recurse_has_child(child_bs, parent_bs)) {
2900         error_setg(errp, "Making '%s' a %s child of '%s' would create a cycle",
2901                    child_bs->node_name, child_name, parent_bs->node_name);
2902         return -EINVAL;
2903     }
2904 
2905     bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
2906     bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
2907                     perm, shared_perm, &perm, &shared_perm);
2908 
2909     ret = bdrv_attach_child_common(child_bs, child_name, child_class,
2910                                    child_role, perm, shared_perm, parent_bs,
2911                                    child, tran, errp);
2912     if (ret < 0) {
2913         return ret;
2914     }
2915 
2916     QLIST_INSERT_HEAD(&parent_bs->children, *child, next);
2917     /*
2918      * child is removed in bdrv_attach_child_common_abort(), so don't care to
2919      * abort this change separately.
2920      */
2921 
2922     return 0;
2923 }
2924 
2925 static void bdrv_detach_child(BdrvChild *child)
2926 {
2927     BlockDriverState *old_bs = child->bs;
2928 
2929     bdrv_replace_child_noperm(child, NULL);
2930     bdrv_remove_empty_child(child);
2931 
2932     if (old_bs) {
2933         /*
2934          * Update permissions for old node. We're just taking a parent away, so
2935          * we're loosening restrictions. Errors of permission update are not
2936          * fatal in this case, ignore them.
2937          */
2938         bdrv_refresh_perms(old_bs, NULL);
2939 
2940         /*
2941          * When the parent requiring a non-default AioContext is removed, the
2942          * node moves back to the main AioContext
2943          */
2944         bdrv_try_set_aio_context(old_bs, qemu_get_aio_context(), NULL);
2945     }
2946 }
2947 
2948 /*
2949  * This function steals the reference to child_bs from the caller.
2950  * That reference is later dropped by bdrv_root_unref_child().
2951  *
2952  * On failure NULL is returned, errp is set and the reference to
2953  * child_bs is also dropped.
2954  *
2955  * The caller must hold the AioContext lock @child_bs, but not that of @ctx
2956  * (unless @child_bs is already in @ctx).
2957  */
2958 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
2959                                   const char *child_name,
2960                                   const BdrvChildClass *child_class,
2961                                   BdrvChildRole child_role,
2962                                   uint64_t perm, uint64_t shared_perm,
2963                                   void *opaque, Error **errp)
2964 {
2965     int ret;
2966     BdrvChild *child = NULL;
2967     Transaction *tran = tran_new();
2968 
2969     ret = bdrv_attach_child_common(child_bs, child_name, child_class,
2970                                    child_role, perm, shared_perm, opaque,
2971                                    &child, tran, errp);
2972     if (ret < 0) {
2973         goto out;
2974     }
2975 
2976     ret = bdrv_refresh_perms(child_bs, errp);
2977 
2978 out:
2979     tran_finalize(tran, ret);
2980     /* child is unset on failure by bdrv_attach_child_common_abort() */
2981     assert((ret < 0) == !child);
2982 
2983     bdrv_unref(child_bs);
2984     return child;
2985 }
2986 
2987 /*
2988  * This function transfers the reference to child_bs from the caller
2989  * to parent_bs. That reference is later dropped by parent_bs on
2990  * bdrv_close() or if someone calls bdrv_unref_child().
2991  *
2992  * On failure NULL is returned, errp is set and the reference to
2993  * child_bs is also dropped.
2994  *
2995  * If @parent_bs and @child_bs are in different AioContexts, the caller must
2996  * hold the AioContext lock for @child_bs, but not for @parent_bs.
2997  */
2998 BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
2999                              BlockDriverState *child_bs,
3000                              const char *child_name,
3001                              const BdrvChildClass *child_class,
3002                              BdrvChildRole child_role,
3003                              Error **errp)
3004 {
3005     int ret;
3006     BdrvChild *child = NULL;
3007     Transaction *tran = tran_new();
3008 
3009     ret = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, child_class,
3010                                    child_role, &child, tran, errp);
3011     if (ret < 0) {
3012         goto out;
3013     }
3014 
3015     ret = bdrv_refresh_perms(parent_bs, errp);
3016     if (ret < 0) {
3017         goto out;
3018     }
3019 
3020 out:
3021     tran_finalize(tran, ret);
3022     /* child is unset on failure by bdrv_attach_child_common_abort() */
3023     assert((ret < 0) == !child);
3024 
3025     bdrv_unref(child_bs);
3026 
3027     return child;
3028 }
3029 
3030 /* Callers must ensure that child->frozen is false. */
3031 void bdrv_root_unref_child(BdrvChild *child)
3032 {
3033     BlockDriverState *child_bs;
3034 
3035     child_bs = child->bs;
3036     bdrv_detach_child(child);
3037     bdrv_unref(child_bs);
3038 }
3039 
3040 typedef struct BdrvSetInheritsFrom {
3041     BlockDriverState *bs;
3042     BlockDriverState *old_inherits_from;
3043 } BdrvSetInheritsFrom;
3044 
3045 static void bdrv_set_inherits_from_abort(void *opaque)
3046 {
3047     BdrvSetInheritsFrom *s = opaque;
3048 
3049     s->bs->inherits_from = s->old_inherits_from;
3050 }
3051 
3052 static TransactionActionDrv bdrv_set_inherits_from_drv = {
3053     .abort = bdrv_set_inherits_from_abort,
3054     .clean = g_free,
3055 };
3056 
3057 /* @tran is allowed to be NULL. In this case no rollback is possible */
3058 static void bdrv_set_inherits_from(BlockDriverState *bs,
3059                                    BlockDriverState *new_inherits_from,
3060                                    Transaction *tran)
3061 {
3062     if (tran) {
3063         BdrvSetInheritsFrom *s = g_new(BdrvSetInheritsFrom, 1);
3064 
3065         *s = (BdrvSetInheritsFrom) {
3066             .bs = bs,
3067             .old_inherits_from = bs->inherits_from,
3068         };
3069 
3070         tran_add(tran, &bdrv_set_inherits_from_drv, s);
3071     }
3072 
3073     bs->inherits_from = new_inherits_from;
3074 }
3075 
3076 /**
3077  * Clear all inherits_from pointers from children and grandchildren of
3078  * @root that point to @root, where necessary.
3079  * @tran is allowed to be NULL. In this case no rollback is possible
3080  */
3081 static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
3082                                      Transaction *tran)
3083 {
3084     BdrvChild *c;
3085 
3086     if (child->bs->inherits_from == root) {
3087         /*
3088          * Remove inherits_from only when the last reference between root and
3089          * child->bs goes away.
3090          */
3091         QLIST_FOREACH(c, &root->children, next) {
3092             if (c != child && c->bs == child->bs) {
3093                 break;
3094             }
3095         }
3096         if (c == NULL) {
3097             bdrv_set_inherits_from(child->bs, NULL, tran);
3098         }
3099     }
3100 
3101     QLIST_FOREACH(c, &child->bs->children, next) {
3102         bdrv_unset_inherits_from(root, c, tran);
3103     }
3104 }
3105 
3106 /* Callers must ensure that child->frozen is false. */
3107 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
3108 {
3109     if (child == NULL) {
3110         return;
3111     }
3112 
3113     bdrv_unset_inherits_from(parent, child, NULL);
3114     bdrv_root_unref_child(child);
3115 }
3116 
3117 
3118 static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
3119 {
3120     BdrvChild *c;
3121     QLIST_FOREACH(c, &bs->parents, next_parent) {
3122         if (c->klass->change_media) {
3123             c->klass->change_media(c, load);
3124         }
3125     }
3126 }
3127 
3128 /* Return true if you can reach parent going through child->inherits_from
3129  * recursively. If parent or child are NULL, return false */
3130 static bool bdrv_inherits_from_recursive(BlockDriverState *child,
3131                                          BlockDriverState *parent)
3132 {
3133     while (child && child != parent) {
3134         child = child->inherits_from;
3135     }
3136 
3137     return child != NULL;
3138 }
3139 
3140 /*
3141  * Return the BdrvChildRole for @bs's backing child.  bs->backing is
3142  * mostly used for COW backing children (role = COW), but also for
3143  * filtered children (role = FILTERED | PRIMARY).
3144  */
3145 static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
3146 {
3147     if (bs->drv && bs->drv->is_filter) {
3148         return BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3149     } else {
3150         return BDRV_CHILD_COW;
3151     }
3152 }
3153 
3154 /*
3155  * Sets the bs->backing or bs->file link of a BDS. A new reference is created;
3156  * callers which don't need their own reference any more must call bdrv_unref().
3157  *
3158  * Function doesn't update permissions, caller is responsible for this.
3159  */
3160 static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
3161                                            BlockDriverState *child_bs,
3162                                            bool is_backing,
3163                                            Transaction *tran, Error **errp)
3164 {
3165     int ret = 0;
3166     bool update_inherits_from =
3167         bdrv_inherits_from_recursive(child_bs, parent_bs);
3168     BdrvChild *child = is_backing ? parent_bs->backing : parent_bs->file;
3169     BdrvChildRole role;
3170 
3171     if (!parent_bs->drv) {
3172         /*
3173          * Node without drv is an object without a class :/. TODO: finally fix
3174          * qcow2 driver to never clear bs->drv and implement format corruption
3175          * handling in other way.
3176          */
3177         error_setg(errp, "Node corrupted");
3178         return -EINVAL;
3179     }
3180 
3181     if (child && child->frozen) {
3182         error_setg(errp, "Cannot change frozen '%s' link from '%s' to '%s'",
3183                    child->name, parent_bs->node_name, child->bs->node_name);
3184         return -EPERM;
3185     }
3186 
3187     if (is_backing && !parent_bs->drv->is_filter &&
3188         !parent_bs->drv->supports_backing)
3189     {
3190         error_setg(errp, "Driver '%s' of node '%s' does not support backing "
3191                    "files", parent_bs->drv->format_name, parent_bs->node_name);
3192         return -EINVAL;
3193     }
3194 
3195     if (parent_bs->drv->is_filter) {
3196         role = BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3197     } else if (is_backing) {
3198         role = BDRV_CHILD_COW;
3199     } else {
3200         /*
3201          * We only can use same role as it is in existing child. We don't have
3202          * infrastructure to determine role of file child in generic way
3203          */
3204         if (!child) {
3205             error_setg(errp, "Cannot set file child to format node without "
3206                        "file child");
3207             return -EINVAL;
3208         }
3209         role = child->role;
3210     }
3211 
3212     if (child) {
3213         bdrv_unset_inherits_from(parent_bs, child, tran);
3214         bdrv_remove_file_or_backing_child(parent_bs, child, tran);
3215     }
3216 
3217     if (!child_bs) {
3218         goto out;
3219     }
3220 
3221     ret = bdrv_attach_child_noperm(parent_bs, child_bs,
3222                                    is_backing ? "backing" : "file",
3223                                    &child_of_bds, role,
3224                                    is_backing ? &parent_bs->backing :
3225                                                 &parent_bs->file,
3226                                    tran, errp);
3227     if (ret < 0) {
3228         return ret;
3229     }
3230 
3231 
3232     /*
3233      * If inherits_from pointed recursively to bs then let's update it to
3234      * point directly to bs (else it will become NULL).
3235      */
3236     if (update_inherits_from) {
3237         bdrv_set_inherits_from(child_bs, parent_bs, tran);
3238     }
3239 
3240 out:
3241     bdrv_refresh_limits(parent_bs, tran, NULL);
3242 
3243     return 0;
3244 }
3245 
3246 static int bdrv_set_backing_noperm(BlockDriverState *bs,
3247                                    BlockDriverState *backing_hd,
3248                                    Transaction *tran, Error **errp)
3249 {
3250     return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
3251 }
3252 
3253 int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
3254                         Error **errp)
3255 {
3256     int ret;
3257     Transaction *tran = tran_new();
3258 
3259     ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
3260     if (ret < 0) {
3261         goto out;
3262     }
3263 
3264     ret = bdrv_refresh_perms(bs, errp);
3265 out:
3266     tran_finalize(tran, ret);
3267 
3268     return ret;
3269 }
3270 
3271 /*
3272  * Opens the backing file for a BlockDriverState if not yet open
3273  *
3274  * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
3275  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3276  * itself, all options starting with "${bdref_key}." are considered part of the
3277  * BlockdevRef.
3278  *
3279  * TODO Can this be unified with bdrv_open_image()?
3280  */
3281 int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
3282                            const char *bdref_key, Error **errp)
3283 {
3284     char *backing_filename = NULL;
3285     char *bdref_key_dot;
3286     const char *reference = NULL;
3287     int ret = 0;
3288     bool implicit_backing = false;
3289     BlockDriverState *backing_hd;
3290     QDict *options;
3291     QDict *tmp_parent_options = NULL;
3292     Error *local_err = NULL;
3293 
3294     if (bs->backing != NULL) {
3295         goto free_exit;
3296     }
3297 
3298     /* NULL means an empty set of options */
3299     if (parent_options == NULL) {
3300         tmp_parent_options = qdict_new();
3301         parent_options = tmp_parent_options;
3302     }
3303 
3304     bs->open_flags &= ~BDRV_O_NO_BACKING;
3305 
3306     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3307     qdict_extract_subqdict(parent_options, &options, bdref_key_dot);
3308     g_free(bdref_key_dot);
3309 
3310     /*
3311      * Caution: while qdict_get_try_str() is fine, getting non-string
3312      * types would require more care.  When @parent_options come from
3313      * -blockdev or blockdev_add, its members are typed according to
3314      * the QAPI schema, but when they come from -drive, they're all
3315      * QString.
3316      */
3317     reference = qdict_get_try_str(parent_options, bdref_key);
3318     if (reference || qdict_haskey(options, "file.filename")) {
3319         /* keep backing_filename NULL */
3320     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
3321         qobject_unref(options);
3322         goto free_exit;
3323     } else {
3324         if (qdict_size(options) == 0) {
3325             /* If the user specifies options that do not modify the
3326              * backing file's behavior, we might still consider it the
3327              * implicit backing file.  But it's easier this way, and
3328              * just specifying some of the backing BDS's options is
3329              * only possible with -drive anyway (otherwise the QAPI
3330              * schema forces the user to specify everything). */
3331             implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
3332         }
3333 
3334         backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
3335         if (local_err) {
3336             ret = -EINVAL;
3337             error_propagate(errp, local_err);
3338             qobject_unref(options);
3339             goto free_exit;
3340         }
3341     }
3342 
3343     if (!bs->drv || !bs->drv->supports_backing) {
3344         ret = -EINVAL;
3345         error_setg(errp, "Driver doesn't support backing files");
3346         qobject_unref(options);
3347         goto free_exit;
3348     }
3349 
3350     if (!reference &&
3351         bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
3352         qdict_put_str(options, "driver", bs->backing_format);
3353     }
3354 
3355     backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
3356                                    &child_of_bds, bdrv_backing_role(bs), errp);
3357     if (!backing_hd) {
3358         bs->open_flags |= BDRV_O_NO_BACKING;
3359         error_prepend(errp, "Could not open backing file: ");
3360         ret = -EINVAL;
3361         goto free_exit;
3362     }
3363 
3364     if (implicit_backing) {
3365         bdrv_refresh_filename(backing_hd);
3366         pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
3367                 backing_hd->filename);
3368     }
3369 
3370     /* Hook up the backing file link; drop our reference, bs owns the
3371      * backing_hd reference now */
3372     ret = bdrv_set_backing_hd(bs, backing_hd, errp);
3373     bdrv_unref(backing_hd);
3374     if (ret < 0) {
3375         goto free_exit;
3376     }
3377 
3378     qdict_del(parent_options, bdref_key);
3379 
3380 free_exit:
3381     g_free(backing_filename);
3382     qobject_unref(tmp_parent_options);
3383     return ret;
3384 }
3385 
3386 static BlockDriverState *
3387 bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
3388                    BlockDriverState *parent, const BdrvChildClass *child_class,
3389                    BdrvChildRole child_role, bool allow_none, Error **errp)
3390 {
3391     BlockDriverState *bs = NULL;
3392     QDict *image_options;
3393     char *bdref_key_dot;
3394     const char *reference;
3395 
3396     assert(child_class != NULL);
3397 
3398     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3399     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
3400     g_free(bdref_key_dot);
3401 
3402     /*
3403      * Caution: while qdict_get_try_str() is fine, getting non-string
3404      * types would require more care.  When @options come from
3405      * -blockdev or blockdev_add, its members are typed according to
3406      * the QAPI schema, but when they come from -drive, they're all
3407      * QString.
3408      */
3409     reference = qdict_get_try_str(options, bdref_key);
3410     if (!filename && !reference && !qdict_size(image_options)) {
3411         if (!allow_none) {
3412             error_setg(errp, "A block device must be specified for \"%s\"",
3413                        bdref_key);
3414         }
3415         qobject_unref(image_options);
3416         goto done;
3417     }
3418 
3419     bs = bdrv_open_inherit(filename, reference, image_options, 0,
3420                            parent, child_class, child_role, errp);
3421     if (!bs) {
3422         goto done;
3423     }
3424 
3425 done:
3426     qdict_del(options, bdref_key);
3427     return bs;
3428 }
3429 
3430 /*
3431  * Opens a disk image whose options are given as BlockdevRef in another block
3432  * device's options.
3433  *
3434  * If allow_none is true, no image will be opened if filename is false and no
3435  * BlockdevRef is given. NULL will be returned, but errp remains unset.
3436  *
3437  * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
3438  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3439  * itself, all options starting with "${bdref_key}." are considered part of the
3440  * BlockdevRef.
3441  *
3442  * The BlockdevRef will be removed from the options QDict.
3443  */
3444 BdrvChild *bdrv_open_child(const char *filename,
3445                            QDict *options, const char *bdref_key,
3446                            BlockDriverState *parent,
3447                            const BdrvChildClass *child_class,
3448                            BdrvChildRole child_role,
3449                            bool allow_none, Error **errp)
3450 {
3451     BlockDriverState *bs;
3452 
3453     bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
3454                             child_role, allow_none, errp);
3455     if (bs == NULL) {
3456         return NULL;
3457     }
3458 
3459     return bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
3460                              errp);
3461 }
3462 
3463 /*
3464  * TODO Future callers may need to specify parent/child_class in order for
3465  * option inheritance to work. Existing callers use it for the root node.
3466  */
3467 BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
3468 {
3469     BlockDriverState *bs = NULL;
3470     QObject *obj = NULL;
3471     QDict *qdict = NULL;
3472     const char *reference = NULL;
3473     Visitor *v = NULL;
3474 
3475     if (ref->type == QTYPE_QSTRING) {
3476         reference = ref->u.reference;
3477     } else {
3478         BlockdevOptions *options = &ref->u.definition;
3479         assert(ref->type == QTYPE_QDICT);
3480 
3481         v = qobject_output_visitor_new(&obj);
3482         visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
3483         visit_complete(v, &obj);
3484 
3485         qdict = qobject_to(QDict, obj);
3486         qdict_flatten(qdict);
3487 
3488         /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
3489          * compatibility with other callers) rather than what we want as the
3490          * real defaults. Apply the defaults here instead. */
3491         qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
3492         qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
3493         qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
3494         qdict_set_default_str(qdict, BDRV_OPT_AUTO_READ_ONLY, "off");
3495 
3496     }
3497 
3498     bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp);
3499     obj = NULL;
3500     qobject_unref(obj);
3501     visit_free(v);
3502     return bs;
3503 }
3504 
3505 static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
3506                                                    int flags,
3507                                                    QDict *snapshot_options,
3508                                                    Error **errp)
3509 {
3510     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
3511     char *tmp_filename = g_malloc0(PATH_MAX + 1);
3512     int64_t total_size;
3513     QemuOpts *opts = NULL;
3514     BlockDriverState *bs_snapshot = NULL;
3515     int ret;
3516 
3517     /* if snapshot, we create a temporary backing file and open it
3518        instead of opening 'filename' directly */
3519 
3520     /* Get the required size from the image */
3521     total_size = bdrv_getlength(bs);
3522     if (total_size < 0) {
3523         error_setg_errno(errp, -total_size, "Could not get image size");
3524         goto out;
3525     }
3526 
3527     /* Create the temporary image */
3528     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
3529     if (ret < 0) {
3530         error_setg_errno(errp, -ret, "Could not get temporary filename");
3531         goto out;
3532     }
3533 
3534     opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
3535                             &error_abort);
3536     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
3537     ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, errp);
3538     qemu_opts_del(opts);
3539     if (ret < 0) {
3540         error_prepend(errp, "Could not create temporary overlay '%s': ",
3541                       tmp_filename);
3542         goto out;
3543     }
3544 
3545     /* Prepare options QDict for the temporary file */
3546     qdict_put_str(snapshot_options, "file.driver", "file");
3547     qdict_put_str(snapshot_options, "file.filename", tmp_filename);
3548     qdict_put_str(snapshot_options, "driver", "qcow2");
3549 
3550     bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
3551     snapshot_options = NULL;
3552     if (!bs_snapshot) {
3553         goto out;
3554     }
3555 
3556     ret = bdrv_append(bs_snapshot, bs, errp);
3557     if (ret < 0) {
3558         bs_snapshot = NULL;
3559         goto out;
3560     }
3561 
3562 out:
3563     qobject_unref(snapshot_options);
3564     g_free(tmp_filename);
3565     return bs_snapshot;
3566 }
3567 
3568 /*
3569  * Opens a disk image (raw, qcow2, vmdk, ...)
3570  *
3571  * options is a QDict of options to pass to the block drivers, or NULL for an
3572  * empty set of options. The reference to the QDict belongs to the block layer
3573  * after the call (even on failure), so if the caller intends to reuse the
3574  * dictionary, it needs to use qobject_ref() before calling bdrv_open.
3575  *
3576  * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
3577  * If it is not NULL, the referenced BDS will be reused.
3578  *
3579  * The reference parameter may be used to specify an existing block device which
3580  * should be opened. If specified, neither options nor a filename may be given,
3581  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
3582  */
3583 static BlockDriverState *bdrv_open_inherit(const char *filename,
3584                                            const char *reference,
3585                                            QDict *options, int flags,
3586                                            BlockDriverState *parent,
3587                                            const BdrvChildClass *child_class,
3588                                            BdrvChildRole child_role,
3589                                            Error **errp)
3590 {
3591     int ret;
3592     BlockBackend *file = NULL;
3593     BlockDriverState *bs;
3594     BlockDriver *drv = NULL;
3595     BdrvChild *child;
3596     const char *drvname;
3597     const char *backing;
3598     Error *local_err = NULL;
3599     QDict *snapshot_options = NULL;
3600     int snapshot_flags = 0;
3601 
3602     assert(!child_class || !flags);
3603     assert(!child_class == !parent);
3604 
3605     if (reference) {
3606         bool options_non_empty = options ? qdict_size(options) : false;
3607         qobject_unref(options);
3608 
3609         if (filename || options_non_empty) {
3610             error_setg(errp, "Cannot reference an existing block device with "
3611                        "additional options or a new filename");
3612             return NULL;
3613         }
3614 
3615         bs = bdrv_lookup_bs(reference, reference, errp);
3616         if (!bs) {
3617             return NULL;
3618         }
3619 
3620         bdrv_ref(bs);
3621         return bs;
3622     }
3623 
3624     bs = bdrv_new();
3625 
3626     /* NULL means an empty set of options */
3627     if (options == NULL) {
3628         options = qdict_new();
3629     }
3630 
3631     /* json: syntax counts as explicit options, as if in the QDict */
3632     parse_json_protocol(options, &filename, &local_err);
3633     if (local_err) {
3634         goto fail;
3635     }
3636 
3637     bs->explicit_options = qdict_clone_shallow(options);
3638 
3639     if (child_class) {
3640         bool parent_is_format;
3641 
3642         if (parent->drv) {
3643             parent_is_format = parent->drv->is_format;
3644         } else {
3645             /*
3646              * parent->drv is not set yet because this node is opened for
3647              * (potential) format probing.  That means that @parent is going
3648              * to be a format node.
3649              */
3650             parent_is_format = true;
3651         }
3652 
3653         bs->inherits_from = parent;
3654         child_class->inherit_options(child_role, parent_is_format,
3655                                      &flags, options,
3656                                      parent->open_flags, parent->options);
3657     }
3658 
3659     ret = bdrv_fill_options(&options, filename, &flags, &local_err);
3660     if (ret < 0) {
3661         goto fail;
3662     }
3663 
3664     /*
3665      * Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
3666      * Caution: getting a boolean member of @options requires care.
3667      * When @options come from -blockdev or blockdev_add, members are
3668      * typed according to the QAPI schema, but when they come from
3669      * -drive, they're all QString.
3670      */
3671     if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
3672         !qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
3673         flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
3674     } else {
3675         flags &= ~BDRV_O_RDWR;
3676     }
3677 
3678     if (flags & BDRV_O_SNAPSHOT) {
3679         snapshot_options = qdict_new();
3680         bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
3681                                    flags, options);
3682         /* Let bdrv_backing_options() override "read-only" */
3683         qdict_del(options, BDRV_OPT_READ_ONLY);
3684         bdrv_inherited_options(BDRV_CHILD_COW, true,
3685                                &flags, options, flags, options);
3686     }
3687 
3688     bs->open_flags = flags;
3689     bs->options = options;
3690     options = qdict_clone_shallow(options);
3691 
3692     /* Find the right image format driver */
3693     /* See cautionary note on accessing @options above */
3694     drvname = qdict_get_try_str(options, "driver");
3695     if (drvname) {
3696         drv = bdrv_find_format(drvname);
3697         if (!drv) {
3698             error_setg(errp, "Unknown driver: '%s'", drvname);
3699             goto fail;
3700         }
3701     }
3702 
3703     assert(drvname || !(flags & BDRV_O_PROTOCOL));
3704 
3705     /* See cautionary note on accessing @options above */
3706     backing = qdict_get_try_str(options, "backing");
3707     if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
3708         (backing && *backing == '\0'))
3709     {
3710         if (backing) {
3711             warn_report("Use of \"backing\": \"\" is deprecated; "
3712                         "use \"backing\": null instead");
3713         }
3714         flags |= BDRV_O_NO_BACKING;
3715         qdict_del(bs->explicit_options, "backing");
3716         qdict_del(bs->options, "backing");
3717         qdict_del(options, "backing");
3718     }
3719 
3720     /* Open image file without format layer. This BlockBackend is only used for
3721      * probing, the block drivers will do their own bdrv_open_child() for the
3722      * same BDS, which is why we put the node name back into options. */
3723     if ((flags & BDRV_O_PROTOCOL) == 0) {
3724         BlockDriverState *file_bs;
3725 
3726         file_bs = bdrv_open_child_bs(filename, options, "file", bs,
3727                                      &child_of_bds, BDRV_CHILD_IMAGE,
3728                                      true, &local_err);
3729         if (local_err) {
3730             goto fail;
3731         }
3732         if (file_bs != NULL) {
3733             /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
3734              * looking at the header to guess the image format. This works even
3735              * in cases where a guest would not see a consistent state. */
3736             file = blk_new(bdrv_get_aio_context(file_bs), 0, BLK_PERM_ALL);
3737             blk_insert_bs(file, file_bs, &local_err);
3738             bdrv_unref(file_bs);
3739             if (local_err) {
3740                 goto fail;
3741             }
3742 
3743             qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
3744         }
3745     }
3746 
3747     /* Image format probing */
3748     bs->probed = !drv;
3749     if (!drv && file) {
3750         ret = find_image_format(file, filename, &drv, &local_err);
3751         if (ret < 0) {
3752             goto fail;
3753         }
3754         /*
3755          * This option update would logically belong in bdrv_fill_options(),
3756          * but we first need to open bs->file for the probing to work, while
3757          * opening bs->file already requires the (mostly) final set of options
3758          * so that cache mode etc. can be inherited.
3759          *
3760          * Adding the driver later is somewhat ugly, but it's not an option
3761          * that would ever be inherited, so it's correct. We just need to make
3762          * sure to update both bs->options (which has the full effective
3763          * options for bs) and options (which has file.* already removed).
3764          */
3765         qdict_put_str(bs->options, "driver", drv->format_name);
3766         qdict_put_str(options, "driver", drv->format_name);
3767     } else if (!drv) {
3768         error_setg(errp, "Must specify either driver or file");
3769         goto fail;
3770     }
3771 
3772     /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
3773     assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
3774     /* file must be NULL if a protocol BDS is about to be created
3775      * (the inverse results in an error message from bdrv_open_common()) */
3776     assert(!(flags & BDRV_O_PROTOCOL) || !file);
3777 
3778     /* Open the image */
3779     ret = bdrv_open_common(bs, file, options, &local_err);
3780     if (ret < 0) {
3781         goto fail;
3782     }
3783 
3784     if (file) {
3785         blk_unref(file);
3786         file = NULL;
3787     }
3788 
3789     /* If there is a backing file, use it */
3790     if ((flags & BDRV_O_NO_BACKING) == 0) {
3791         ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
3792         if (ret < 0) {
3793             goto close_and_fail;
3794         }
3795     }
3796 
3797     /* Remove all children options and references
3798      * from bs->options and bs->explicit_options */
3799     QLIST_FOREACH(child, &bs->children, next) {
3800         char *child_key_dot;
3801         child_key_dot = g_strdup_printf("%s.", child->name);
3802         qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
3803         qdict_extract_subqdict(bs->options, NULL, child_key_dot);
3804         qdict_del(bs->explicit_options, child->name);
3805         qdict_del(bs->options, child->name);
3806         g_free(child_key_dot);
3807     }
3808 
3809     /* Check if any unknown options were used */
3810     if (qdict_size(options) != 0) {
3811         const QDictEntry *entry = qdict_first(options);
3812         if (flags & BDRV_O_PROTOCOL) {
3813             error_setg(errp, "Block protocol '%s' doesn't support the option "
3814                        "'%s'", drv->format_name, entry->key);
3815         } else {
3816             error_setg(errp,
3817                        "Block format '%s' does not support the option '%s'",
3818                        drv->format_name, entry->key);
3819         }
3820 
3821         goto close_and_fail;
3822     }
3823 
3824     bdrv_parent_cb_change_media(bs, true);
3825 
3826     qobject_unref(options);
3827     options = NULL;
3828 
3829     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
3830      * temporary snapshot afterwards. */
3831     if (snapshot_flags) {
3832         BlockDriverState *snapshot_bs;
3833         snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
3834                                                 snapshot_options, &local_err);
3835         snapshot_options = NULL;
3836         if (local_err) {
3837             goto close_and_fail;
3838         }
3839         /* We are not going to return bs but the overlay on top of it
3840          * (snapshot_bs); thus, we have to drop the strong reference to bs
3841          * (which we obtained by calling bdrv_new()). bs will not be deleted,
3842          * though, because the overlay still has a reference to it. */
3843         bdrv_unref(bs);
3844         bs = snapshot_bs;
3845     }
3846 
3847     return bs;
3848 
3849 fail:
3850     blk_unref(file);
3851     qobject_unref(snapshot_options);
3852     qobject_unref(bs->explicit_options);
3853     qobject_unref(bs->options);
3854     qobject_unref(options);
3855     bs->options = NULL;
3856     bs->explicit_options = NULL;
3857     bdrv_unref(bs);
3858     error_propagate(errp, local_err);
3859     return NULL;
3860 
3861 close_and_fail:
3862     bdrv_unref(bs);
3863     qobject_unref(snapshot_options);
3864     qobject_unref(options);
3865     error_propagate(errp, local_err);
3866     return NULL;
3867 }
3868 
3869 BlockDriverState *bdrv_open(const char *filename, const char *reference,
3870                             QDict *options, int flags, Error **errp)
3871 {
3872     return bdrv_open_inherit(filename, reference, options, flags, NULL,
3873                              NULL, 0, errp);
3874 }
3875 
/*
 * Tell whether @str occurs in @list, an array of strings whose last
 * element is NULL.  A NULL @str or a NULL @list never matches.
 */
static bool is_str_in_list(const char *str, const char *const *list)
{
    const char *const *item;

    if (!str || !list) {
        return false;
    }

    for (item = list; *item != NULL; item++) {
        if (strcmp(str, *item) == 0) {
            return true;
        }
    }

    return false;
}
3889 
3890 /*
3891  * Check that every option set in @bs->options is also set in
3892  * @new_opts.
3893  *
3894  * Options listed in the common_options list and in
3895  * @bs->drv->mutable_opts are skipped.
3896  *
3897  * Return 0 on success, otherwise return -EINVAL and set @errp.
3898  */
3899 static int bdrv_reset_options_allowed(BlockDriverState *bs,
3900                                       const QDict *new_opts, Error **errp)
3901 {
3902     const QDictEntry *e;
3903     /* These options are common to all block drivers and are handled
3904      * in bdrv_reopen_prepare() so they can be left out of @new_opts */
3905     const char *const common_options[] = {
3906         "node-name", "discard", "cache.direct", "cache.no-flush",
3907         "read-only", "auto-read-only", "detect-zeroes", NULL
3908     };
3909 
3910     for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
3911         if (!qdict_haskey(new_opts, e->key) &&
3912             !is_str_in_list(e->key, common_options) &&
3913             !is_str_in_list(e->key, bs->drv->mutable_opts)) {
3914             error_setg(errp, "Option '%s' cannot be reset "
3915                        "to its default value", e->key);
3916             return -EINVAL;
3917         }
3918     }
3919 
3920     return 0;
3921 }
3922 
3923 /*
3924  * Returns true if @child can be reached recursively from @bs
3925  */
3926 static bool bdrv_recurse_has_child(BlockDriverState *bs,
3927                                    BlockDriverState *child)
3928 {
3929     BdrvChild *c;
3930 
3931     if (bs == child) {
3932         return true;
3933     }
3934 
3935     QLIST_FOREACH(c, &bs->children, next) {
3936         if (bdrv_recurse_has_child(c->bs, child)) {
3937             return true;
3938         }
3939     }
3940 
3941     return false;
3942 }
3943 
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices, then recurses into those children of bs
 * whose inherits_from points back to bs.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QTAILQ_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue. It must not be NULL.
 *
 * options contains the changed options for the associated bs
 * (the BlockReopenQueue takes ownership). NULL is treated as an empty QDict.
 *
 * klass, role, parent_is_format, parent_options and parent_flags describe
 * the parent that bs inherits from. They are only used when parent_options
 * is non-NULL: the flags then start at 0 and klass->inherit_options() is
 * called with these values. When parent_options is NULL the flags start
 * from bdrv_get_flags(bs) instead.
 *
 * keep_old_opts selects whether the current explicit and effective options
 * of bs are merged into options. When it is false, the queue entry's
 * backing_missing field records whether options lacks both "backing" and
 * "backing.driver".
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
 */
static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                 BlockDriverState *bs,
                                                 QDict *options,
                                                 const BdrvChildClass *klass,
                                                 BdrvChildRole role,
                                                 bool parent_is_format,
                                                 QDict *parent_options,
                                                 int parent_flags,
                                                 bool keep_old_opts)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    BdrvChild *child;
    QDict *old_options, *explicit_options, *options_copy;
    int flags;
    QemuOpts *opts;

    /* Make sure that the caller remembered to use a drained section. This is
     * important to avoid graph changes between the recursive queuing here and
     * bdrv_reopen_multiple(). */
    assert(bs->quiesce_counter > 0);

    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QTAILQ_INIT(bs_queue);
    }

    if (!options) {
        options = qdict_new();
    }

    /*
     * Check if this BlockDriverState is already in the queue. The code
     * below treats a non-NULL bs_entry as "existing entry found" and a
     * NULL bs_entry as "not queued yet".
     */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bs == bs_entry->state.bs) {
            break;
        }
    }

    /*
     * Precedence of options:
     * 1. Explicitly passed in options (highest)
     * 2. Retained from explicitly set options of bs
     * 3. Inherited from parent node
     * 4. Retained from effective options of bs
     */

    /*
     * Old explicitly set values (don't overwrite by inherited value).
     * If bs is already queued, the explicit options stored in its queue
     * entry are used rather than those of bs itself.
     */
    if (bs_entry || keep_old_opts) {
        old_options = qdict_clone_shallow(bs_entry ?
                                          bs_entry->state.explicit_options :
                                          bs->explicit_options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /* Snapshot taken before inherited and old effective options are added */
    explicit_options = qdict_clone_shallow(options);

    /* Inherit from parent node */
    if (parent_options) {
        flags = 0;
        klass->inherit_options(role, parent_is_format, &flags, options,
                               parent_flags, parent_options);
    } else {
        flags = bdrv_get_flags(bs);
    }

    if (keep_old_opts) {
        /* Old values are used for options that aren't set yet */
        old_options = qdict_clone_shallow(bs->options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /*
     * We have the final set of options so let's update the flags.
     * The absorb step works on a throw-away copy so that @options itself
     * is left untouched here.
     */
    options_copy = qdict_clone_shallow(options);
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options_copy, NULL);
    update_flags_from_options(&flags, opts);
    qemu_opts_del(opts);
    qobject_unref(options_copy);

    /* bdrv_open_inherit() sets and clears some additional flags internally */
    flags &= ~BDRV_O_PROTOCOL;
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    if (!bs_entry) {
        bs_entry = g_new0(BlockReopenQueueEntry, 1);
        QTAILQ_INSERT_TAIL(bs_queue, bs_entry, entry);
    } else {
        /* Re-queuing: drop the QDicts stored by the earlier call */
        qobject_unref(bs_entry->state.options);
        qobject_unref(bs_entry->state.explicit_options);
    }

    /* The queue entry now owns @options and @explicit_options */
    bs_entry->state.bs = bs;
    bs_entry->state.options = options;
    bs_entry->state.explicit_options = explicit_options;
    bs_entry->state.flags = flags;

    /*
     * If keep_old_opts is false then it means that unspecified
     * options must be reset to their original value. We don't allow
     * resetting 'backing' but we need to know if the option is
     * missing in order to decide if we have to return an error.
     */
    if (!keep_old_opts) {
        bs_entry->state.backing_missing =
            !qdict_haskey(options, "backing") &&
            !qdict_haskey(options, "backing.driver");
    }

    QLIST_FOREACH(child, &bs->children, next) {
        QDict *new_child_options = NULL;
        bool child_keep_old = keep_old_opts;

        /* reopen can only change the options of block devices that were
         * implicitly created and inherited options. For other (referenced)
         * block devices, a syntax like "backing.foo" results in an error. */
        if (child->bs->inherits_from != bs) {
            continue;
        }

        /* Check if the options contain a child reference */
        if (qdict_haskey(options, child->name)) {
            const char *childref = qdict_get_try_str(options, child->name);
            /*
             * The current child must not be reopened if the child
             * reference is null or points to a different node.
             */
            if (g_strcmp0(childref, child->bs->node_name)) {
                continue;
            }
            /*
             * If the child reference points to the current child then
             * reopen it with its existing set of options (note that
             * it can still inherit new options from the parent).
             */
            child_keep_old = true;
        } else {
            /*
             * Extract child options ("child-name.*"): they are dropped
             * from explicit_options and moved from options into
             * new_child_options.
             */
            char *child_key_dot = g_strdup_printf("%s.", child->name);
            qdict_extract_subqdict(explicit_options, NULL, child_key_dot);
            qdict_extract_subqdict(options, &new_child_options, child_key_dot);
            g_free(child_key_dot);
        }

        /*
         * Queue the child with bs acting as its parent; the return value
         * is not needed because bs_queue is already non-NULL here.
         */
        bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options,
                                child->klass, child->role, bs->drv->is_format,
                                options, flags, child_keep_old);
    }

    return bs_queue;
}
4121 
4122 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
4123                                     BlockDriverState *bs,
4124                                     QDict *options, bool keep_old_opts)
4125 {
4126     return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false,
4127                                    NULL, 0, keep_old_opts);
4128 }
4129 
4130 void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
4131 {
4132     if (bs_queue) {
4133         BlockReopenQueueEntry *bs_entry, *next;
4134         QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
4135             qobject_unref(bs_entry->state.explicit_options);
4136             qobject_unref(bs_entry->state.options);
4137             g_free(bs_entry);
4138         }
4139         g_free(bs_queue);
4140     }
4141 }
4142 
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue(). It must not be NULL.
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 * All affected nodes must be drained between bdrv_reopen_queue() and
 * bdrv_reopen_multiple().
 *
 * To be called from the main thread, with all other AioContexts unlocked.
 *
 * Returns 0 on success. On failure, returns the negative value reported
 * by the step that failed (flush, prepare or permission refresh), with
 * the error passed on through @errp.
 *
 * bs_queue is freed with bdrv_reopen_queue_free() before returning, on
 * both the success and the failure path.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    AioContext *ctx;
    Transaction *tran = tran_new();
    g_autoptr(GHashTable) found = NULL;
    g_autoptr(GSList) refresh_list = NULL;

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bs_queue != NULL);

    /* Step 1: flush every queued node, holding its AioContext meanwhile */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        ret = bdrv_flush(bs_entry->state.bs);
        aio_context_release(ctx);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Error flushing drive");
            goto abort;
        }
    }

    /*
     * Step 2: prepare every queued node. Entries that were prepared
     * successfully are marked so that the abort path knows which ones
     * to roll back.
     */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        assert(bs_entry->state.bs->quiesce_counter > 0);
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
        aio_context_release(ctx);
        if (ret < 0) {
            goto abort;
        }
        bs_entry->prepared = true;
    }

    /*
     * Step 3: collect the nodes whose permissions need refreshing: each
     * queued node plus any old backing/file node recorded in its state.
     */
    found = g_hash_table_new(NULL, NULL);
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        BDRVReopenState *state = &bs_entry->state;

        refresh_list = bdrv_topological_dfs(refresh_list, found, state->bs);
        if (state->old_backing_bs) {
            refresh_list = bdrv_topological_dfs(refresh_list, found,
                                                state->old_backing_bs);
        }
        if (state->old_file_bs) {
            refresh_list = bdrv_topological_dfs(refresh_list, found,
                                                state->old_file_bs);
        }
    }

    /*
     * Note that the file-posix driver relies on the permission update done
     * during reopen (even if no permission changed), because it wants "new"
     * permissions for reconfiguring the fd and that's why it does it in
     * raw_check_perm(), not in raw_reopen_prepare() which is called with
     * "old" permissions.
     */
    ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
    if (ret < 0) {
        goto abort;
    }

    /*
     * If we reach this point, we have success and just need to apply the
     * changes.
     *
     * Reverse order is used to accommodate the qcow2 driver: on commit it
     * needs to write the IN_USE flag to the image, to mark bitmaps in the
     * image as invalid. But children usually come after parents in the
     * reopen queue, so go from the last to the first element.
     */
    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        bdrv_reopen_commit(&bs_entry->state);
        aio_context_release(ctx);
    }

    tran_commit(tran);

    /* Optional per-driver hook, run after the transaction was committed */
    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
        BlockDriverState *bs = bs_entry->state.bs;

        if (bs->drv->bdrv_reopen_commit_post) {
            ctx = bdrv_get_aio_context(bs);
            aio_context_acquire(ctx);
            bs->drv->bdrv_reopen_commit_post(&bs_entry->state);
            aio_context_release(ctx);
        }
    }

    ret = 0;
    goto cleanup;

abort:
    /* Roll back the transaction, then abort each entry that was prepared */
    tran_abort(tran);
    QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (bs_entry->prepared) {
            ctx = bdrv_get_aio_context(bs_entry->state.bs);
            aio_context_acquire(ctx);
            bdrv_reopen_abort(&bs_entry->state);
            aio_context_release(ctx);
        }
    }

cleanup:
    bdrv_reopen_queue_free(bs_queue);

    return ret;
}
4271 
4272 int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
4273                 Error **errp)
4274 {
4275     AioContext *ctx = bdrv_get_aio_context(bs);
4276     BlockReopenQueue *queue;
4277     int ret;
4278 
4279     bdrv_subtree_drained_begin(bs);
4280     if (ctx != qemu_get_aio_context()) {
4281         aio_context_release(ctx);
4282     }
4283 
4284     queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
4285     ret = bdrv_reopen_multiple(queue, errp);
4286 
4287     if (ctx != qemu_get_aio_context()) {
4288         aio_context_acquire(ctx);
4289     }
4290     bdrv_subtree_drained_end(bs);
4291 
4292     return ret;
4293 }
4294 
4295 int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
4296                               Error **errp)
4297 {
4298     QDict *opts = qdict_new();
4299 
4300     qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only);
4301 
4302     return bdrv_reopen(bs, opts, true, errp);
4303 }
4304 
4305 /*
4306  * Take a BDRVReopenState and check if the value of 'backing' in the
4307  * reopen_state->options QDict is valid or not.
4308  *
4309  * If 'backing' is missing from the QDict then return 0.
4310  *
4311  * If 'backing' contains the node name of the backing file of
4312  * reopen_state->bs then return 0.
4313  *
4314  * If 'backing' contains a different node name (or is null) then check
4315  * whether the current backing file can be replaced with the new one.
4316  * If that's the case then reopen_state->replace_backing_bs is set to
4317  * true and reopen_state->new_backing_bs contains a pointer to the new
4318  * backing BlockDriverState (or NULL).
4319  *
4320  * Return 0 on success, otherwise return < 0 and set @errp.
4321  */
4322 static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
4323                                              bool is_backing, Transaction *tran,
4324                                              Error **errp)
4325 {
4326     BlockDriverState *bs = reopen_state->bs;
4327     BlockDriverState *new_child_bs;
4328     BlockDriverState *old_child_bs = is_backing ? child_bs(bs->backing) :
4329                                                   child_bs(bs->file);
4330     const char *child_name = is_backing ? "backing" : "file";
4331     QObject *value;
4332     const char *str;
4333 
4334     value = qdict_get(reopen_state->options, child_name);
4335     if (value == NULL) {
4336         return 0;
4337     }
4338 
4339     switch (qobject_type(value)) {
4340     case QTYPE_QNULL:
4341         assert(is_backing); /* The 'file' option does not allow a null value */
4342         new_child_bs = NULL;
4343         break;
4344     case QTYPE_QSTRING:
4345         str = qstring_get_str(qobject_to(QString, value));
4346         new_child_bs = bdrv_lookup_bs(NULL, str, errp);
4347         if (new_child_bs == NULL) {
4348             return -EINVAL;
4349         } else if (bdrv_recurse_has_child(new_child_bs, bs)) {
4350             error_setg(errp, "Making '%s' a %s child of '%s' would create a "
4351                        "cycle", str, child_name, bs->node_name);
4352             return -EINVAL;
4353         }
4354         break;
4355     default:
4356         /*
4357          * The options QDict has been flattened, so 'backing' and 'file'
4358          * do not allow any other data type here.
4359          */
4360         g_assert_not_reached();
4361     }
4362 
4363     if (old_child_bs == new_child_bs) {
4364         return 0;
4365     }
4366 
4367     if (old_child_bs) {
4368         if (bdrv_skip_implicit_filters(old_child_bs) == new_child_bs) {
4369             return 0;
4370         }
4371 
4372         if (old_child_bs->implicit) {
4373             error_setg(errp, "Cannot replace implicit %s child of %s",
4374                        child_name, bs->node_name);
4375             return -EPERM;
4376         }
4377     }
4378 
4379     if (bs->drv->is_filter && !old_child_bs) {
4380         /*
4381          * Filters always have a file or a backing child, so we are trying to
4382          * change wrong child
4383          */
4384         error_setg(errp, "'%s' is a %s filter node that does not support a "
4385                    "%s child", bs->node_name, bs->drv->format_name, child_name);
4386         return -EINVAL;
4387     }
4388 
4389     if (is_backing) {
4390         reopen_state->old_backing_bs = old_child_bs;
4391     } else {
4392         reopen_state->old_file_bs = old_child_bs;
4393     }
4394 
4395     return bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
4396                                            tran, errp);
4397 }
4398 
4399 /*
4400  * Prepares a BlockDriverState for reopen. All changes are staged in the
4401  * 'opaque' field of the BDRVReopenState, which is used and allocated by
4402  * the block driver layer .bdrv_reopen_prepare()
4403  *
4404  * bs is the BlockDriverState to reopen
4405  * flags are the new open flags
4406  * queue is the reopen queue
4407  *
4408  * Returns 0 on success, non-zero on error.  On error errp will be set
4409  * as well.
4410  *
4411  * On failure, bdrv_reopen_abort() will be called to clean up any data.
4412  * It is the responsibility of the caller to then call the abort() or
4413  * commit() for any other BDS that have been left in a prepare() state
4414  *
4415  */
4416 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
4417                                BlockReopenQueue *queue,
4418                                Transaction *change_child_tran, Error **errp)
4419 {
4420     int ret = -1;
4421     int old_flags;
4422     Error *local_err = NULL;
4423     BlockDriver *drv;
4424     QemuOpts *opts;
4425     QDict *orig_reopen_opts;
4426     char *discard = NULL;
4427     bool read_only;
4428     bool drv_prepared = false;
4429 
4430     assert(reopen_state != NULL);
4431     assert(reopen_state->bs->drv != NULL);
4432     drv = reopen_state->bs->drv;
4433 
4434     /* This function and each driver's bdrv_reopen_prepare() remove
4435      * entries from reopen_state->options as they are processed, so
4436      * we need to make a copy of the original QDict. */
4437     orig_reopen_opts = qdict_clone_shallow(reopen_state->options);
4438 
4439     /* Process generic block layer options */
4440     opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
4441     if (!qemu_opts_absorb_qdict(opts, reopen_state->options, errp)) {
4442         ret = -EINVAL;
4443         goto error;
4444     }
4445 
4446     /* This was already called in bdrv_reopen_queue_child() so the flags
4447      * are up-to-date. This time we simply want to remove the options from
4448      * QemuOpts in order to indicate that they have been processed. */
4449     old_flags = reopen_state->flags;
4450     update_flags_from_options(&reopen_state->flags, opts);
4451     assert(old_flags == reopen_state->flags);
4452 
4453     discard = qemu_opt_get_del(opts, BDRV_OPT_DISCARD);
4454     if (discard != NULL) {
4455         if (bdrv_parse_discard_flags(discard, &reopen_state->flags) != 0) {
4456             error_setg(errp, "Invalid discard option");
4457             ret = -EINVAL;
4458             goto error;
4459         }
4460     }
4461 
4462     reopen_state->detect_zeroes =
4463         bdrv_parse_detect_zeroes(opts, reopen_state->flags, &local_err);
4464     if (local_err) {
4465         error_propagate(errp, local_err);
4466         ret = -EINVAL;
4467         goto error;
4468     }
4469 
4470     /* All other options (including node-name and driver) must be unchanged.
4471      * Put them back into the QDict, so that they are checked at the end
4472      * of this function. */
4473     qemu_opts_to_qdict(opts, reopen_state->options);
4474 
4475     /* If we are to stay read-only, do not allow permission change
4476      * to r/w. Attempting to set to r/w may fail if either BDRV_O_ALLOW_RDWR is
4477      * not set, or if the BDS still has copy_on_read enabled */
4478     read_only = !(reopen_state->flags & BDRV_O_RDWR);
4479     ret = bdrv_can_set_read_only(reopen_state->bs, read_only, true, &local_err);
4480     if (local_err) {
4481         error_propagate(errp, local_err);
4482         goto error;
4483     }
4484 
4485     if (drv->bdrv_reopen_prepare) {
4486         /*
4487          * If a driver-specific option is missing, it means that we
4488          * should reset it to its default value.
4489          * But not all options allow that, so we need to check it first.
4490          */
4491         ret = bdrv_reset_options_allowed(reopen_state->bs,
4492                                          reopen_state->options, errp);
4493         if (ret) {
4494             goto error;
4495         }
4496 
4497         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
4498         if (ret) {
4499             if (local_err != NULL) {
4500                 error_propagate(errp, local_err);
4501             } else {
4502                 bdrv_refresh_filename(reopen_state->bs);
4503                 error_setg(errp, "failed while preparing to reopen image '%s'",
4504                            reopen_state->bs->filename);
4505             }
4506             goto error;
4507         }
4508     } else {
4509         /* It is currently mandatory to have a bdrv_reopen_prepare()
4510          * handler for each supported drv. */
4511         error_setg(errp, "Block format '%s' used by node '%s' "
4512                    "does not support reopening files", drv->format_name,
4513                    bdrv_get_device_or_node_name(reopen_state->bs));
4514         ret = -1;
4515         goto error;
4516     }
4517 
4518     drv_prepared = true;
4519 
4520     /*
4521      * We must provide the 'backing' option if the BDS has a backing
4522      * file or if the image file has a backing file name as part of
4523      * its metadata. Otherwise the 'backing' option can be omitted.
4524      */
4525     if (drv->supports_backing && reopen_state->backing_missing &&
4526         (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
4527         error_setg(errp, "backing is missing for '%s'",
4528                    reopen_state->bs->node_name);
4529         ret = -EINVAL;
4530         goto error;
4531     }
4532 
4533     /*
4534      * Allow changing the 'backing' option. The new value can be
4535      * either a reference to an existing node (using its node name)
4536      * or NULL to simply detach the current backing file.
4537      */
4538     ret = bdrv_reopen_parse_file_or_backing(reopen_state, true,
4539                                             change_child_tran, errp);
4540     if (ret < 0) {
4541         goto error;
4542     }
4543     qdict_del(reopen_state->options, "backing");
4544 
4545     /* Allow changing the 'file' option. In this case NULL is not allowed */
4546     ret = bdrv_reopen_parse_file_or_backing(reopen_state, false,
4547                                             change_child_tran, errp);
4548     if (ret < 0) {
4549         goto error;
4550     }
4551     qdict_del(reopen_state->options, "file");
4552 
4553     /* Options that are not handled are only okay if they are unchanged
4554      * compared to the old state. It is expected that some options are only
4555      * used for the initial open, but not reopen (e.g. filename) */
4556     if (qdict_size(reopen_state->options)) {
4557         const QDictEntry *entry = qdict_first(reopen_state->options);
4558 
4559         do {
4560             QObject *new = entry->value;
4561             QObject *old = qdict_get(reopen_state->bs->options, entry->key);
4562 
4563             /* Allow child references (child_name=node_name) as long as they
4564              * point to the current child (i.e. everything stays the same). */
4565             if (qobject_type(new) == QTYPE_QSTRING) {
4566                 BdrvChild *child;
4567                 QLIST_FOREACH(child, &reopen_state->bs->children, next) {
4568                     if (!strcmp(child->name, entry->key)) {
4569                         break;
4570                     }
4571                 }
4572 
4573                 if (child) {
4574                     if (!strcmp(child->bs->node_name,
4575                                 qstring_get_str(qobject_to(QString, new)))) {
4576                         continue; /* Found child with this name, skip option */
4577                     }
4578                 }
4579             }
4580 
4581             /*
4582              * TODO: When using -drive to specify blockdev options, all values
4583              * will be strings; however, when using -blockdev, blockdev-add or
4584              * filenames using the json:{} pseudo-protocol, they will be
4585              * correctly typed.
4586              * In contrast, reopening options are (currently) always strings
4587              * (because you can only specify them through qemu-io; all other
4588              * callers do not specify any options).
4589              * Therefore, when using anything other than -drive to create a BDS,
4590              * this cannot detect non-string options as unchanged, because
4591              * qobject_is_equal() always returns false for objects of different
4592              * type.  In the future, this should be remedied by correctly typing
4593              * all options.  For now, this is not too big of an issue because
4594              * the user can simply omit options which cannot be changed anyway,
4595              * so they will stay unchanged.
4596              */
4597             if (!qobject_is_equal(new, old)) {
4598                 error_setg(errp, "Cannot change the option '%s'", entry->key);
4599                 ret = -EINVAL;
4600                 goto error;
4601             }
4602         } while ((entry = qdict_next(reopen_state->options, entry)));
4603     }
4604 
4605     ret = 0;
4606 
4607     /* Restore the original reopen_state->options QDict */
4608     qobject_unref(reopen_state->options);
4609     reopen_state->options = qobject_ref(orig_reopen_opts);
4610 
4611 error:
4612     if (ret < 0 && drv_prepared) {
4613         /* drv->bdrv_reopen_prepare() has succeeded, so we need to
4614          * call drv->bdrv_reopen_abort() before signaling an error
4615          * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
4616          * when the respective bdrv_reopen_prepare() has failed) */
4617         if (drv->bdrv_reopen_abort) {
4618             drv->bdrv_reopen_abort(reopen_state);
4619         }
4620     }
4621     qemu_opts_del(opts);
4622     qobject_unref(orig_reopen_opts);
4623     g_free(discard);
4624     return ret;
4625 }
4626 
4627 /*
4628  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
4629  * makes them final by swapping the staging BlockDriverState contents into
4630  * the active BlockDriverState contents.
4631  */
4632 static void bdrv_reopen_commit(BDRVReopenState *reopen_state)
4633 {
4634     BlockDriver *drv;
4635     BlockDriverState *bs;
4636     BdrvChild *child;
4637 
4638     assert(reopen_state != NULL);
4639     bs = reopen_state->bs;
4640     drv = bs->drv;
4641     assert(drv != NULL);
4642 
4643     /* If there are any driver level actions to take */
4644     if (drv->bdrv_reopen_commit) {
4645         drv->bdrv_reopen_commit(reopen_state);
4646     }
4647 
4648     /* set BDS specific flags now */
4649     qobject_unref(bs->explicit_options);
4650     qobject_unref(bs->options);
4651     qobject_ref(reopen_state->explicit_options);
4652     qobject_ref(reopen_state->options);
4653 
4654     bs->explicit_options   = reopen_state->explicit_options;
4655     bs->options            = reopen_state->options;
4656     bs->open_flags         = reopen_state->flags;
4657     bs->detect_zeroes      = reopen_state->detect_zeroes;
4658 
4659     /* Remove child references from bs->options and bs->explicit_options.
4660      * Child options were already removed in bdrv_reopen_queue_child() */
4661     QLIST_FOREACH(child, &bs->children, next) {
4662         qdict_del(bs->explicit_options, child->name);
4663         qdict_del(bs->options, child->name);
4664     }
4665     /* backing is probably removed, so it's not handled by previous loop */
4666     qdict_del(bs->explicit_options, "backing");
4667     qdict_del(bs->options, "backing");
4668 
4669     bdrv_refresh_limits(bs, NULL, NULL);
4670 }
4671 
4672 /*
4673  * Abort the reopen, and delete and free the staged changes in
4674  * reopen_state
4675  */
4676 static void bdrv_reopen_abort(BDRVReopenState *reopen_state)
4677 {
4678     BlockDriver *drv;
4679 
4680     assert(reopen_state != NULL);
4681     drv = reopen_state->bs->drv;
4682     assert(drv != NULL);
4683 
4684     if (drv->bdrv_reopen_abort) {
4685         drv->bdrv_reopen_abort(reopen_state);
4686     }
4687 }
4688 
4689 
4690 static void bdrv_close(BlockDriverState *bs)
4691 {
4692     BdrvAioNotifier *ban, *ban_next;
4693     BdrvChild *child, *next;
4694 
4695     assert(!bs->refcnt);
4696 
4697     bdrv_drained_begin(bs); /* complete I/O */
4698     bdrv_flush(bs);
4699     bdrv_drain(bs); /* in case flush left pending I/O */
4700 
4701     if (bs->drv) {
4702         if (bs->drv->bdrv_close) {
4703             /* Must unfreeze all children, so bdrv_unref_child() works */
4704             bs->drv->bdrv_close(bs);
4705         }
4706         bs->drv = NULL;
4707     }
4708 
4709     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
4710         bdrv_unref_child(bs, child);
4711     }
4712 
4713     bs->backing = NULL;
4714     bs->file = NULL;
4715     g_free(bs->opaque);
4716     bs->opaque = NULL;
4717     qatomic_set(&bs->copy_on_read, 0);
4718     bs->backing_file[0] = '\0';
4719     bs->backing_format[0] = '\0';
4720     bs->total_sectors = 0;
4721     bs->encrypted = false;
4722     bs->sg = false;
4723     qobject_unref(bs->options);
4724     qobject_unref(bs->explicit_options);
4725     bs->options = NULL;
4726     bs->explicit_options = NULL;
4727     qobject_unref(bs->full_open_options);
4728     bs->full_open_options = NULL;
4729     g_free(bs->block_status_cache);
4730     bs->block_status_cache = NULL;
4731 
4732     bdrv_release_named_dirty_bitmaps(bs);
4733     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4734 
4735     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
4736         g_free(ban);
4737     }
4738     QLIST_INIT(&bs->aio_notifiers);
4739     bdrv_drained_end(bs);
4740 
4741     /*
4742      * If we're still inside some bdrv_drain_all_begin()/end() sections, end
4743      * them now since this BDS won't exist anymore when bdrv_drain_all_end()
4744      * gets called.
4745      */
4746     if (bs->quiesce_counter) {
4747         bdrv_drain_all_end_quiesce(bs);
4748     }
4749 }
4750 
4751 void bdrv_close_all(void)
4752 {
4753     assert(job_next(NULL) == NULL);
4754 
4755     /* Drop references from requests still in flight, such as canceled block
4756      * jobs whose AIO context has not been polled yet */
4757     bdrv_drain_all();
4758 
4759     blk_remove_all_bs();
4760     blockdev_close_all_bdrv_states();
4761 
4762     assert(QTAILQ_EMPTY(&all_bdrv_states));
4763 }
4764 
4765 static bool should_update_child(BdrvChild *c, BlockDriverState *to)
4766 {
4767     GQueue *queue;
4768     GHashTable *found;
4769     bool ret;
4770 
4771     if (c->klass->stay_at_node) {
4772         return false;
4773     }
4774 
4775     /* If the child @c belongs to the BDS @to, replacing the current
4776      * c->bs by @to would mean to create a loop.
4777      *
4778      * Such a case occurs when appending a BDS to a backing chain.
4779      * For instance, imagine the following chain:
4780      *
4781      *   guest device -> node A -> further backing chain...
4782      *
4783      * Now we create a new BDS B which we want to put on top of this
4784      * chain, so we first attach A as its backing node:
4785      *
4786      *                   node B
4787      *                     |
4788      *                     v
4789      *   guest device -> node A -> further backing chain...
4790      *
4791      * Finally we want to replace A by B.  When doing that, we want to
4792      * replace all pointers to A by pointers to B -- except for the
4793      * pointer from B because (1) that would create a loop, and (2)
4794      * that pointer should simply stay intact:
4795      *
4796      *   guest device -> node B
4797      *                     |
4798      *                     v
4799      *                   node A -> further backing chain...
4800      *
4801      * In general, when replacing a node A (c->bs) by a node B (@to),
4802      * if A is a child of B, that means we cannot replace A by B there
4803      * because that would create a loop.  Silently detaching A from B
4804      * is also not really an option.  So overall just leaving A in
4805      * place there is the most sensible choice.
4806      *
4807      * We would also create a loop in any cases where @c is only
4808      * indirectly referenced by @to. Prevent this by returning false
4809      * if @c is found (by breadth-first search) anywhere in the whole
4810      * subtree of @to.
4811      */
4812 
4813     ret = true;
4814     found = g_hash_table_new(NULL, NULL);
4815     g_hash_table_add(found, to);
4816     queue = g_queue_new();
4817     g_queue_push_tail(queue, to);
4818 
4819     while (!g_queue_is_empty(queue)) {
4820         BlockDriverState *v = g_queue_pop_head(queue);
4821         BdrvChild *c2;
4822 
4823         QLIST_FOREACH(c2, &v->children, next) {
4824             if (c2 == c) {
4825                 ret = false;
4826                 break;
4827             }
4828 
4829             if (g_hash_table_contains(found, c2->bs)) {
4830                 continue;
4831             }
4832 
4833             g_queue_push_tail(queue, c2->bs);
4834             g_hash_table_add(found, c2->bs);
4835         }
4836     }
4837 
4838     g_queue_free(queue);
4839     g_hash_table_destroy(found);
4840 
4841     return ret;
4842 }
4843 
4844 typedef struct BdrvRemoveFilterOrCowChild {
4845     BdrvChild *child;
4846     bool is_backing;
4847 } BdrvRemoveFilterOrCowChild;
4848 
4849 static void bdrv_remove_filter_or_cow_child_abort(void *opaque)
4850 {
4851     BdrvRemoveFilterOrCowChild *s = opaque;
4852     BlockDriverState *parent_bs = s->child->opaque;
4853 
4854     QLIST_INSERT_HEAD(&parent_bs->children, s->child, next);
4855     if (s->is_backing) {
4856         parent_bs->backing = s->child;
4857     } else {
4858         parent_bs->file = s->child;
4859     }
4860 
4861     /*
4862      * We don't have to restore child->bs here to undo bdrv_replace_child_tran()
4863      * because that function is transactionable and it registered own completion
4864      * entries in @tran, so .abort() for bdrv_replace_child_safe() will be
4865      * called automatically.
4866      */
4867 }
4868 
4869 static void bdrv_remove_filter_or_cow_child_commit(void *opaque)
4870 {
4871     BdrvRemoveFilterOrCowChild *s = opaque;
4872 
4873     bdrv_child_free(s->child);
4874 }
4875 
4876 static TransactionActionDrv bdrv_remove_filter_or_cow_child_drv = {
4877     .abort = bdrv_remove_filter_or_cow_child_abort,
4878     .commit = bdrv_remove_filter_or_cow_child_commit,
4879     .clean = g_free,
4880 };
4881 
4882 /*
4883  * A function to remove backing or file child of @bs.
4884  * Function doesn't update permissions, caller is responsible for this.
4885  */
4886 static void bdrv_remove_file_or_backing_child(BlockDriverState *bs,
4887                                               BdrvChild *child,
4888                                               Transaction *tran)
4889 {
4890     BdrvRemoveFilterOrCowChild *s;
4891 
4892     assert(child == bs->backing || child == bs->file);
4893 
4894     if (!child) {
4895         return;
4896     }
4897 
4898     if (child->bs) {
4899         bdrv_replace_child_tran(child, NULL, tran);
4900     }
4901 
4902     s = g_new(BdrvRemoveFilterOrCowChild, 1);
4903     *s = (BdrvRemoveFilterOrCowChild) {
4904         .child = child,
4905         .is_backing = (child == bs->backing),
4906     };
4907     tran_add(tran, &bdrv_remove_filter_or_cow_child_drv, s);
4908 
4909     QLIST_SAFE_REMOVE(child, next);
4910     if (s->is_backing) {
4911         bs->backing = NULL;
4912     } else {
4913         bs->file = NULL;
4914     }
4915 }
4916 
4917 /*
4918  * A function to remove backing-chain child of @bs if exists: cow child for
4919  * format nodes (always .backing) and filter child for filters (may be .file or
4920  * .backing)
4921  */
4922 static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
4923                                             Transaction *tran)
4924 {
4925     bdrv_remove_file_or_backing_child(bs, bdrv_filter_or_cow_child(bs), tran);
4926 }
4927 
4928 static int bdrv_replace_node_noperm(BlockDriverState *from,
4929                                     BlockDriverState *to,
4930                                     bool auto_skip, Transaction *tran,
4931                                     Error **errp)
4932 {
4933     BdrvChild *c, *next;
4934 
4935     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
4936         assert(c->bs == from);
4937         if (!should_update_child(c, to)) {
4938             if (auto_skip) {
4939                 continue;
4940             }
4941             error_setg(errp, "Should not change '%s' link to '%s'",
4942                        c->name, from->node_name);
4943             return -EINVAL;
4944         }
4945         if (c->frozen) {
4946             error_setg(errp, "Cannot change '%s' link to '%s'",
4947                        c->name, from->node_name);
4948             return -EPERM;
4949         }
4950         bdrv_replace_child_tran(c, to, tran);
4951     }
4952 
4953     return 0;
4954 }
4955 
4956 /*
4957  * With auto_skip=true bdrv_replace_node_common skips updating from parents
4958  * if it creates a parent-child relation loop or if parent is block-job.
4959  *
4960  * With auto_skip=false the error is returned if from has a parent which should
4961  * not be updated.
4962  *
4963  * With @detach_subchain=true @to must be in a backing chain of @from. In this
4964  * case backing link of the cow-parent of @to is removed.
4965  */
4966 static int bdrv_replace_node_common(BlockDriverState *from,
4967                                     BlockDriverState *to,
4968                                     bool auto_skip, bool detach_subchain,
4969                                     Error **errp)
4970 {
4971     Transaction *tran = tran_new();
4972     g_autoptr(GHashTable) found = NULL;
4973     g_autoptr(GSList) refresh_list = NULL;
4974     BlockDriverState *to_cow_parent = NULL;
4975     int ret;
4976 
4977     if (detach_subchain) {
4978         assert(bdrv_chain_contains(from, to));
4979         assert(from != to);
4980         for (to_cow_parent = from;
4981              bdrv_filter_or_cow_bs(to_cow_parent) != to;
4982              to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
4983         {
4984             ;
4985         }
4986     }
4987 
4988     /* Make sure that @from doesn't go away until we have successfully attached
4989      * all of its parents to @to. */
4990     bdrv_ref(from);
4991 
4992     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
4993     assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
4994     bdrv_drained_begin(from);
4995 
4996     /*
4997      * Do the replacement without permission update.
4998      * Replacement may influence the permissions, we should calculate new
4999      * permissions based on new graph. If we fail, we'll roll-back the
5000      * replacement.
5001      */
5002     ret = bdrv_replace_node_noperm(from, to, auto_skip, tran, errp);
5003     if (ret < 0) {
5004         goto out;
5005     }
5006 
5007     if (detach_subchain) {
5008         bdrv_remove_filter_or_cow_child(to_cow_parent, tran);
5009     }
5010 
5011     found = g_hash_table_new(NULL, NULL);
5012 
5013     refresh_list = bdrv_topological_dfs(refresh_list, found, to);
5014     refresh_list = bdrv_topological_dfs(refresh_list, found, from);
5015 
5016     ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
5017     if (ret < 0) {
5018         goto out;
5019     }
5020 
5021     ret = 0;
5022 
5023 out:
5024     tran_finalize(tran, ret);
5025 
5026     bdrv_drained_end(from);
5027     bdrv_unref(from);
5028 
5029     return ret;
5030 }
5031 
5032 int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
5033                       Error **errp)
5034 {
5035     return bdrv_replace_node_common(from, to, true, false, errp);
5036 }
5037 
5038 int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
5039 {
5040     return bdrv_replace_node_common(bs, bdrv_filter_or_cow_bs(bs), true, true,
5041                                     errp);
5042 }
5043 
5044 /*
5045  * Add new bs contents at the top of an image chain while the chain is
5046  * live, while keeping required fields on the top layer.
5047  *
5048  * This will modify the BlockDriverState fields, and swap contents
5049  * between bs_new and bs_top. Both bs_new and bs_top are modified.
5050  *
5051  * bs_new must not be attached to a BlockBackend and must not have backing
5052  * child.
5053  *
5054  * This function does not create any image files.
5055  */
5056 int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
5057                 Error **errp)
5058 {
5059     int ret;
5060     Transaction *tran = tran_new();
5061 
5062     assert(!bs_new->backing);
5063 
5064     ret = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
5065                                    &child_of_bds, bdrv_backing_role(bs_new),
5066                                    &bs_new->backing, tran, errp);
5067     if (ret < 0) {
5068         goto out;
5069     }
5070 
5071     ret = bdrv_replace_node_noperm(bs_top, bs_new, true, tran, errp);
5072     if (ret < 0) {
5073         goto out;
5074     }
5075 
5076     ret = bdrv_refresh_perms(bs_new, errp);
5077 out:
5078     tran_finalize(tran, ret);
5079 
5080     bdrv_refresh_limits(bs_top, NULL, NULL);
5081 
5082     return ret;
5083 }
5084 
5085 /* Not for empty child */
5086 int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
5087                           Error **errp)
5088 {
5089     int ret;
5090     Transaction *tran = tran_new();
5091     g_autoptr(GHashTable) found = NULL;
5092     g_autoptr(GSList) refresh_list = NULL;
5093     BlockDriverState *old_bs = child->bs;
5094 
5095     bdrv_ref(old_bs);
5096     bdrv_drained_begin(old_bs);
5097     bdrv_drained_begin(new_bs);
5098 
5099     bdrv_replace_child_tran(child, new_bs, tran);
5100 
5101     found = g_hash_table_new(NULL, NULL);
5102     refresh_list = bdrv_topological_dfs(refresh_list, found, old_bs);
5103     refresh_list = bdrv_topological_dfs(refresh_list, found, new_bs);
5104 
5105     ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
5106 
5107     tran_finalize(tran, ret);
5108 
5109     bdrv_drained_end(old_bs);
5110     bdrv_drained_end(new_bs);
5111     bdrv_unref(old_bs);
5112 
5113     return ret;
5114 }
5115 
5116 static void bdrv_delete(BlockDriverState *bs)
5117 {
5118     assert(bdrv_op_blocker_is_empty(bs));
5119     assert(!bs->refcnt);
5120 
5121     /* remove from list, if necessary */
5122     if (bs->node_name[0] != '\0') {
5123         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
5124     }
5125     QTAILQ_REMOVE(&all_bdrv_states, bs, bs_list);
5126 
5127     bdrv_close(bs);
5128 
5129     g_free(bs);
5130 }
5131 
5132 
5133 /*
5134  * Replace @bs by newly created block node.
5135  *
5136  * @options is a QDict of options to pass to the block drivers, or NULL for an
5137  * empty set of options. The reference to the QDict belongs to the block layer
5138  * after the call (even on failure), so if the caller intends to reuse the
5139  * dictionary, it needs to use qobject_ref() before calling bdrv_open.
5140  */
5141 BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
5142                                    int flags, Error **errp)
5143 {
5144     ERRP_GUARD();
5145     int ret;
5146     BlockDriverState *new_node_bs = NULL;
5147     const char *drvname, *node_name;
5148     BlockDriver *drv;
5149 
5150     drvname = qdict_get_try_str(options, "driver");
5151     if (!drvname) {
5152         error_setg(errp, "driver is not specified");
5153         goto fail;
5154     }
5155 
5156     drv = bdrv_find_format(drvname);
5157     if (!drv) {
5158         error_setg(errp, "Unknown driver: '%s'", drvname);
5159         goto fail;
5160     }
5161 
5162     node_name = qdict_get_try_str(options, "node-name");
5163 
5164     new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
5165                                             errp);
5166     options = NULL; /* bdrv_new_open_driver() eats options */
5167     if (!new_node_bs) {
5168         error_prepend(errp, "Could not create node: ");
5169         goto fail;
5170     }
5171 
5172     bdrv_drained_begin(bs);
5173     ret = bdrv_replace_node(bs, new_node_bs, errp);
5174     bdrv_drained_end(bs);
5175 
5176     if (ret < 0) {
5177         error_prepend(errp, "Could not replace node: ");
5178         goto fail;
5179     }
5180 
5181     return new_node_bs;
5182 
5183 fail:
5184     qobject_unref(options);
5185     bdrv_unref(new_node_bs);
5186     return NULL;
5187 }
5188 
5189 /*
5190  * Run consistency checks on an image
5191  *
5192  * Returns 0 if the check could be completed (it doesn't mean that the image is
5193  * free of errors) or -errno when an internal error occurred. The results of the
5194  * check are stored in res.
5195  */
5196 int coroutine_fn bdrv_co_check(BlockDriverState *bs,
5197                                BdrvCheckResult *res, BdrvCheckMode fix)
5198 {
5199     if (bs->drv == NULL) {
5200         return -ENOMEDIUM;
5201     }
5202     if (bs->drv->bdrv_co_check == NULL) {
5203         return -ENOTSUP;
5204     }
5205 
5206     memset(res, 0, sizeof(*res));
5207     return bs->drv->bdrv_co_check(bs, res, fix);
5208 }
5209 
5210 /*
5211  * Return values:
5212  * 0        - success
5213  * -EINVAL  - backing format specified, but no file
5214  * -ENOSPC  - can't update the backing file because no space is left in the
5215  *            image file header
5216  * -ENOTSUP - format driver doesn't support changing the backing file
5217  */
5218 int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
5219                              const char *backing_fmt, bool require)
5220 {
5221     BlockDriver *drv = bs->drv;
5222     int ret;
5223 
5224     if (!drv) {
5225         return -ENOMEDIUM;
5226     }
5227 
5228     /* Backing file format doesn't make sense without a backing file */
5229     if (backing_fmt && !backing_file) {
5230         return -EINVAL;
5231     }
5232 
5233     if (require && backing_file && !backing_fmt) {
5234         return -EINVAL;
5235     }
5236 
5237     if (drv->bdrv_change_backing_file != NULL) {
5238         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
5239     } else {
5240         ret = -ENOTSUP;
5241     }
5242 
5243     if (ret == 0) {
5244         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
5245         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
5246         pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
5247                 backing_file ?: "");
5248     }
5249     return ret;
5250 }
5251 
5252 /*
5253  * Finds the first non-filter node above bs in the chain between
5254  * active and bs.  The returned node is either an immediate parent of
5255  * bs, or there are only filter nodes between the two.
5256  *
5257  * Returns NULL if bs is not found in active's image chain,
5258  * or if active == bs.
5259  *
5260  * Returns the bottommost base image if bs == NULL.
5261  */
5262 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
5263                                     BlockDriverState *bs)
5264 {
5265     bs = bdrv_skip_filters(bs);
5266     active = bdrv_skip_filters(active);
5267 
5268     while (active) {
5269         BlockDriverState *next = bdrv_backing_chain_next(active);
5270         if (bs == next) {
5271             return active;
5272         }
5273         active = next;
5274     }
5275 
5276     return NULL;
5277 }
5278 
5279 /* Given a BDS, searches for the base layer. */
5280 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
5281 {
5282     return bdrv_find_overlay(bs, NULL);
5283 }
5284 
5285 /*
5286  * Return true if at least one of the COW (backing) and filter links
5287  * between @bs and @base is frozen. @errp is set if that's the case.
5288  * @base must be reachable from @bs, or NULL.
5289  */
5290 bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
5291                                   Error **errp)
5292 {
5293     BlockDriverState *i;
5294     BdrvChild *child;
5295 
5296     for (i = bs; i != base; i = child_bs(child)) {
5297         child = bdrv_filter_or_cow_child(i);
5298 
5299         if (child && child->frozen) {
5300             error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
5301                        child->name, i->node_name, child->bs->node_name);
5302             return true;
5303         }
5304     }
5305 
5306     return false;
5307 }
5308 
5309 /*
5310  * Freeze all COW (backing) and filter links between @bs and @base.
5311  * If any of the links is already frozen the operation is aborted and
5312  * none of the links are modified.
5313  * @base must be reachable from @bs, or NULL.
5314  * Returns 0 on success. On failure returns < 0 and sets @errp.
5315  */
5316 int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
5317                               Error **errp)
5318 {
5319     BlockDriverState *i;
5320     BdrvChild *child;
5321 
5322     if (bdrv_is_backing_chain_frozen(bs, base, errp)) {
5323         return -EPERM;
5324     }
5325 
5326     for (i = bs; i != base; i = child_bs(child)) {
5327         child = bdrv_filter_or_cow_child(i);
5328         if (child && child->bs->never_freeze) {
5329             error_setg(errp, "Cannot freeze '%s' link to '%s'",
5330                        child->name, child->bs->node_name);
5331             return -EPERM;
5332         }
5333     }
5334 
5335     for (i = bs; i != base; i = child_bs(child)) {
5336         child = bdrv_filter_or_cow_child(i);
5337         if (child) {
5338             child->frozen = true;
5339         }
5340     }
5341 
5342     return 0;
5343 }
5344 
5345 /*
5346  * Unfreeze all COW (backing) and filter links between @bs and @base.
5347  * The caller must ensure that all links are frozen before using this
5348  * function.
5349  * @base must be reachable from @bs, or NULL.
5350  */
5351 void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base)
5352 {
5353     BlockDriverState *i;
5354     BdrvChild *child;
5355 
5356     for (i = bs; i != base; i = child_bs(child)) {
5357         child = bdrv_filter_or_cow_child(i);
5358         if (child) {
5359             assert(child->frozen);
5360             child->frozen = false;
5361         }
5362     }
5363 }
5364 
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata. If it is NULL, base->filename is used instead
 * (after refreshing it with bdrv_refresh_filename()).
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 * Returns 0 on success. Returns -EIO if 'top' or 'base' has no driver, if
 * 'base' is not in the chain below 'top', or if replacing 'top' by 'base'
 * fails. If a parent's update_filename() callback fails, its negative
 * return value is passed through. Errors from the replacement and from
 * update_filename() are reported with error_report_err(), not returned
 * through an Error object.
 */
int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
                           const char *backing_file_str)
{
    BlockDriverState *explicit_top = top;
    bool update_inherits_from;
    BdrvChild *c;
    Error *local_err = NULL;
    int ret = -EIO;
    g_autoptr(GSList) updated_children = NULL;
    GSList *p;

    /*
     * Keep 'top' referenced and its subtree drained until the exit label,
     * where both are undone.
     */
    bdrv_ref(top);
    bdrv_subtree_drained_begin(top);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    /* Make sure that base is in the backing chain of top */
    if (!bdrv_chain_contains(top, base)) {
        goto exit;
    }

    /* If 'base' recursively inherits from 'top' then we should set
     * base->inherits_from to top->inherits_from after 'top' and all
     * other intermediate nodes have been dropped.
     * If 'top' is an implicit node (e.g. "commit_top") we should skip
     * it because no one inherits from it. We use explicit_top for that. */
    explicit_top = bdrv_skip_implicit_filters(explicit_top);
    update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);

    /* success - we can delete the intermediate states, and link top->base */
    /* TODO Check graph modification op blockers (BLK_PERM_GRAPH_MOD) once
     * we've figured out how they should work. */
    if (!backing_file_str) {
        bdrv_refresh_filename(base);
        backing_file_str = base->filename;
    }

    /*
     * Snapshot the current parents of 'top' before the replacement below;
     * these are the children whose update_filename() callback is invoked
     * afterwards.
     */
    QLIST_FOREACH(c, &top->parents, next_parent) {
        updated_children = g_slist_prepend(updated_children, c);
    }

    /*
     * It seems correct to pass detach_subchain=true here, but it triggers
     * one more yet not fixed bug, when due to nested aio_poll loop we switch to
     * another drained section, which modify the graph (for example, removing
     * the child, which we keep in updated_children list). So, it's a TODO.
     *
     * Note, bug triggered if pass detach_subchain=true here and run
     * test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
     * That's a FIXME.
     */
    bdrv_replace_node_common(top, base, false, false, &local_err);
    if (local_err) {
        /* ret still holds its initial -EIO here */
        error_report_err(local_err);
        goto exit;
    }

    for (p = updated_children; p; p = p->next) {
        c = p->data;

        if (c->klass->update_filename) {
            ret = c->klass->update_filename(c, base, backing_file_str,
                                            &local_err);
            if (ret < 0) {
                /*
                 * TODO: Actually, we want to rollback all previous iterations
                 * of this loop, and (which is almost impossible) previous
                 * bdrv_replace_node()...
                 *
                 * Note, that c->klass->update_filename may lead to permission
                 * update, so it's a bad idea to call it inside permission
                 * update transaction of bdrv_replace_node.
                 */
                error_report_err(local_err);
                goto exit;
            }
        }
    }

    if (update_inherits_from) {
        base->inherits_from = explicit_top->inherits_from;
    }

    ret = 0;
exit:
    bdrv_subtree_drained_end(top);
    bdrv_unref(top);
    return ret;
}
5485 
5486 /**
5487  * Implementation of BlockDriver.bdrv_get_allocated_file_size() that
5488  * sums the size of all data-bearing children.  (This excludes backing
5489  * children.)
5490  */
5491 static int64_t bdrv_sum_allocated_file_size(BlockDriverState *bs)
5492 {
5493     BdrvChild *child;
5494     int64_t child_size, sum = 0;
5495 
5496     QLIST_FOREACH(child, &bs->children, next) {
5497         if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
5498                            BDRV_CHILD_FILTERED))
5499         {
5500             child_size = bdrv_get_allocated_file_size(child->bs);
5501             if (child_size < 0) {
5502                 return child_size;
5503             }
5504             sum += child_size;
5505         }
5506     }
5507 
5508     return sum;
5509 }
5510 
5511 /**
5512  * Length of a allocated file in bytes. Sparse files are counted by actual
5513  * allocated space. Return < 0 if error or unknown.
5514  */
5515 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
5516 {
5517     BlockDriver *drv = bs->drv;
5518     if (!drv) {
5519         return -ENOMEDIUM;
5520     }
5521     if (drv->bdrv_get_allocated_file_size) {
5522         return drv->bdrv_get_allocated_file_size(bs);
5523     }
5524 
5525     if (drv->bdrv_file_open) {
5526         /*
5527          * Protocol drivers default to -ENOTSUP (most of their data is
5528          * not stored in any of their children (if they even have any),
5529          * so there is no generic way to figure it out).
5530          */
5531         return -ENOTSUP;
5532     } else if (drv->is_filter) {
5533         /* Filter drivers default to the size of their filtered child */
5534         return bdrv_get_allocated_file_size(bdrv_filter_bs(bs));
5535     } else {
5536         /* Other drivers default to summing their children's sizes */
5537         return bdrv_sum_allocated_file_size(bs);
5538     }
5539 }
5540 
5541 /*
5542  * bdrv_measure:
5543  * @drv: Format driver
5544  * @opts: Creation options for new image
5545  * @in_bs: Existing image containing data for new image (may be NULL)
5546  * @errp: Error object
5547  * Returns: A #BlockMeasureInfo (free using qapi_free_BlockMeasureInfo())
5548  *          or NULL on error
5549  *
5550  * Calculate file size required to create a new image.
5551  *
5552  * If @in_bs is given then space for allocated clusters and zero clusters
5553  * from that image are included in the calculation.  If @opts contains a
5554  * backing file that is shared by @in_bs then backing clusters may be omitted
5555  * from the calculation.
5556  *
5557  * If @in_bs is NULL then the calculation includes no allocated clusters
5558  * unless a preallocation option is given in @opts.
5559  *
5560  * Note that @in_bs may use a different BlockDriver from @drv.
5561  *
5562  * If an error occurs the @errp pointer is set.
5563  */
5564 BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
5565                                BlockDriverState *in_bs, Error **errp)
5566 {
5567     if (!drv->bdrv_measure) {
5568         error_setg(errp, "Block driver '%s' does not support size measurement",
5569                    drv->format_name);
5570         return NULL;
5571     }
5572 
5573     return drv->bdrv_measure(opts, in_bs, errp);
5574 }
5575 
5576 /**
5577  * Return number of sectors on success, -errno on error.
5578  */
5579 int64_t bdrv_nb_sectors(BlockDriverState *bs)
5580 {
5581     BlockDriver *drv = bs->drv;
5582 
5583     if (!drv)
5584         return -ENOMEDIUM;
5585 
5586     if (drv->has_variable_length) {
5587         int ret = refresh_total_sectors(bs, bs->total_sectors);
5588         if (ret < 0) {
5589             return ret;
5590         }
5591     }
5592     return bs->total_sectors;
5593 }
5594 
5595 /**
5596  * Return length in bytes on success, -errno on error.
5597  * The length is always a multiple of BDRV_SECTOR_SIZE.
5598  */
5599 int64_t bdrv_getlength(BlockDriverState *bs)
5600 {
5601     int64_t ret = bdrv_nb_sectors(bs);
5602 
5603     if (ret < 0) {
5604         return ret;
5605     }
5606     if (ret > INT64_MAX / BDRV_SECTOR_SIZE) {
5607         return -EFBIG;
5608     }
5609     return ret * BDRV_SECTOR_SIZE;
5610 }
5611 
5612 /* return 0 as number of sectors if no device present or error */
5613 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
5614 {
5615     int64_t nb_sectors = bdrv_nb_sectors(bs);
5616 
5617     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
5618 }
5619 
5620 bool bdrv_is_sg(BlockDriverState *bs)
5621 {
5622     return bs->sg;
5623 }
5624 
5625 /**
5626  * Return whether the given node supports compressed writes.
5627  */
5628 bool bdrv_supports_compressed_writes(BlockDriverState *bs)
5629 {
5630     BlockDriverState *filtered;
5631 
5632     if (!bs->drv || !block_driver_can_compress(bs->drv)) {
5633         return false;
5634     }
5635 
5636     filtered = bdrv_filter_bs(bs);
5637     if (filtered) {
5638         /*
5639          * Filters can only forward compressed writes, so we have to
5640          * check the child.
5641          */
5642         return bdrv_supports_compressed_writes(filtered);
5643     }
5644 
5645     return true;
5646 }
5647 
5648 const char *bdrv_get_format_name(BlockDriverState *bs)
5649 {
5650     return bs->drv ? bs->drv->format_name : NULL;
5651 }
5652 
/*
 * qsort() comparator for arrays of C strings: @a and @b each point to an
 * array element (a 'char *'), and the pointed-to strings are compared
 * with strcmp().
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *lhs = *(char *const *)a;
    const char *rhs = *(char *const *)b;

    return strcmp(lhs, rhs);
}
5657 
5658 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
5659                          void *opaque, bool read_only)
5660 {
5661     BlockDriver *drv;
5662     int count = 0;
5663     int i;
5664     const char **formats = NULL;
5665 
5666     QLIST_FOREACH(drv, &bdrv_drivers, list) {
5667         if (drv->format_name) {
5668             bool found = false;
5669             int i = count;
5670 
5671             if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, read_only)) {
5672                 continue;
5673             }
5674 
5675             while (formats && i && !found) {
5676                 found = !strcmp(formats[--i], drv->format_name);
5677             }
5678 
5679             if (!found) {
5680                 formats = g_renew(const char *, formats, count + 1);
5681                 formats[count++] = drv->format_name;
5682             }
5683         }
5684     }
5685 
5686     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
5687         const char *format_name = block_driver_modules[i].format_name;
5688 
5689         if (format_name) {
5690             bool found = false;
5691             int j = count;
5692 
5693             if (use_bdrv_whitelist &&
5694                 !bdrv_format_is_whitelisted(format_name, read_only)) {
5695                 continue;
5696             }
5697 
5698             while (formats && j && !found) {
5699                 found = !strcmp(formats[--j], format_name);
5700             }
5701 
5702             if (!found) {
5703                 formats = g_renew(const char *, formats, count + 1);
5704                 formats[count++] = format_name;
5705             }
5706         }
5707     }
5708 
5709     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
5710 
5711     for (i = 0; i < count; i++) {
5712         it(opaque, formats[i]);
5713     }
5714 
5715     g_free(formats);
5716 }
5717 
5718 /* This function is to find a node in the bs graph */
5719 BlockDriverState *bdrv_find_node(const char *node_name)
5720 {
5721     BlockDriverState *bs;
5722 
5723     assert(node_name);
5724 
5725     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
5726         if (!strcmp(node_name, bs->node_name)) {
5727             return bs;
5728         }
5729     }
5730     return NULL;
5731 }
5732 
5733 /* Put this QMP function here so it can access the static graph_bdrv_states. */
5734 BlockDeviceInfoList *bdrv_named_nodes_list(bool flat,
5735                                            Error **errp)
5736 {
5737     BlockDeviceInfoList *list;
5738     BlockDriverState *bs;
5739 
5740     list = NULL;
5741     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
5742         BlockDeviceInfo *info = bdrv_block_device_info(NULL, bs, flat, errp);
5743         if (!info) {
5744             qapi_free_BlockDeviceInfoList(list);
5745             return NULL;
5746         }
5747         QAPI_LIST_PREPEND(list, info);
5748     }
5749 
5750     return list;
5751 }
5752 
/*
 * Scratch state used while assembling an XDbgBlockGraph.
 */
typedef struct XDbgBlockGraphConstructor {
    /* Graph under construction; handed out by xdbg_graph_finalize() */
    XDbgBlockGraph *graph;
    /* Maps an object pointer to the numeric id assigned to it */
    GHashTable *graph_nodes;
} XDbgBlockGraphConstructor;
5757 
5758 static XDbgBlockGraphConstructor *xdbg_graph_new(void)
5759 {
5760     XDbgBlockGraphConstructor *gr = g_new(XDbgBlockGraphConstructor, 1);
5761 
5762     gr->graph = g_new0(XDbgBlockGraph, 1);
5763     gr->graph_nodes = g_hash_table_new(NULL, NULL);
5764 
5765     return gr;
5766 }
5767 
5768 static XDbgBlockGraph *xdbg_graph_finalize(XDbgBlockGraphConstructor *gr)
5769 {
5770     XDbgBlockGraph *graph = gr->graph;
5771 
5772     g_hash_table_destroy(gr->graph_nodes);
5773     g_free(gr);
5774 
5775     return graph;
5776 }
5777 
5778 static uintptr_t xdbg_graph_node_num(XDbgBlockGraphConstructor *gr, void *node)
5779 {
5780     uintptr_t ret = (uintptr_t)g_hash_table_lookup(gr->graph_nodes, node);
5781 
5782     if (ret != 0) {
5783         return ret;
5784     }
5785 
5786     /*
5787      * Start counting from 1, not 0, because 0 interferes with not-found (NULL)
5788      * answer of g_hash_table_lookup.
5789      */
5790     ret = g_hash_table_size(gr->graph_nodes) + 1;
5791     g_hash_table_insert(gr->graph_nodes, node, (void *)ret);
5792 
5793     return ret;
5794 }
5795 
5796 static void xdbg_graph_add_node(XDbgBlockGraphConstructor *gr, void *node,
5797                                 XDbgBlockGraphNodeType type, const char *name)
5798 {
5799     XDbgBlockGraphNode *n;
5800 
5801     n = g_new0(XDbgBlockGraphNode, 1);
5802 
5803     n->id = xdbg_graph_node_num(gr, node);
5804     n->type = type;
5805     n->name = g_strdup(name);
5806 
5807     QAPI_LIST_PREPEND(gr->graph->nodes, n);
5808 }
5809 
5810 static void xdbg_graph_add_edge(XDbgBlockGraphConstructor *gr, void *parent,
5811                                 const BdrvChild *child)
5812 {
5813     BlockPermission qapi_perm;
5814     XDbgBlockGraphEdge *edge;
5815 
5816     edge = g_new0(XDbgBlockGraphEdge, 1);
5817 
5818     edge->parent = xdbg_graph_node_num(gr, parent);
5819     edge->child = xdbg_graph_node_num(gr, child->bs);
5820     edge->name = g_strdup(child->name);
5821 
5822     for (qapi_perm = 0; qapi_perm < BLOCK_PERMISSION__MAX; qapi_perm++) {
5823         uint64_t flag = bdrv_qapi_perm_to_blk_perm(qapi_perm);
5824 
5825         if (flag & child->perm) {
5826             QAPI_LIST_PREPEND(edge->perm, qapi_perm);
5827         }
5828         if (flag & child->shared_perm) {
5829             QAPI_LIST_PREPEND(edge->shared_perm, qapi_perm);
5830         }
5831     }
5832 
5833     QAPI_LIST_PREPEND(gr->graph->edges, edge);
5834 }
5835 
5836 
5837 XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
5838 {
5839     BlockBackend *blk;
5840     BlockJob *job;
5841     BlockDriverState *bs;
5842     BdrvChild *child;
5843     XDbgBlockGraphConstructor *gr = xdbg_graph_new();
5844 
5845     for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
5846         char *allocated_name = NULL;
5847         const char *name = blk_name(blk);
5848 
5849         if (!*name) {
5850             name = allocated_name = blk_get_attached_dev_id(blk);
5851         }
5852         xdbg_graph_add_node(gr, blk, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_BACKEND,
5853                            name);
5854         g_free(allocated_name);
5855         if (blk_root(blk)) {
5856             xdbg_graph_add_edge(gr, blk, blk_root(blk));
5857         }
5858     }
5859 
5860     for (job = block_job_next(NULL); job; job = block_job_next(job)) {
5861         GSList *el;
5862 
5863         xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
5864                            job->job.id);
5865         for (el = job->nodes; el; el = el->next) {
5866             xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
5867         }
5868     }
5869 
5870     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
5871         xdbg_graph_add_node(gr, bs, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_DRIVER,
5872                            bs->node_name);
5873         QLIST_FOREACH(child, &bs->children, next) {
5874             xdbg_graph_add_edge(gr, bs, child);
5875         }
5876     }
5877 
5878     return xdbg_graph_finalize(gr);
5879 }
5880 
5881 BlockDriverState *bdrv_lookup_bs(const char *device,
5882                                  const char *node_name,
5883                                  Error **errp)
5884 {
5885     BlockBackend *blk;
5886     BlockDriverState *bs;
5887 
5888     if (device) {
5889         blk = blk_by_name(device);
5890 
5891         if (blk) {
5892             bs = blk_bs(blk);
5893             if (!bs) {
5894                 error_setg(errp, "Device '%s' has no medium", device);
5895             }
5896 
5897             return bs;
5898         }
5899     }
5900 
5901     if (node_name) {
5902         bs = bdrv_find_node(node_name);
5903 
5904         if (bs) {
5905             return bs;
5906         }
5907     }
5908 
5909     error_setg(errp, "Cannot find device=\'%s\' nor node-name=\'%s\'",
5910                      device ? device : "",
5911                      node_name ? node_name : "");
5912     return NULL;
5913 }
5914 
5915 /* If 'base' is in the same chain as 'top', return true. Otherwise,
5916  * return false.  If either argument is NULL, return false. */
5917 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
5918 {
5919     while (top && top != base) {
5920         top = bdrv_filter_or_cow_bs(top);
5921     }
5922 
5923     return top != NULL;
5924 }
5925 
5926 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
5927 {
5928     if (!bs) {
5929         return QTAILQ_FIRST(&graph_bdrv_states);
5930     }
5931     return QTAILQ_NEXT(bs, node_list);
5932 }
5933 
5934 BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
5935 {
5936     if (!bs) {
5937         return QTAILQ_FIRST(&all_bdrv_states);
5938     }
5939     return QTAILQ_NEXT(bs, bs_list);
5940 }
5941 
5942 const char *bdrv_get_node_name(const BlockDriverState *bs)
5943 {
5944     return bs->node_name;
5945 }
5946 
5947 const char *bdrv_get_parent_name(const BlockDriverState *bs)
5948 {
5949     BdrvChild *c;
5950     const char *name;
5951 
5952     /* If multiple parents have a name, just pick the first one. */
5953     QLIST_FOREACH(c, &bs->parents, next_parent) {
5954         if (c->klass->get_name) {
5955             name = c->klass->get_name(c);
5956             if (name && *name) {
5957                 return name;
5958             }
5959         }
5960     }
5961 
5962     return NULL;
5963 }
5964 
5965 /* TODO check what callers really want: bs->node_name or blk_name() */
5966 const char *bdrv_get_device_name(const BlockDriverState *bs)
5967 {
5968     return bdrv_get_parent_name(bs) ?: "";
5969 }
5970 
5971 /* This can be used to identify nodes that might not have a device
5972  * name associated. Since node and device names live in the same
5973  * namespace, the result is unambiguous. The exception is if both are
5974  * absent, then this returns an empty (non-null) string. */
5975 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
5976 {
5977     return bdrv_get_parent_name(bs) ?: bs->node_name;
5978 }
5979 
5980 int bdrv_get_flags(BlockDriverState *bs)
5981 {
5982     return bs->open_flags;
5983 }
5984 
5985 int bdrv_has_zero_init_1(BlockDriverState *bs)
5986 {
5987     return 1;
5988 }
5989 
5990 int bdrv_has_zero_init(BlockDriverState *bs)
5991 {
5992     BlockDriverState *filtered;
5993 
5994     if (!bs->drv) {
5995         return 0;
5996     }
5997 
5998     /* If BS is a copy on write image, it is initialized to
5999        the contents of the base image, which may not be zeroes.  */
6000     if (bdrv_cow_child(bs)) {
6001         return 0;
6002     }
6003     if (bs->drv->bdrv_has_zero_init) {
6004         return bs->drv->bdrv_has_zero_init(bs);
6005     }
6006 
6007     filtered = bdrv_filter_bs(bs);
6008     if (filtered) {
6009         return bdrv_has_zero_init(filtered);
6010     }
6011 
6012     /* safe default */
6013     return 0;
6014 }
6015 
6016 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
6017 {
6018     if (!(bs->open_flags & BDRV_O_UNMAP)) {
6019         return false;
6020     }
6021 
6022     return bs->supported_zero_flags & BDRV_REQ_MAY_UNMAP;
6023 }
6024 
6025 void bdrv_get_backing_filename(BlockDriverState *bs,
6026                                char *filename, int filename_size)
6027 {
6028     pstrcpy(filename, filename_size, bs->backing_file);
6029 }
6030 
6031 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
6032 {
6033     int ret;
6034     BlockDriver *drv = bs->drv;
6035     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
6036     if (!drv) {
6037         return -ENOMEDIUM;
6038     }
6039     if (!drv->bdrv_get_info) {
6040         BlockDriverState *filtered = bdrv_filter_bs(bs);
6041         if (filtered) {
6042             return bdrv_get_info(filtered, bdi);
6043         }
6044         return -ENOTSUP;
6045     }
6046     memset(bdi, 0, sizeof(*bdi));
6047     ret = drv->bdrv_get_info(bs, bdi);
6048     if (ret < 0) {
6049         return ret;
6050     }
6051 
6052     if (bdi->cluster_size > BDRV_MAX_ALIGNMENT) {
6053         return -EINVAL;
6054     }
6055 
6056     return 0;
6057 }
6058 
6059 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
6060                                           Error **errp)
6061 {
6062     BlockDriver *drv = bs->drv;
6063     if (drv && drv->bdrv_get_specific_info) {
6064         return drv->bdrv_get_specific_info(bs, errp);
6065     }
6066     return NULL;
6067 }
6068 
6069 BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
6070 {
6071     BlockDriver *drv = bs->drv;
6072     if (!drv || !drv->bdrv_get_specific_stats) {
6073         return NULL;
6074     }
6075     return drv->bdrv_get_specific_stats(bs);
6076 }
6077 
6078 void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)
6079 {
6080     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
6081         return;
6082     }
6083 
6084     bs->drv->bdrv_debug_event(bs, event);
6085 }
6086 
6087 static BlockDriverState *bdrv_find_debug_node(BlockDriverState *bs)
6088 {
6089     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
6090         bs = bdrv_primary_bs(bs);
6091     }
6092 
6093     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
6094         assert(bs->drv->bdrv_debug_remove_breakpoint);
6095         return bs;
6096     }
6097 
6098     return NULL;
6099 }
6100 
6101 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
6102                           const char *tag)
6103 {
6104     bs = bdrv_find_debug_node(bs);
6105     if (bs) {
6106         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
6107     }
6108 
6109     return -ENOTSUP;
6110 }
6111 
6112 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
6113 {
6114     bs = bdrv_find_debug_node(bs);
6115     if (bs) {
6116         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
6117     }
6118 
6119     return -ENOTSUP;
6120 }
6121 
6122 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
6123 {
6124     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
6125         bs = bdrv_primary_bs(bs);
6126     }
6127 
6128     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
6129         return bs->drv->bdrv_debug_resume(bs, tag);
6130     }
6131 
6132     return -ENOTSUP;
6133 }
6134 
6135 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
6136 {
6137     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
6138         bs = bdrv_primary_bs(bs);
6139     }
6140 
6141     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
6142         return bs->drv->bdrv_debug_is_suspended(bs, tag);
6143     }
6144 
6145     return false;
6146 }
6147 
/*
 * Search the backing chain of @bs for the node that corresponds to
 * @backing_file and return it, or NULL if no node matches (or if @bs is
 * NULL, has no driver, or @backing_file is NULL).
 *
 * backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain.
 */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    bool filenames_refreshed = false;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;
    BlockDriverState *bs_below;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    /* Scratch buffers that receive the realpath() results below */
    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    /*
     * Being largely a legacy function, skip any filters here
     * (because filters do not have normal filenames, so they cannot
     * match anyway; and allowing json:{} filenames is a bit out of
     * scope).
     */
    for (curr_bs = bdrv_skip_filters(bs);
         bdrv_cow_child(curr_bs) != NULL;
         curr_bs = bs_below)
    {
        /*
         * Candidate node for this iteration.  It is also the next value of
         * curr_bs, so the 'continue' statements below still advance the
         * loop.
         */
        bs_below = bdrv_backing_chain_next(curr_bs);

        if (bdrv_backing_overridden(curr_bs)) {
            /*
             * If the backing file was overridden, we can only compare
             * directly against the backing node's filename.
             */

            if (!filenames_refreshed) {
                /*
                 * This will automatically refresh all of the
                 * filenames in the rest of the backing chain, so we
                 * only need to do this once.
                 */
                bdrv_refresh_filename(bs_below);
                filenames_refreshed = true;
            }

            if (strcmp(backing_file, bs_below->filename) == 0) {
                retval = bs_below;
                break;
            }
        } else if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            /*
             * If either of the filename paths is actually a protocol, then
             * compare unmodified paths; otherwise make paths relative.
             */
            char *backing_file_full_ret;

            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = bs_below;
                break;
            }
            /* Also check against the full backing filename for the image */
            backing_file_full_ret = bdrv_get_full_backing_filename(curr_bs,
                                                                   NULL);
            if (backing_file_full_ret) {
                bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
                g_free(backing_file_full_ret);
                if (equal) {
                    retval = bs_below;
                    break;
                }
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            filename_tmp = bdrv_make_absolute_filename(curr_bs, backing_file,
                                                       NULL);
            /* We are going to compare canonicalized absolute pathnames */
            if (!filename_tmp || !realpath(filename_tmp, filename_full)) {
                /* Cannot canonicalize at this level; try the next one */
                g_free(filename_tmp);
                continue;
            }
            g_free(filename_tmp);

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            filename_tmp = bdrv_get_full_backing_filename(curr_bs, NULL);
            if (!filename_tmp || !realpath(filename_tmp, backing_file_full)) {
                /* Cannot canonicalize at this level; try the next one */
                g_free(filename_tmp);
                continue;
            }
            g_free(filename_tmp);

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = bs_below;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    return retval;
}
6259 
6260 void bdrv_init(void)
6261 {
6262 #ifdef CONFIG_BDRV_WHITELIST_TOOLS
6263     use_bdrv_whitelist = 1;
6264 #endif
6265     module_call_init(MODULE_INIT_BLOCK);
6266 }
6267 
6268 void bdrv_init_with_whitelist(void)
6269 {
6270     use_bdrv_whitelist = 1;
6271     bdrv_init();
6272 }
6273 
/*
 * Activate @bs: invalidate the caches of its children first, then clear
 * BDRV_O_INACTIVE on @bs itself, and finally run the activate() callback
 * of each parent.
 *
 * Returns 0 on success, -ENOMEDIUM if @bs has no driver, -EINVAL if a
 * child, the driver callback or a parent's activate() callback sets an
 * error, or the negative value returned by bdrv_refresh_perms() or
 * refresh_total_sectors().  Except for the child and no-driver cases,
 * BDRV_O_INACTIVE is set on @bs before an error is returned.
 */
int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    BdrvChild *child, *parent;
    Error *local_err = NULL;
    int ret;
    BdrvDirtyBitmap *bm;

    if (!bs->drv)  {
        return -ENOMEDIUM;
    }

    /*
     * Children first.  The return value of the recursive call is not used;
     * failure is detected through local_err.
     */
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_co_invalidate_cache(child->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }
    }

    /*
     * Update permissions, they may differ for inactive nodes.
     *
     * Note that the required permissions of inactive images are always a
     * subset of the permissions required after activating the image. This
     * allows us to just get the permissions upfront without restricting
     * drv->bdrv_invalidate_cache().
     *
     * It also means that in error cases, we don't have to try and revert to
     * the old permissions (which is an operation that could fail, too). We can
     * just keep the extended permissions for the next time that an activation
     * of the image is tried.
     */
    if (bs->open_flags & BDRV_O_INACTIVE) {
        bs->open_flags &= ~BDRV_O_INACTIVE;
        ret = bdrv_refresh_perms(bs, errp);
        if (ret < 0) {
            bs->open_flags |= BDRV_O_INACTIVE;
            return ret;
        }

        if (bs->drv->bdrv_co_invalidate_cache) {
            bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
            if (local_err) {
                bs->open_flags |= BDRV_O_INACTIVE;
                error_propagate(errp, local_err);
                return -EINVAL;
            }
        }

        FOR_EACH_DIRTY_BITMAP(bs, bm) {
            bdrv_dirty_bitmap_skip_store(bm, false);
        }

        ret = refresh_total_sectors(bs, bs->total_sectors);
        if (ret < 0) {
            bs->open_flags |= BDRV_O_INACTIVE;
            error_setg_errno(errp, -ret, "Could not refresh total sector count");
            return ret;
        }
    }

    /*
     * Parents are notified even if @bs was already active on entry.  A
     * failing parent marks @bs inactive again.
     */
    QLIST_FOREACH(parent, &bs->parents, next_parent) {
        if (parent->klass->activate) {
            parent->klass->activate(parent, &local_err);
            if (local_err) {
                bs->open_flags |= BDRV_O_INACTIVE;
                error_propagate(errp, local_err);
                return -EINVAL;
            }
        }
    }

    return 0;
}
6348 
6349 void bdrv_invalidate_cache_all(Error **errp)
6350 {
6351     BlockDriverState *bs;
6352     BdrvNextIterator it;
6353 
6354     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6355         AioContext *aio_context = bdrv_get_aio_context(bs);
6356         int ret;
6357 
6358         aio_context_acquire(aio_context);
6359         ret = bdrv_invalidate_cache(bs, errp);
6360         aio_context_release(aio_context);
6361         if (ret < 0) {
6362             bdrv_next_cleanup(&it);
6363             return;
6364         }
6365     }
6366 }
6367 
6368 static bool bdrv_has_bds_parent(BlockDriverState *bs, bool only_active)
6369 {
6370     BdrvChild *parent;
6371 
6372     QLIST_FOREACH(parent, &bs->parents, next_parent) {
6373         if (parent->klass->parent_is_bds) {
6374             BlockDriverState *parent_bs = parent->opaque;
6375             if (!only_active || !(parent_bs->open_flags & BDRV_O_INACTIVE)) {
6376                 return true;
6377             }
6378         }
6379     }
6380 
6381     return false;
6382 }
6383 
6384 static int bdrv_inactivate_recurse(BlockDriverState *bs)
6385 {
6386     BdrvChild *child, *parent;
6387     int ret;
6388     uint64_t cumulative_perms, cumulative_shared_perms;
6389 
6390     if (!bs->drv) {
6391         return -ENOMEDIUM;
6392     }
6393 
6394     /* Make sure that we don't inactivate a child before its parent.
6395      * It will be covered by recursion from the yet active parent. */
6396     if (bdrv_has_bds_parent(bs, true)) {
6397         return 0;
6398     }
6399 
6400     assert(!(bs->open_flags & BDRV_O_INACTIVE));
6401 
6402     /* Inactivate this node */
6403     if (bs->drv->bdrv_inactivate) {
6404         ret = bs->drv->bdrv_inactivate(bs);
6405         if (ret < 0) {
6406             return ret;
6407         }
6408     }
6409 
6410     QLIST_FOREACH(parent, &bs->parents, next_parent) {
6411         if (parent->klass->inactivate) {
6412             ret = parent->klass->inactivate(parent);
6413             if (ret < 0) {
6414                 return ret;
6415             }
6416         }
6417     }
6418 
6419     bdrv_get_cumulative_perm(bs, &cumulative_perms,
6420                              &cumulative_shared_perms);
6421     if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
6422         /* Our inactive parents still need write access. Inactivation failed. */
6423         return -EPERM;
6424     }
6425 
6426     bs->open_flags |= BDRV_O_INACTIVE;
6427 
6428     /*
6429      * Update permissions, they may differ for inactive nodes.
6430      * We only tried to loosen restrictions, so errors are not fatal, ignore
6431      * them.
6432      */
6433     bdrv_refresh_perms(bs, NULL);
6434 
6435     /* Recursively inactivate children */
6436     QLIST_FOREACH(child, &bs->children, next) {
6437         ret = bdrv_inactivate_recurse(child->bs);
6438         if (ret < 0) {
6439             return ret;
6440         }
6441     }
6442 
6443     return 0;
6444 }
6445 
6446 int bdrv_inactivate_all(void)
6447 {
6448     BlockDriverState *bs = NULL;
6449     BdrvNextIterator it;
6450     int ret = 0;
6451     GSList *aio_ctxs = NULL, *ctx;
6452 
6453     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6454         AioContext *aio_context = bdrv_get_aio_context(bs);
6455 
6456         if (!g_slist_find(aio_ctxs, aio_context)) {
6457             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
6458             aio_context_acquire(aio_context);
6459         }
6460     }
6461 
6462     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6463         /* Nodes with BDS parents are covered by recursion from the last
6464          * parent that gets inactivated. Don't inactivate them a second
6465          * time if that has already happened. */
6466         if (bdrv_has_bds_parent(bs, false)) {
6467             continue;
6468         }
6469         ret = bdrv_inactivate_recurse(bs);
6470         if (ret < 0) {
6471             bdrv_next_cleanup(&it);
6472             goto out;
6473         }
6474     }
6475 
6476 out:
6477     for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
6478         AioContext *aio_context = ctx->data;
6479         aio_context_release(aio_context);
6480     }
6481     g_slist_free(aio_ctxs);
6482 
6483     return ret;
6484 }
6485 
6486 /**************************************************************/
6487 /* removable device support */
6488 
6489 /**
6490  * Return TRUE if the media is present
6491  */
6492 bool bdrv_is_inserted(BlockDriverState *bs)
6493 {
6494     BlockDriver *drv = bs->drv;
6495     BdrvChild *child;
6496 
6497     if (!drv) {
6498         return false;
6499     }
6500     if (drv->bdrv_is_inserted) {
6501         return drv->bdrv_is_inserted(bs);
6502     }
6503     QLIST_FOREACH(child, &bs->children, next) {
6504         if (!bdrv_is_inserted(child->bs)) {
6505             return false;
6506         }
6507     }
6508     return true;
6509 }
6510 
6511 /**
6512  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
6513  */
6514 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
6515 {
6516     BlockDriver *drv = bs->drv;
6517 
6518     if (drv && drv->bdrv_eject) {
6519         drv->bdrv_eject(bs, eject_flag);
6520     }
6521 }
6522 
6523 /**
6524  * Lock or unlock the media (if it is locked, the user won't be able
6525  * to eject it manually).
6526  */
6527 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
6528 {
6529     BlockDriver *drv = bs->drv;
6530 
6531     trace_bdrv_lock_medium(bs, locked);
6532 
6533     if (drv && drv->bdrv_lock_medium) {
6534         drv->bdrv_lock_medium(bs, locked);
6535     }
6536 }
6537 
/* Get a reference to bs by incrementing its reference count (bs->refcnt).
 * bdrv_unref() is the function that decrements it again. */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}
6543 
6544 /* Release a previously grabbed reference to bs.
6545  * If after releasing, reference count is zero, the BlockDriverState is
6546  * deleted. */
6547 void bdrv_unref(BlockDriverState *bs)
6548 {
6549     if (!bs) {
6550         return;
6551     }
6552     assert(bs->refcnt > 0);
6553     if (--bs->refcnt == 0) {
6554         bdrv_delete(bs);
6555     }
6556 }
6557 
/* One entry of a node's per-operation blocker list (bs->op_blockers[op]). */
struct BdrvOpBlocker {
    /* Error describing why the operation is blocked.  bdrv_op_unblock()
     * compares this pointer to find the entries to remove. */
    Error *reason;
    /* Linkage within bs->op_blockers[op] */
    QLIST_ENTRY(BdrvOpBlocker) list;
};
6562 
6563 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
6564 {
6565     BdrvOpBlocker *blocker;
6566     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
6567     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
6568         blocker = QLIST_FIRST(&bs->op_blockers[op]);
6569         error_propagate_prepend(errp, error_copy(blocker->reason),
6570                                 "Node '%s' is busy: ",
6571                                 bdrv_get_device_or_node_name(bs));
6572         return true;
6573     }
6574     return false;
6575 }
6576 
6577 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
6578 {
6579     BdrvOpBlocker *blocker;
6580     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
6581 
6582     blocker = g_new0(BdrvOpBlocker, 1);
6583     blocker->reason = reason;
6584     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
6585 }
6586 
6587 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
6588 {
6589     BdrvOpBlocker *blocker, *next;
6590     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
6591     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
6592         if (blocker->reason == reason) {
6593             QLIST_REMOVE(blocker, list);
6594             g_free(blocker);
6595         }
6596     }
6597 }
6598 
6599 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
6600 {
6601     int i;
6602     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
6603         bdrv_op_block(bs, i, reason);
6604     }
6605 }
6606 
6607 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
6608 {
6609     int i;
6610     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
6611         bdrv_op_unblock(bs, i, reason);
6612     }
6613 }
6614 
6615 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
6616 {
6617     int i;
6618 
6619     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
6620         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
6621             return false;
6622         }
6623     }
6624     return true;
6625 }
6626 
6627 void bdrv_img_create(const char *filename, const char *fmt,
6628                      const char *base_filename, const char *base_fmt,
6629                      char *options, uint64_t img_size, int flags, bool quiet,
6630                      Error **errp)
6631 {
6632     QemuOptsList *create_opts = NULL;
6633     QemuOpts *opts = NULL;
6634     const char *backing_fmt, *backing_file;
6635     int64_t size;
6636     BlockDriver *drv, *proto_drv;
6637     Error *local_err = NULL;
6638     int ret = 0;
6639 
6640     /* Find driver and parse its options */
6641     drv = bdrv_find_format(fmt);
6642     if (!drv) {
6643         error_setg(errp, "Unknown file format '%s'", fmt);
6644         return;
6645     }
6646 
6647     proto_drv = bdrv_find_protocol(filename, true, errp);
6648     if (!proto_drv) {
6649         return;
6650     }
6651 
6652     if (!drv->create_opts) {
6653         error_setg(errp, "Format driver '%s' does not support image creation",
6654                    drv->format_name);
6655         return;
6656     }
6657 
6658     if (!proto_drv->create_opts) {
6659         error_setg(errp, "Protocol driver '%s' does not support image creation",
6660                    proto_drv->format_name);
6661         return;
6662     }
6663 
6664     /* Create parameter list */
6665     create_opts = qemu_opts_append(create_opts, drv->create_opts);
6666     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
6667 
6668     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
6669 
6670     /* Parse -o options */
6671     if (options) {
6672         if (!qemu_opts_do_parse(opts, options, NULL, errp)) {
6673             goto out;
6674         }
6675     }
6676 
6677     if (!qemu_opt_get(opts, BLOCK_OPT_SIZE)) {
6678         qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
6679     } else if (img_size != UINT64_C(-1)) {
6680         error_setg(errp, "The image size must be specified only once");
6681         goto out;
6682     }
6683 
6684     if (base_filename) {
6685         if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename,
6686                           NULL)) {
6687             error_setg(errp, "Backing file not supported for file format '%s'",
6688                        fmt);
6689             goto out;
6690         }
6691     }
6692 
6693     if (base_fmt) {
6694         if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, NULL)) {
6695             error_setg(errp, "Backing file format not supported for file "
6696                              "format '%s'", fmt);
6697             goto out;
6698         }
6699     }
6700 
6701     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
6702     if (backing_file) {
6703         if (!strcmp(filename, backing_file)) {
6704             error_setg(errp, "Error: Trying to create an image with the "
6705                              "same filename as the backing file");
6706             goto out;
6707         }
6708         if (backing_file[0] == '\0') {
6709             error_setg(errp, "Expected backing file name, got empty string");
6710             goto out;
6711         }
6712     }
6713 
6714     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
6715 
6716     /* The size for the image must always be specified, unless we have a backing
6717      * file and we have not been forbidden from opening it. */
6718     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, img_size);
6719     if (backing_file && !(flags & BDRV_O_NO_BACKING)) {
6720         BlockDriverState *bs;
6721         char *full_backing;
6722         int back_flags;
6723         QDict *backing_options = NULL;
6724 
6725         full_backing =
6726             bdrv_get_full_backing_filename_from_filename(filename, backing_file,
6727                                                          &local_err);
6728         if (local_err) {
6729             goto out;
6730         }
6731         assert(full_backing);
6732 
6733         /*
6734          * No need to do I/O here, which allows us to open encrypted
6735          * backing images without needing the secret
6736          */
6737         back_flags = flags;
6738         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
6739         back_flags |= BDRV_O_NO_IO;
6740 
6741         backing_options = qdict_new();
6742         if (backing_fmt) {
6743             qdict_put_str(backing_options, "driver", backing_fmt);
6744         }
6745         qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
6746 
6747         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
6748                        &local_err);
6749         g_free(full_backing);
6750         if (!bs) {
6751             error_append_hint(&local_err, "Could not open backing image.\n");
6752             goto out;
6753         } else {
6754             if (!backing_fmt) {
6755                 error_setg(&local_err,
6756                            "Backing file specified without backing format");
6757                 error_append_hint(&local_err, "Detected format of %s.",
6758                                   bs->drv->format_name);
6759                 goto out;
6760             }
6761             if (size == -1) {
6762                 /* Opened BS, have no size */
6763                 size = bdrv_getlength(bs);
6764                 if (size < 0) {
6765                     error_setg_errno(errp, -size, "Could not get size of '%s'",
6766                                      backing_file);
6767                     bdrv_unref(bs);
6768                     goto out;
6769                 }
6770                 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
6771             }
6772             bdrv_unref(bs);
6773         }
6774         /* (backing_file && !(flags & BDRV_O_NO_BACKING)) */
6775     } else if (backing_file && !backing_fmt) {
6776         error_setg(&local_err,
6777                    "Backing file specified without backing format");
6778         goto out;
6779     }
6780 
6781     if (size == -1) {
6782         error_setg(errp, "Image creation needs a size parameter");
6783         goto out;
6784     }
6785 
6786     if (!quiet) {
6787         printf("Formatting '%s', fmt=%s ", filename, fmt);
6788         qemu_opts_print(opts, " ");
6789         puts("");
6790         fflush(stdout);
6791     }
6792 
6793     ret = bdrv_create(drv, filename, opts, &local_err);
6794 
6795     if (ret == -EFBIG) {
6796         /* This is generally a better message than whatever the driver would
6797          * deliver (especially because of the cluster_size_hint), since that
6798          * is most probably not much different from "image too large". */
6799         const char *cluster_size_hint = "";
6800         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
6801             cluster_size_hint = " (try using a larger cluster size)";
6802         }
6803         error_setg(errp, "The image size is too large for file format '%s'"
6804                    "%s", fmt, cluster_size_hint);
6805         error_free(local_err);
6806         local_err = NULL;
6807     }
6808 
6809 out:
6810     qemu_opts_del(opts);
6811     qemu_opts_free(create_opts);
6812     error_propagate(errp, local_err);
6813 }
6814 
6815 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
6816 {
6817     return bs ? bs->aio_context : qemu_get_aio_context();
6818 }
6819 
6820 AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs)
6821 {
6822     Coroutine *self = qemu_coroutine_self();
6823     AioContext *old_ctx = qemu_coroutine_get_aio_context(self);
6824     AioContext *new_ctx;
6825 
6826     /*
6827      * Increase bs->in_flight to ensure that this operation is completed before
6828      * moving the node to a different AioContext. Read new_ctx only afterwards.
6829      */
6830     bdrv_inc_in_flight(bs);
6831 
6832     new_ctx = bdrv_get_aio_context(bs);
6833     aio_co_reschedule_self(new_ctx);
6834     return old_ctx;
6835 }
6836 
/*
 * Reschedule the running coroutine into @old_ctx and then decrement the
 * in-flight counter of @bs.  This mirrors bdrv_co_enter(), which
 * increments the counter and returns the context to pass as @old_ctx.
 */
void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
{
    aio_co_reschedule_self(old_ctx);
    bdrv_dec_in_flight(bs);
}
6842 
6843 void coroutine_fn bdrv_co_lock(BlockDriverState *bs)
6844 {
6845     AioContext *ctx = bdrv_get_aio_context(bs);
6846 
6847     /* In the main thread, bs->aio_context won't change concurrently */
6848     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
6849 
6850     /*
6851      * We're in coroutine context, so we already hold the lock of the main
6852      * loop AioContext. Don't lock it twice to avoid deadlocks.
6853      */
6854     assert(qemu_in_coroutine());
6855     if (ctx != qemu_get_aio_context()) {
6856         aio_context_acquire(ctx);
6857     }
6858 }
6859 
6860 void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
6861 {
6862     AioContext *ctx = bdrv_get_aio_context(bs);
6863 
6864     assert(qemu_in_coroutine());
6865     if (ctx != qemu_get_aio_context()) {
6866         aio_context_release(ctx);
6867     }
6868 }
6869 
6870 void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co)
6871 {
6872     aio_co_enter(bdrv_get_aio_context(bs), co);
6873 }
6874 
/*
 * Unlink the notifier @ban from the list it is on and free it.
 * @ban must not be used after this call.
 */
static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
{
    QLIST_REMOVE(ban, list);
    g_free(ban);
}
6880 
6881 static void bdrv_detach_aio_context(BlockDriverState *bs)
6882 {
6883     BdrvAioNotifier *baf, *baf_tmp;
6884 
6885     assert(!bs->walking_aio_notifiers);
6886     bs->walking_aio_notifiers = true;
6887     QLIST_FOREACH_SAFE(baf, &bs->aio_notifiers, list, baf_tmp) {
6888         if (baf->deleted) {
6889             bdrv_do_remove_aio_context_notifier(baf);
6890         } else {
6891             baf->detach_aio_context(baf->opaque);
6892         }
6893     }
6894     /* Never mind iterating again to check for ->deleted.  bdrv_close() will
6895      * remove remaining aio notifiers if we aren't called again.
6896      */
6897     bs->walking_aio_notifiers = false;
6898 
6899     if (bs->drv && bs->drv->bdrv_detach_aio_context) {
6900         bs->drv->bdrv_detach_aio_context(bs);
6901     }
6902 
6903     if (bs->quiesce_counter) {
6904         aio_enable_external(bs->aio_context);
6905     }
6906     bs->aio_context = NULL;
6907 }
6908 
6909 static void bdrv_attach_aio_context(BlockDriverState *bs,
6910                                     AioContext *new_context)
6911 {
6912     BdrvAioNotifier *ban, *ban_tmp;
6913 
6914     if (bs->quiesce_counter) {
6915         aio_disable_external(new_context);
6916     }
6917 
6918     bs->aio_context = new_context;
6919 
6920     if (bs->drv && bs->drv->bdrv_attach_aio_context) {
6921         bs->drv->bdrv_attach_aio_context(bs, new_context);
6922     }
6923 
6924     assert(!bs->walking_aio_notifiers);
6925     bs->walking_aio_notifiers = true;
6926     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_tmp) {
6927         if (ban->deleted) {
6928             bdrv_do_remove_aio_context_notifier(ban);
6929         } else {
6930             ban->attached_aio_context(new_context, ban->opaque);
6931         }
6932     }
6933     bs->walking_aio_notifiers = false;
6934 }
6935 
/*
 * Changes the AioContext used for fd handlers, timers, and BHs by this
 * BlockDriverState and all its children and parents.
 *
 * Must be called from the main AioContext.
 *
 * The caller must own the AioContext lock for the old AioContext of bs, but it
 * must not own the AioContext lock for new_context (unless new_context is the
 * same as the current context of bs).
 *
 * @ignore will accumulate all visited BdrvChild object. The caller is
 * responsible for freeing the list afterwards.
 *
 * Nothing is done if @bs already uses @new_context.  Otherwise @bs is kept
 * drained (bdrv_drained_begin()/bdrv_drained_end()) while it and its
 * neighbours are switched.
 */
void bdrv_set_aio_context_ignore(BlockDriverState *bs,
                                 AioContext *new_context, GSList **ignore)
{
    AioContext *old_context = bdrv_get_aio_context(bs);
    GSList *children_to_process = NULL;
    GSList *parents_to_process = NULL;
    GSList *entry;
    BdrvChild *child, *parent;

    g_assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    if (old_context == new_context) {
        return;
    }

    bdrv_drained_begin(bs);

    /*
     * Collect the child edges that are not in @ignore yet.  They are added
     * to @ignore right away and queued for processing below.
     */
    QLIST_FOREACH(child, &bs->children, next) {
        if (g_slist_find(*ignore, child)) {
            continue;
        }
        *ignore = g_slist_prepend(*ignore, child);
        children_to_process = g_slist_prepend(children_to_process, child);
    }

    /* Same for the parent edges */
    QLIST_FOREACH(parent, &bs->parents, next_parent) {
        if (g_slist_find(*ignore, parent)) {
            continue;
        }
        *ignore = g_slist_prepend(*ignore, parent);
        parents_to_process = g_slist_prepend(parents_to_process, parent);
    }

    /* Recurse into the queued children */
    for (entry = children_to_process;
         entry != NULL;
         entry = g_slist_next(entry)) {
        child = entry->data;
        bdrv_set_aio_context_ignore(child->bs, new_context, ignore);
    }
    g_slist_free(children_to_process);

    /* Let each queued parent switch through its class's set_aio_ctx hook */
    for (entry = parents_to_process;
         entry != NULL;
         entry = g_slist_next(entry)) {
        parent = entry->data;
        assert(parent->klass->set_aio_ctx);
        parent->klass->set_aio_ctx(parent, new_context, ignore);
    }
    g_slist_free(parents_to_process);

    /* Children and parents are handled; now switch this node itself */
    bdrv_detach_aio_context(bs);

    /* Acquire the new context, if necessary */
    if (qemu_get_aio_context() != new_context) {
        aio_context_acquire(new_context);
    }

    bdrv_attach_aio_context(bs, new_context);

    /*
     * If this function was recursively called from
     * bdrv_set_aio_context_ignore(), there may be nodes in the
     * subtree that have not yet been moved to the new AioContext.
     * Release the old one so bdrv_drained_end() can poll them.
     */
    if (qemu_get_aio_context() != old_context) {
        aio_context_release(old_context);
    }

    bdrv_drained_end(bs);

    /*
     * Restore the lock state from before the switch: re-take old_context
     * and drop new_context (each only if it is not the main AioContext).
     */
    if (qemu_get_aio_context() != old_context) {
        aio_context_acquire(old_context);
    }
    if (qemu_get_aio_context() != new_context) {
        aio_context_release(new_context);
    }
}
7027 
7028 static bool bdrv_parent_can_set_aio_context(BdrvChild *c, AioContext *ctx,
7029                                             GSList **ignore, Error **errp)
7030 {
7031     if (g_slist_find(*ignore, c)) {
7032         return true;
7033     }
7034     *ignore = g_slist_prepend(*ignore, c);
7035 
7036     /*
7037      * A BdrvChildClass that doesn't handle AioContext changes cannot
7038      * tolerate any AioContext changes
7039      */
7040     if (!c->klass->can_set_aio_ctx) {
7041         char *user = bdrv_child_user_desc(c);
7042         error_setg(errp, "Changing iothreads is not supported by %s", user);
7043         g_free(user);
7044         return false;
7045     }
7046     if (!c->klass->can_set_aio_ctx(c, ctx, ignore, errp)) {
7047         assert(!errp || *errp);
7048         return false;
7049     }
7050     return true;
7051 }
7052 
7053 bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx,
7054                                     GSList **ignore, Error **errp)
7055 {
7056     if (g_slist_find(*ignore, c)) {
7057         return true;
7058     }
7059     *ignore = g_slist_prepend(*ignore, c);
7060     return bdrv_can_set_aio_context(c->bs, ctx, ignore, errp);
7061 }
7062 
7063 /* @ignore will accumulate all visited BdrvChild object. The caller is
7064  * responsible for freeing the list afterwards. */
7065 bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx,
7066                               GSList **ignore, Error **errp)
7067 {
7068     BdrvChild *c;
7069 
7070     if (bdrv_get_aio_context(bs) == ctx) {
7071         return true;
7072     }
7073 
7074     QLIST_FOREACH(c, &bs->parents, next_parent) {
7075         if (!bdrv_parent_can_set_aio_context(c, ctx, ignore, errp)) {
7076             return false;
7077         }
7078     }
7079     QLIST_FOREACH(c, &bs->children, next) {
7080         if (!bdrv_child_can_set_aio_context(c, ctx, ignore, errp)) {
7081             return false;
7082         }
7083     }
7084 
7085     return true;
7086 }
7087 
7088 int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
7089                                    BdrvChild *ignore_child, Error **errp)
7090 {
7091     GSList *ignore;
7092     bool ret;
7093 
7094     ignore = ignore_child ? g_slist_prepend(NULL, ignore_child) : NULL;
7095     ret = bdrv_can_set_aio_context(bs, ctx, &ignore, errp);
7096     g_slist_free(ignore);
7097 
7098     if (!ret) {
7099         return -EPERM;
7100     }
7101 
7102     ignore = ignore_child ? g_slist_prepend(NULL, ignore_child) : NULL;
7103     bdrv_set_aio_context_ignore(bs, ctx, &ignore);
7104     g_slist_free(ignore);
7105 
7106     return 0;
7107 }
7108 
/*
 * Try to switch @bs to @ctx.  This is bdrv_child_try_set_aio_context()
 * called with no ignored child; its return value is passed through.
 */
int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
                             Error **errp)
{
    return bdrv_child_try_set_aio_context(bs, ctx, NULL, errp);
}
7114 
7115 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
7116         void (*attached_aio_context)(AioContext *new_context, void *opaque),
7117         void (*detach_aio_context)(void *opaque), void *opaque)
7118 {
7119     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
7120     *ban = (BdrvAioNotifier){
7121         .attached_aio_context = attached_aio_context,
7122         .detach_aio_context   = detach_aio_context,
7123         .opaque               = opaque
7124     };
7125 
7126     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
7127 }
7128 
7129 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
7130                                       void (*attached_aio_context)(AioContext *,
7131                                                                    void *),
7132                                       void (*detach_aio_context)(void *),
7133                                       void *opaque)
7134 {
7135     BdrvAioNotifier *ban, *ban_next;
7136 
7137     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
7138         if (ban->attached_aio_context == attached_aio_context &&
7139             ban->detach_aio_context   == detach_aio_context   &&
7140             ban->opaque               == opaque               &&
7141             ban->deleted              == false)
7142         {
7143             if (bs->walking_aio_notifiers) {
7144                 ban->deleted = true;
7145             } else {
7146                 bdrv_do_remove_aio_context_notifier(ban);
7147             }
7148             return;
7149         }
7150     }
7151 
7152     abort();
7153 }
7154 
7155 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
7156                        BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
7157                        bool force,
7158                        Error **errp)
7159 {
7160     if (!bs->drv) {
7161         error_setg(errp, "Node is ejected");
7162         return -ENOMEDIUM;
7163     }
7164     if (!bs->drv->bdrv_amend_options) {
7165         error_setg(errp, "Block driver '%s' does not support option amendment",
7166                    bs->drv->format_name);
7167         return -ENOTSUP;
7168     }
7169     return bs->drv->bdrv_amend_options(bs, opts, status_cb,
7170                                        cb_opaque, force, errp);
7171 }
7172 
7173 /*
7174  * This function checks whether the given @to_replace is allowed to be
7175  * replaced by a node that always shows the same data as @bs.  This is
7176  * used for example to verify whether the mirror job can replace
7177  * @to_replace by the target mirrored from @bs.
7178  * To be replaceable, @bs and @to_replace may either be guaranteed to
7179  * always show the same data (because they are only connected through
7180  * filters), or some driver may allow replacing one of its children
7181  * because it can guarantee that this child's data is not visible at
7182  * all (for example, for dissenting quorum children that have no other
7183  * parents).
7184  */
7185 bool bdrv_recurse_can_replace(BlockDriverState *bs,
7186                               BlockDriverState *to_replace)
7187 {
7188     BlockDriverState *filtered;
7189 
7190     if (!bs || !bs->drv) {
7191         return false;
7192     }
7193 
7194     if (bs == to_replace) {
7195         return true;
7196     }
7197 
7198     /* See what the driver can do */
7199     if (bs->drv->bdrv_recurse_can_replace) {
7200         return bs->drv->bdrv_recurse_can_replace(bs, to_replace);
7201     }
7202 
7203     /* For filters without an own implementation, we can recurse on our own */
7204     filtered = bdrv_filter_bs(bs);
7205     if (filtered) {
7206         return bdrv_recurse_can_replace(filtered, to_replace);
7207     }
7208 
7209     /* Safe default */
7210     return false;
7211 }
7212 
7213 /*
7214  * Check whether the given @node_name can be replaced by a node that
7215  * has the same data as @parent_bs.  If so, return @node_name's BDS;
7216  * NULL otherwise.
7217  *
7218  * @node_name must be a (recursive) *child of @parent_bs (or this
7219  * function will return NULL).
7220  *
7221  * The result (whether the node can be replaced or not) is only valid
7222  * for as long as no graph or permission changes occur.
7223  */
7224 BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
7225                                         const char *node_name, Error **errp)
7226 {
7227     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
7228     AioContext *aio_context;
7229 
7230     if (!to_replace_bs) {
7231         error_setg(errp, "Failed to find node with node-name='%s'", node_name);
7232         return NULL;
7233     }
7234 
7235     aio_context = bdrv_get_aio_context(to_replace_bs);
7236     aio_context_acquire(aio_context);
7237 
7238     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
7239         to_replace_bs = NULL;
7240         goto out;
7241     }
7242 
7243     /* We don't want arbitrary node of the BDS chain to be replaced only the top
7244      * most non filter in order to prevent data corruption.
7245      * Another benefit is that this tests exclude backing files which are
7246      * blocked by the backing blockers.
7247      */
7248     if (!bdrv_recurse_can_replace(parent_bs, to_replace_bs)) {
7249         error_setg(errp, "Cannot replace '%s' by a node mirrored from '%s', "
7250                    "because it cannot be guaranteed that doing so would not "
7251                    "lead to an abrupt change of visible data",
7252                    node_name, parent_bs->node_name);
7253         to_replace_bs = NULL;
7254         goto out;
7255     }
7256 
7257 out:
7258     aio_context_release(aio_context);
7259     return to_replace_bs;
7260 }
7261 
7262 /**
7263  * Iterates through the list of runtime option keys that are said to
7264  * be "strong" for a BDS.  An option is called "strong" if it changes
7265  * a BDS's data.  For example, the null block driver's "size" and
7266  * "read-zeroes" options are strong, but its "latency-ns" option is
7267  * not.
7268  *
7269  * If a key returned by this function ends with a dot, all options
7270  * starting with that prefix are strong.
7271  */
7272 static const char *const *strong_options(BlockDriverState *bs,
7273                                          const char *const *curopt)
7274 {
7275     static const char *const global_options[] = {
7276         "driver", "filename", NULL
7277     };
7278 
7279     if (!curopt) {
7280         return &global_options[0];
7281     }
7282 
7283     curopt++;
7284     if (curopt == &global_options[ARRAY_SIZE(global_options) - 1] && bs->drv) {
7285         curopt = bs->drv->strong_runtime_opts;
7286     }
7287 
7288     return (curopt && *curopt) ? curopt : NULL;
7289 }
7290 
7291 /**
7292  * Copies all strong runtime options from bs->options to the given
7293  * QDict.  The set of strong option keys is determined by invoking
7294  * strong_options().
7295  *
7296  * Returns true iff any strong option was present in bs->options (and
7297  * thus copied to the target QDict) with the exception of "filename"
7298  * and "driver".  The caller is expected to use this value to decide
7299  * whether the existence of strong options prevents the generation of
7300  * a plain filename.
7301  */
7302 static bool append_strong_runtime_options(QDict *d, BlockDriverState *bs)
7303 {
7304     bool found_any = false;
7305     const char *const *option_name = NULL;
7306 
7307     if (!bs->drv) {
7308         return false;
7309     }
7310 
7311     while ((option_name = strong_options(bs, option_name))) {
7312         bool option_given = false;
7313 
7314         assert(strlen(*option_name) > 0);
7315         if ((*option_name)[strlen(*option_name) - 1] != '.') {
7316             QObject *entry = qdict_get(bs->options, *option_name);
7317             if (!entry) {
7318                 continue;
7319             }
7320 
7321             qdict_put_obj(d, *option_name, qobject_ref(entry));
7322             option_given = true;
7323         } else {
7324             const QDictEntry *entry;
7325             for (entry = qdict_first(bs->options); entry;
7326                  entry = qdict_next(bs->options, entry))
7327             {
7328                 if (strstart(qdict_entry_key(entry), *option_name, NULL)) {
7329                     qdict_put_obj(d, qdict_entry_key(entry),
7330                                   qobject_ref(qdict_entry_value(entry)));
7331                     option_given = true;
7332                 }
7333             }
7334         }
7335 
7336         /* While "driver" and "filename" need to be included in a JSON filename,
7337          * their existence does not prohibit generation of a plain filename. */
7338         if (!found_any && option_given &&
7339             strcmp(*option_name, "driver") && strcmp(*option_name, "filename"))
7340         {
7341             found_any = true;
7342         }
7343     }
7344 
7345     if (!qdict_haskey(d, "driver")) {
7346         /* Drivers created with bdrv_new_open_driver() may not have a
7347          * @driver option.  Add it here. */
7348         qdict_put_str(d, "driver", bs->drv->format_name);
7349     }
7350 
7351     return found_any;
7352 }
7353 
7354 /* Note: This function may return false positives; it may return true
7355  * even if opening the backing file specified by bs's image header
7356  * would result in exactly bs->backing. */
7357 bool bdrv_backing_overridden(BlockDriverState *bs)
7358 {
7359     if (bs->backing) {
7360         return strcmp(bs->auto_backing_file,
7361                       bs->backing->bs->filename);
7362     } else {
7363         /* No backing BDS, so if the image header reports any backing
7364          * file, it must have been suppressed */
7365         return bs->auto_backing_file[0] != '\0';
7366     }
7367 }
7368 
7369 /* Updates the following BDS fields:
7370  *  - exact_filename: A filename which may be used for opening a block device
7371  *                    which (mostly) equals the given BDS (even without any
7372  *                    other options; so reading and writing must return the same
7373  *                    results, but caching etc. may be different)
7374  *  - full_open_options: Options which, when given when opening a block device
7375  *                       (without a filename), result in a BDS (mostly)
7376  *                       equalling the given one
7377  *  - filename: If exact_filename is set, it is copied here. Otherwise,
7378  *              full_open_options is converted to a JSON object, prefixed with
7379  *              "json:" (for use through the JSON pseudo protocol) and put here.
7380  */
7381 void bdrv_refresh_filename(BlockDriverState *bs)
7382 {
7383     BlockDriver *drv = bs->drv;
7384     BdrvChild *child;
7385     BlockDriverState *primary_child_bs;
7386     QDict *opts;
7387     bool backing_overridden;
7388     bool generate_json_filename; /* Whether our default implementation should
7389                                     fill exact_filename (false) or not (true) */
7390 
7391     if (!drv) {
7392         return;
7393     }
7394 
7395     /* This BDS's file name may depend on any of its children's file names, so
7396      * refresh those first */
7397     QLIST_FOREACH(child, &bs->children, next) {
7398         bdrv_refresh_filename(child->bs);
7399     }
7400 
7401     if (bs->implicit) {
7402         /* For implicit nodes, just copy everything from the single child */
7403         child = QLIST_FIRST(&bs->children);
7404         assert(QLIST_NEXT(child, next) == NULL);
7405 
7406         pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
7407                 child->bs->exact_filename);
7408         pstrcpy(bs->filename, sizeof(bs->filename), child->bs->filename);
7409 
7410         qobject_unref(bs->full_open_options);
7411         bs->full_open_options = qobject_ref(child->bs->full_open_options);
7412 
7413         return;
7414     }
7415 
7416     backing_overridden = bdrv_backing_overridden(bs);
7417 
7418     if (bs->open_flags & BDRV_O_NO_IO) {
7419         /* Without I/O, the backing file does not change anything.
7420          * Therefore, in such a case (primarily qemu-img), we can
7421          * pretend the backing file has not been overridden even if
7422          * it technically has been. */
7423         backing_overridden = false;
7424     }
7425 
7426     /* Gather the options QDict */
7427     opts = qdict_new();
7428     generate_json_filename = append_strong_runtime_options(opts, bs);
7429     generate_json_filename |= backing_overridden;
7430 
7431     if (drv->bdrv_gather_child_options) {
7432         /* Some block drivers may not want to present all of their children's
7433          * options, or name them differently from BdrvChild.name */
7434         drv->bdrv_gather_child_options(bs, opts, backing_overridden);
7435     } else {
7436         QLIST_FOREACH(child, &bs->children, next) {
7437             if (child == bs->backing && !backing_overridden) {
7438                 /* We can skip the backing BDS if it has not been overridden */
7439                 continue;
7440             }
7441 
7442             qdict_put(opts, child->name,
7443                       qobject_ref(child->bs->full_open_options));
7444         }
7445 
7446         if (backing_overridden && !bs->backing) {
7447             /* Force no backing file */
7448             qdict_put_null(opts, "backing");
7449         }
7450     }
7451 
7452     qobject_unref(bs->full_open_options);
7453     bs->full_open_options = opts;
7454 
7455     primary_child_bs = bdrv_primary_bs(bs);
7456 
7457     if (drv->bdrv_refresh_filename) {
7458         /* Obsolete information is of no use here, so drop the old file name
7459          * information before refreshing it */
7460         bs->exact_filename[0] = '\0';
7461 
7462         drv->bdrv_refresh_filename(bs);
7463     } else if (primary_child_bs) {
7464         /*
7465          * Try to reconstruct valid information from the underlying
7466          * file -- this only works for format nodes (filter nodes
7467          * cannot be probed and as such must be selected by the user
7468          * either through an options dict, or through a special
7469          * filename which the filter driver must construct in its
7470          * .bdrv_refresh_filename() implementation).
7471          */
7472 
7473         bs->exact_filename[0] = '\0';
7474 
7475         /*
7476          * We can use the underlying file's filename if:
7477          * - it has a filename,
7478          * - the current BDS is not a filter,
7479          * - the file is a protocol BDS, and
7480          * - opening that file (as this BDS's format) will automatically create
7481          *   the BDS tree we have right now, that is:
7482          *   - the user did not significantly change this BDS's behavior with
7483          *     some explicit (strong) options
7484          *   - no non-file child of this BDS has been overridden by the user
7485          *   Both of these conditions are represented by generate_json_filename.
7486          */
7487         if (primary_child_bs->exact_filename[0] &&
7488             primary_child_bs->drv->bdrv_file_open &&
7489             !drv->is_filter && !generate_json_filename)
7490         {
7491             strcpy(bs->exact_filename, primary_child_bs->exact_filename);
7492         }
7493     }
7494 
7495     if (bs->exact_filename[0]) {
7496         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
7497     } else {
7498         GString *json = qobject_to_json(QOBJECT(bs->full_open_options));
7499         if (snprintf(bs->filename, sizeof(bs->filename), "json:%s",
7500                      json->str) >= sizeof(bs->filename)) {
7501             /* Give user a hint if we truncated things. */
7502             strcpy(bs->filename + sizeof(bs->filename) - 4, "...");
7503         }
7504         g_string_free(json, true);
7505     }
7506 }
7507 
7508 char *bdrv_dirname(BlockDriverState *bs, Error **errp)
7509 {
7510     BlockDriver *drv = bs->drv;
7511     BlockDriverState *child_bs;
7512 
7513     if (!drv) {
7514         error_setg(errp, "Node '%s' is ejected", bs->node_name);
7515         return NULL;
7516     }
7517 
7518     if (drv->bdrv_dirname) {
7519         return drv->bdrv_dirname(bs, errp);
7520     }
7521 
7522     child_bs = bdrv_primary_bs(bs);
7523     if (child_bs) {
7524         return bdrv_dirname(child_bs, errp);
7525     }
7526 
7527     bdrv_refresh_filename(bs);
7528     if (bs->exact_filename[0] != '\0') {
7529         return path_combine(bs->exact_filename, "");
7530     }
7531 
7532     error_setg(errp, "Cannot generate a base directory for %s nodes",
7533                drv->format_name);
7534     return NULL;
7535 }
7536 
7537 /*
7538  * Hot add/remove a BDS's child. So the user can take a child offline when
7539  * it is broken and take a new child online
7540  */
7541 void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
7542                     Error **errp)
7543 {
7544 
7545     if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
7546         error_setg(errp, "The node %s does not support adding a child",
7547                    bdrv_get_device_or_node_name(parent_bs));
7548         return;
7549     }
7550 
7551     if (!QLIST_EMPTY(&child_bs->parents)) {
7552         error_setg(errp, "The node %s already has a parent",
7553                    child_bs->node_name);
7554         return;
7555     }
7556 
7557     parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
7558 }
7559 
7560 void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp)
7561 {
7562     BdrvChild *tmp;
7563 
7564     if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
7565         error_setg(errp, "The node %s does not support removing a child",
7566                    bdrv_get_device_or_node_name(parent_bs));
7567         return;
7568     }
7569 
7570     QLIST_FOREACH(tmp, &parent_bs->children, next) {
7571         if (tmp == child) {
7572             break;
7573         }
7574     }
7575 
7576     if (!tmp) {
7577         error_setg(errp, "The node %s does not have a child named %s",
7578                    bdrv_get_device_or_node_name(parent_bs),
7579                    bdrv_get_device_or_node_name(child->bs));
7580         return;
7581     }
7582 
7583     parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
7584 }
7585 
7586 int bdrv_make_empty(BdrvChild *c, Error **errp)
7587 {
7588     BlockDriver *drv = c->bs->drv;
7589     int ret;
7590 
7591     assert(c->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED));
7592 
7593     if (!drv->bdrv_make_empty) {
7594         error_setg(errp, "%s does not support emptying nodes",
7595                    drv->format_name);
7596         return -ENOTSUP;
7597     }
7598 
7599     ret = drv->bdrv_make_empty(c->bs);
7600     if (ret < 0) {
7601         error_setg_errno(errp, -ret, "Failed to empty %s",
7602                          c->bs->filename);
7603         return ret;
7604     }
7605 
7606     return 0;
7607 }
7608 
7609 /*
7610  * Return the child that @bs acts as an overlay for, and from which data may be
7611  * copied in COW or COR operations.  Usually this is the backing file.
7612  */
7613 BdrvChild *bdrv_cow_child(BlockDriverState *bs)
7614 {
7615     if (!bs || !bs->drv) {
7616         return NULL;
7617     }
7618 
7619     if (bs->drv->is_filter) {
7620         return NULL;
7621     }
7622 
7623     if (!bs->backing) {
7624         return NULL;
7625     }
7626 
7627     assert(bs->backing->role & BDRV_CHILD_COW);
7628     return bs->backing;
7629 }
7630 
7631 /*
7632  * If @bs acts as a filter for exactly one of its children, return
7633  * that child.
7634  */
7635 BdrvChild *bdrv_filter_child(BlockDriverState *bs)
7636 {
7637     BdrvChild *c;
7638 
7639     if (!bs || !bs->drv) {
7640         return NULL;
7641     }
7642 
7643     if (!bs->drv->is_filter) {
7644         return NULL;
7645     }
7646 
7647     /* Only one of @backing or @file may be used */
7648     assert(!(bs->backing && bs->file));
7649 
7650     c = bs->backing ?: bs->file;
7651     if (!c) {
7652         return NULL;
7653     }
7654 
7655     assert(c->role & BDRV_CHILD_FILTERED);
7656     return c;
7657 }
7658 
7659 /*
7660  * Return either the result of bdrv_cow_child() or bdrv_filter_child(),
7661  * whichever is non-NULL.
7662  *
7663  * Return NULL if both are NULL.
7664  */
7665 BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
7666 {
7667     BdrvChild *cow_child = bdrv_cow_child(bs);
7668     BdrvChild *filter_child = bdrv_filter_child(bs);
7669 
7670     /* Filter nodes cannot have COW backing files */
7671     assert(!(cow_child && filter_child));
7672 
7673     return cow_child ?: filter_child;
7674 }
7675 
7676 /*
7677  * Return the primary child of this node: For filters, that is the
7678  * filtered child.  For other nodes, that is usually the child storing
7679  * metadata.
7680  * (A generally more helpful description is that this is (usually) the
7681  * child that has the same filename as @bs.)
7682  *
7683  * Drivers do not necessarily have a primary child; for example quorum
7684  * does not.
7685  */
7686 BdrvChild *bdrv_primary_child(BlockDriverState *bs)
7687 {
7688     BdrvChild *c, *found = NULL;
7689 
7690     QLIST_FOREACH(c, &bs->children, next) {
7691         if (c->role & BDRV_CHILD_PRIMARY) {
7692             assert(!found);
7693             found = c;
7694         }
7695     }
7696 
7697     return found;
7698 }
7699 
7700 static BlockDriverState *bdrv_do_skip_filters(BlockDriverState *bs,
7701                                               bool stop_on_explicit_filter)
7702 {
7703     BdrvChild *c;
7704 
7705     if (!bs) {
7706         return NULL;
7707     }
7708 
7709     while (!(stop_on_explicit_filter && !bs->implicit)) {
7710         c = bdrv_filter_child(bs);
7711         if (!c) {
7712             /*
7713              * A filter that is embedded in a working block graph must
7714              * have a child.  Assert this here so this function does
7715              * not return a filter node that is not expected by the
7716              * caller.
7717              */
7718             assert(!bs->drv || !bs->drv->is_filter);
7719             break;
7720         }
7721         bs = c->bs;
7722     }
7723     /*
7724      * Note that this treats nodes with bs->drv == NULL as not being
7725      * filters (bs->drv == NULL should be replaced by something else
7726      * anyway).
7727      * The advantage of this behavior is that this function will thus
7728      * always return a non-NULL value (given a non-NULL @bs).
7729      */
7730 
7731     return bs;
7732 }
7733 
7734 /*
7735  * Return the first BDS that has not been added implicitly or that
7736  * does not have a filtered child down the chain starting from @bs
7737  * (including @bs itself).
7738  */
7739 BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
7740 {
7741     return bdrv_do_skip_filters(bs, true);
7742 }
7743 
7744 /*
7745  * Return the first BDS that does not have a filtered child down the
7746  * chain starting from @bs (including @bs itself).
7747  */
7748 BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
7749 {
7750     return bdrv_do_skip_filters(bs, false);
7751 }
7752 
7753 /*
7754  * For a backing chain, return the first non-filter backing image of
7755  * the first non-filter image.
7756  */
7757 BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
7758 {
7759     return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
7760 }
7761 
7762 /**
7763  * Check whether [offset, offset + bytes) overlaps with the cached
7764  * block-status data region.
7765  *
7766  * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
7767  * which is what bdrv_bsc_is_data()'s interface needs.
7768  * Otherwise, *pnum is not touched.
7769  */
7770 static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
7771                                            int64_t offset, int64_t bytes,
7772                                            int64_t *pnum)
7773 {
7774     BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
7775     bool overlaps;
7776 
7777     overlaps =
7778         qatomic_read(&bsc->valid) &&
7779         ranges_overlap(offset, bytes, bsc->data_start,
7780                        bsc->data_end - bsc->data_start);
7781 
7782     if (overlaps && pnum) {
7783         *pnum = bsc->data_end - offset;
7784     }
7785 
7786     return overlaps;
7787 }
7788 
7789 /**
7790  * See block_int.h for this function's documentation.
7791  */
7792 bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
7793 {
7794     RCU_READ_LOCK_GUARD();
7795 
7796     return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
7797 }
7798 
7799 /**
7800  * See block_int.h for this function's documentation.
7801  */
7802 void bdrv_bsc_invalidate_range(BlockDriverState *bs,
7803                                int64_t offset, int64_t bytes)
7804 {
7805     RCU_READ_LOCK_GUARD();
7806 
7807     if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
7808         qatomic_set(&bs->block_status_cache->valid, false);
7809     }
7810 }
7811 
7812 /**
7813  * See block_int.h for this function's documentation.
7814  */
7815 void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
7816 {
7817     BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
7818     BdrvBlockStatusCache *old_bsc;
7819 
7820     *new_bsc = (BdrvBlockStatusCache) {
7821         .valid = true,
7822         .data_start = offset,
7823         .data_end = offset + bytes,
7824     };
7825 
7826     QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
7827 
7828     old_bsc = qatomic_rcu_read(&bs->block_status_cache);
7829     qatomic_rcu_set(&bs->block_status_cache, new_bsc);
7830     if (old_bsc) {
7831         g_free_rcu(old_bsc, rcu);
7832     }
7833 }
7834