/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}
/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
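
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * code): a throttle limit is enabled first and configured afterwards.
 * The ThrottleConfig contents are elided since its fields are not shown
 * in this file:
 *
 *     ThrottleConfig cfg;
 *     ... fill in cfg with the desired limits ...
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */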

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue the next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
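
/*
 * Illustrative sketch (added commentary, not part of the original code):
 * callers can use the reported alignment when allocating I/O buffers,
 * e.g. with qemu_memalign() for a hypothetical length len:
 *
 *     void *buf = qemu_memalign(bdrv_opt_mem_align(bs), len);
 */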

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
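
/*
 * Examples (added commentary, not part of the original code):
 *
 *     path_has_protocol("nbd:unix:/tmp/sock")  -> 1
 *     path_has_protocol("/var/lib/images/a")   -> 0  (starts with '/')
 *     path_has_protocol("relative/name")       -> 0  ('/' comes before ':')
 */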

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
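
/*
 * Examples (added commentary, not part of the original code):
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/vm/base.qcow2", "snap.qcow2");
 *     -> dest is "/vm/snap.qcow2" (relative name, resolved next to base)
 *     path_combine(dest, sizeof(dest), "/vm/base.qcow2", "/abs/img.raw");
 *     -> dest is "/abs/img.raw" (absolute name, copied as-is)
 */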

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
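
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * code): creating an image with an explicitly chosen driver.  The option
 * list is assumed to have been built elsewhere, e.g. with
 * parse_option_parameters() as in bdrv_append_temp_snapshot() below:
 *
 *     Error *err = NULL;
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     int ret = bdrv_create(drv, "/tmp/test.qcow2", create_options, &err);
 */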

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
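
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * code), mirroring the call in bdrv_append_temp_snapshot() below:
 *
 *     char tmp_filename[PATH_MAX];
 *     int ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 *     if (ret < 0) {
 *         ... report the error with strerror(-ret) ...
 *     }
 */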

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
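
/*
 * Examples (added commentary, not part of the original code):
 *
 *     int flags = 0;
 *     bdrv_parse_discard_flags("unmap", &flags);   -> BDRV_O_UNMAP is set
 *     bdrv_parse_discard_flags("ignore", &flags);  -> BDRV_O_UNMAP is cleared
 *     bdrv_parse_discard_flags("bogus", &flags);   -> returns -1
 */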

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
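
/*
 * Summary of the resulting flags per cache mode (added commentary, not part
 * of the original code):
 *
 *     "none"/"off"    -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     "directsync"    -> BDRV_O_NOCACHE
 *     "writeback"     -> BDRV_O_CACHE_WB
 *     "unsafe"        -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *     "writethrough"  -> no cache flags set (the default)
 */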

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /* The backing file of a temporary snapshot is read-only */
    if (flags & BDRV_O_SNAPSHOT) {
        open_flags &= ~BDRV_O_RDWR;
    }

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. This
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool parse_filename = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        parse_filename = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, parse_filename);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files are always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
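
/*
 * Illustrative sketch (added commentary, not part of the original code):
 * opening the protocol layer for a "file" BlockdevRef given as flattened
 * options, in the spirit of the call in bdrv_open() below:
 *
 *     BlockDriverState *file = NULL;
 *     ret = bdrv_open_image(&file, filename, options, "file",
 *                           flags | BDRV_O_PROTOCOL, true, &local_err);
 */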

void bdrv_append_temp_snapshot(BlockDriverState *bs, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];

    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QEMUOptionParameter *create_options;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        error_setg_errno(errp, -total_size, "Could not get image size");
        return;
    }
    total_size &= BDRV_SECTOR_MASK;

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        return;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                             NULL);

    set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

    ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
    free_option_parameters(create_options);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        return;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("", &error_abort);
    bs_snapshot->is_temporary = 1;

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    bs->open_flags & ~BDRV_O_SNAPSHOT, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return;
    }

    bdrv_append(bs_snapshot, bs);
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP) |
                          BDRV_O_PROTOCOL, true, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (flags & BDRV_O_SNAPSHOT) {
        bdrv_append_temp_snapshot(bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS always has to be closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
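
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * code): opening a new image read-write and letting the format be probed;
 * *pbs must be NULL here so that a fresh BlockDriverState is created:
 *
 *     BlockDriverState *bs = NULL;
 *     Error *err = NULL;
 *     int ret = bdrv_open(&bs, "/tmp/test.qcow2", NULL, NULL,
 *                         BDRV_O_RDWR, NULL, &err);
 */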

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call bdrv_reopen_abort()
 * or bdrv_reopen_commit() for any other BDS that have been left in a
 * prepare() state
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

1700 void bdrv_close(BlockDriverState *bs)
1701 {
1702     if (bs->job) {
1703         block_job_cancel_sync(bs->job);
1704     }
1705     bdrv_drain_all(); /* complete I/O */
1706     bdrv_flush(bs);
1707     bdrv_drain_all(); /* in case flush left pending I/O */
1708     notifier_list_notify(&bs->close_notifiers, bs);
1709 
1710     if (bs->drv) {
1711         if (bs->backing_hd) {
1712             bdrv_unref(bs->backing_hd);
1713             bs->backing_hd = NULL;
1714         }
1715         bs->drv->bdrv_close(bs);
1716         g_free(bs->opaque);
1717 #ifdef _WIN32
1718         if (bs->is_temporary) {
1719             unlink(bs->filename);
1720         }
1721 #endif
1722         bs->opaque = NULL;
1723         bs->drv = NULL;
1724         bs->copy_on_read = 0;
1725         bs->backing_file[0] = '\0';
1726         bs->backing_format[0] = '\0';
1727         bs->total_sectors = 0;
1728         bs->encrypted = 0;
1729         bs->valid_key = 0;
1730         bs->sg = 0;
1731         bs->growable = 0;
1732         bs->zero_beyond_eof = false;
1733         QDECREF(bs->options);
1734         bs->options = NULL;
1735 
1736         if (bs->file != NULL) {
1737             bdrv_unref(bs->file);
1738             bs->file = NULL;
1739         }
1740     }
1741 
1742     bdrv_dev_change_media_cb(bs, false);
1743 
1744     /* disable I/O throttling limits if they are enabled */
1745     if (bs->io_limits_enabled) {
1746         bdrv_io_limits_disable(bs);
1747     }
1748 }
1749 
1750 void bdrv_close_all(void)
1751 {
1752     BlockDriverState *bs;
1753 
1754     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1755         bdrv_close(bs);
1756     }
1757 }
1758 
1759 /* Check if any requests are in-flight (including throttled requests) */
1760 static bool bdrv_requests_pending(BlockDriverState *bs)
1761 {
1762     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1763         return true;
1764     }
1765     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1766         return true;
1767     }
1768     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1769         return true;
1770     }
1771     if (bs->file && bdrv_requests_pending(bs->file)) {
1772         return true;
1773     }
1774     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1775         return true;
1776     }
1777     return false;
1778 }
1779 
1780 static bool bdrv_requests_pending_all(void)
1781 {
1782     BlockDriverState *bs;
1783     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1784         if (bdrv_requests_pending(bs)) {
1785             return true;
1786         }
1787     }
1788     return false;
1789 }
1790 
1791 /*
1792  * Wait for pending requests to complete across all BlockDriverStates
1793  *
1794  * This function does not flush data to disk, use bdrv_flush_all() for that
1795  * after calling this function.
1796  *
1797  * Note that completion of an asynchronous I/O operation can trigger any
1798  * number of other I/O operations on other devices---for example a coroutine
1799  * can be arbitrarily complex and a constant flow of I/O can come until the
1800  * coroutine is complete.  Because of this, it is not possible to have a
1801  * function to drain a single device's I/O queue.
1802  */
1803 void bdrv_drain_all(void)
1804 {
1805     /* Always run first iteration so any pending completion BHs run */
1806     bool busy = true;
1807     BlockDriverState *bs;
1808 
1809     while (busy) {
1810         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1811             bdrv_start_throttled_reqs(bs);
1812         }
1813 
1814         busy = bdrv_requests_pending_all();
1815         busy |= aio_poll(qemu_get_aio_context(), busy);
1816     }
1817 }
1818 
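/*
 * Usage sketch for the note above (assumes main-loop context, outside of
 * coroutine context):
 *
 *     bdrv_drain_all();   // wait for all in-flight requests to complete
 *     bdrv_flush_all();   // then flush the completed writes to disk
 */
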
1819 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
1820  * and graph_bdrv_states lists.  Also, empty device_name and node_name
1821  * to prevent a double remove. */
1822 void bdrv_make_anon(BlockDriverState *bs)
1823 {
1824     if (bs->device_name[0] != '\0') {
1825         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1826     }
1827     bs->device_name[0] = '\0';
1828     if (bs->node_name[0] != '\0') {
1829         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1830     }
1831     bs->node_name[0] = '\0';
1832 }
1833 
1834 static void bdrv_rebind(BlockDriverState *bs)
1835 {
1836     if (bs->drv && bs->drv->bdrv_rebind) {
1837         bs->drv->bdrv_rebind(bs);
1838     }
1839 }
1840 
1841 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1842                                      BlockDriverState *bs_src)
1843 {
1844     /* move some fields that need to stay attached to the device */
1845     bs_dest->open_flags         = bs_src->open_flags;
1846 
1847     /* dev info */
1848     bs_dest->dev_ops            = bs_src->dev_ops;
1849     bs_dest->dev_opaque         = bs_src->dev_opaque;
1850     bs_dest->dev                = bs_src->dev;
1851     bs_dest->guest_block_size   = bs_src->guest_block_size;
1852     bs_dest->copy_on_read       = bs_src->copy_on_read;
1853 
1854     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1855 
1856     /* i/o throttled req */
1857     memcpy(&bs_dest->throttle_state,
1858            &bs_src->throttle_state,
1859            sizeof(ThrottleState));
1860     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1861     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1862     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1863 
1864     /* r/w error */
1865     bs_dest->on_read_error      = bs_src->on_read_error;
1866     bs_dest->on_write_error     = bs_src->on_write_error;
1867 
1868     /* i/o status */
1869     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1870     bs_dest->iostatus           = bs_src->iostatus;
1871 
1872     /* dirty bitmap */
1873     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1874 
1875     /* reference count */
1876     bs_dest->refcnt             = bs_src->refcnt;
1877 
1878     /* job */
1879     bs_dest->in_use             = bs_src->in_use;
1880     bs_dest->job                = bs_src->job;
1881 
1882     /* keep the same entry in bdrv_states */
1883     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1884             bs_src->device_name);
1885     bs_dest->device_list = bs_src->device_list;
1886 }
1887 
1888 /*
1889  * Swap bs contents for two image chains while they are live,
1890  * while keeping required fields on the BlockDriverState that is
1891  * actually attached to a device.
1892  *
1893  * This will modify the BlockDriverState fields, and swap contents
1894  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1895  *
1896  * bs_new is required to be anonymous.
1897  *
1898  * This function does not create any image files.
1899  */
1900 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1901 {
1902     BlockDriverState tmp;
1903 
1904     /* The code needs to swap the node_name, but simply swapping node_list
1905      * won't work, so first remove the nodes from the graph list, do the
1906      * swap, and then insert them back if needed.
1907      */
1908     if (bs_new->node_name[0] != '\0') {
1909         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
1910     }
1911     if (bs_old->node_name[0] != '\0') {
1912         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
1913     }
1914 
1915     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1916     assert(bs_new->device_name[0] == '\0');
1917     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1918     assert(bs_new->job == NULL);
1919     assert(bs_new->dev == NULL);
1920     assert(bs_new->in_use == 0);
1921     assert(bs_new->io_limits_enabled == false);
1922     assert(!throttle_have_timer(&bs_new->throttle_state));
1923 
1924     tmp = *bs_new;
1925     *bs_new = *bs_old;
1926     *bs_old = tmp;
1927 
1928     /* there are some fields that should not be swapped; move them back */
1929     bdrv_move_feature_fields(&tmp, bs_old);
1930     bdrv_move_feature_fields(bs_old, bs_new);
1931     bdrv_move_feature_fields(bs_new, &tmp);
1932 
1933     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1934     assert(bs_new->device_name[0] == '\0');
1935 
1936     /* Check a few fields that should remain attached to the device */
1937     assert(bs_new->dev == NULL);
1938     assert(bs_new->job == NULL);
1939     assert(bs_new->in_use == 0);
1940     assert(bs_new->io_limits_enabled == false);
1941     assert(!throttle_have_timer(&bs_new->throttle_state));
1942 
1943     /* insert the nodes back into the graph node list if needed */
1944     if (bs_new->node_name[0] != '\0') {
1945         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
1946     }
1947     if (bs_old->node_name[0] != '\0') {
1948         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
1949     }
1950 
1951     bdrv_rebind(bs_new);
1952     bdrv_rebind(bs_old);
1953 }
1954 
1955 /*
1956  * Add new bs contents at the top of an image chain while the chain is
1957  * live, while keeping required fields on the top layer.
1958  *
1959  * This will modify the BlockDriverState fields, and swap contents
1960  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1961  *
1962  * bs_new is required to be anonymous.
1963  *
1964  * This function does not create any image files.
1965  */
1966 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1967 {
1968     bdrv_swap(bs_new, bs_top);
1969 
1970     /* After the swap, bs_new holds the old contents of bs_top, so it
1971      * becomes the backing file of the new top layer. */
1972     bs_top->backing_hd = bs_new;
1973     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1974     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1975             bs_new->filename);
1976     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1977             bs_new->drv ? bs_new->drv->format_name : "");
1978 }
1979 
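/*
 * Minimal sketch of the external-snapshot pattern built on bdrv_append()
 * (the setup of 'new_bs' is elided; the real QMP implementation lives
 * outside this file):
 *
 *     // given: new_bs, an anonymous BDS opened with 'bs' as its backing
 *     // file and the same guest-visible size
 *     bdrv_append(new_bs, bs);
 *     // 'bs' keeps its device attachment and device_name; its previous
 *     // contents are now reachable as bs->backing_hd
 */
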
1980 static void bdrv_delete(BlockDriverState *bs)
1981 {
1982     assert(!bs->dev);
1983     assert(!bs->job);
1984     assert(!bs->in_use);
1985     assert(!bs->refcnt);
1986     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1987 
1988     bdrv_close(bs);
1989 
1990     /* remove from list, if necessary */
1991     bdrv_make_anon(bs);
1992 
1993     g_free(bs);
1994 }
1995 
1996 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1997 /* TODO change to DeviceState *dev when all users are qdevified */
1998 {
1999     if (bs->dev) {
2000         return -EBUSY;
2001     }
2002     bs->dev = dev;
2003     bdrv_iostatus_reset(bs);
2004     return 0;
2005 }
2006 
2007 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2008 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2009 {
2010     if (bdrv_attach_dev(bs, dev) < 0) {
2011         abort();
2012     }
2013 }
2014 
2015 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2016 /* TODO change to DeviceState *dev when all users are qdevified */
2017 {
2018     assert(bs->dev == dev);
2019     bs->dev = NULL;
2020     bs->dev_ops = NULL;
2021     bs->dev_opaque = NULL;
2022     bs->guest_block_size = 512;
2023 }
2024 
2025 /* TODO change to return DeviceState * when all users are qdevified */
2026 void *bdrv_get_attached_dev(BlockDriverState *bs)
2027 {
2028     return bs->dev;
2029 }
2030 
2031 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2032                       void *opaque)
2033 {
2034     bs->dev_ops = ops;
2035     bs->dev_opaque = opaque;
2036 }
2037 
2038 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2039                                enum MonitorEvent ev,
2040                                BlockErrorAction action, bool is_read)
2041 {
2042     QObject *data;
2043     const char *action_str;
2044 
2045     switch (action) {
2046     case BDRV_ACTION_REPORT:
2047         action_str = "report";
2048         break;
2049     case BDRV_ACTION_IGNORE:
2050         action_str = "ignore";
2051         break;
2052     case BDRV_ACTION_STOP:
2053         action_str = "stop";
2054         break;
2055     default:
2056         abort();
2057     }
2058 
2059     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2060                               bdrv->device_name,
2061                               action_str,
2062                               is_read ? "read" : "write");
2063     monitor_protocol_event(ev, data);
2064 
2065     qobject_decref(data);
2066 }
2067 
2068 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2069 {
2070     QObject *data;
2071 
2072     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2073                               bdrv_get_device_name(bs), ejected);
2074     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2075 
2076     qobject_decref(data);
2077 }
2078 
2079 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2080 {
2081     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2082         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2083         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2084         if (tray_was_closed) {
2085             /* tray open */
2086             bdrv_emit_qmp_eject_event(bs, true);
2087         }
2088         if (load) {
2089             /* tray close */
2090             bdrv_emit_qmp_eject_event(bs, false);
2091         }
2092     }
2093 }
2094 
2095 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2096 {
2097     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2098 }
2099 
2100 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2101 {
2102     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2103         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2104     }
2105 }
2106 
2107 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2108 {
2109     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2110         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2111     }
2112     return false;
2113 }
2114 
2115 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2116 {
2117     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2118         bs->dev_ops->resize_cb(bs->dev_opaque);
2119     }
2120 }
2121 
2122 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2123 {
2124     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2125         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2126     }
2127     return false;
2128 }
2129 
2130 /*
2131  * Run consistency checks on an image
2132  *
2133  * Returns 0 if the check could be completed (it doesn't mean that the image is
2134  * free of errors) or -errno when an internal error occurred. The results of the
2135  * check are stored in res.
2136  */
2137 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2138 {
2139     if (bs->drv->bdrv_check == NULL) {
2140         return -ENOTSUP;
2141     }
2142 
2143     memset(res, 0, sizeof(*res));
2144     return bs->drv->bdrv_check(bs, res, fix);
2145 }
2146 
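/*
 * Usage sketch; per the contract above, a zero return only means the check
 * ran, so the result fields must be inspected (field names as declared in
 * BdrvCheckResult):
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, BDRV_FIX_ERRORS);
 *     if (ret < 0) {
 *         // -ENOTSUP or an internal error: no verdict on the image
 *     } else if (result.corruptions || result.check_errors) {
 *         // the image is (still) inconsistent
 *     }
 */
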
2147 #define COMMIT_BUF_SECTORS 2048
2148 
2149 /* commit the COW image contents into its backing file */
2150 int bdrv_commit(BlockDriverState *bs)
2151 {
2152     BlockDriver *drv = bs->drv;
2153     int64_t sector, total_sectors, length, backing_length;
2154     int n, ro, open_flags;
2155     int ret = 0;
2156     uint8_t *buf = NULL;
2157     char filename[PATH_MAX];
2158 
2159     if (!drv) {
2160         return -ENOMEDIUM;
2161     }
2162     if (!bs->backing_hd) {
2163         return -ENOTSUP;
2164     }
2165 
2166     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2167         return -EBUSY;
2168     }
2169 
2170     ro = bs->backing_hd->read_only;
2171     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2172     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2173     open_flags =  bs->backing_hd->open_flags;
2174 
2175     if (ro) {
2176         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2177             return -EACCES;
2178         }
2179     }
2180 
2181     length = bdrv_getlength(bs);
2182     if (length < 0) {
2183         ret = length;
2184         goto ro_cleanup;
2185     }
2186 
2187     backing_length = bdrv_getlength(bs->backing_hd);
2188     if (backing_length < 0) {
2189         ret = backing_length;
2190         goto ro_cleanup;
2191     }
2192 
2193     /* If our top snapshot is larger than the backing file image,
2194      * grow the backing file image if possible.  If not possible,
2195      * we must return an error */
2196     if (length > backing_length) {
2197         ret = bdrv_truncate(bs->backing_hd, length);
2198         if (ret < 0) {
2199             goto ro_cleanup;
2200         }
2201     }
2202 
2203     total_sectors = length >> BDRV_SECTOR_BITS;
2204     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2205 
2206     for (sector = 0; sector < total_sectors; sector += n) {
2207         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2208         if (ret < 0) {
2209             goto ro_cleanup;
2210         }
2211         if (ret) {
2212             ret = bdrv_read(bs, sector, buf, n);
2213             if (ret < 0) {
2214                 goto ro_cleanup;
2215             }
2216 
2217             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2218             if (ret < 0) {
2219                 goto ro_cleanup;
2220             }
2221         }
2222     }
2223 
2224     if (drv->bdrv_make_empty) {
2225         ret = drv->bdrv_make_empty(bs);
2226         if (ret < 0) {
2227             goto ro_cleanup;
2228         }
2229         bdrv_flush(bs);
2230     }
2231 
2232     /*
2233      * Make sure all data we wrote to the backing device is actually
2234      * stable on disk.
2235      */
2236     if (bs->backing_hd) {
2237         bdrv_flush(bs->backing_hd);
2238     }
2239 
2240     ret = 0;
2241 ro_cleanup:
2242     g_free(buf);
2243 
2244     if (ro) {
2245         /* ignoring error return here */
2246         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2247     }
2248 
2249     return ret;
2250 }
2251 
2252 int bdrv_commit_all(void)
2253 {
2254     BlockDriverState *bs;
2255 
2256     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2257         if (bs->drv && bs->backing_hd) {
2258             int ret = bdrv_commit(bs);
2259             if (ret < 0) {
2260                 return ret;
2261             }
2262         }
2263     }
2264     return 0;
2265 }
2266 
2267 /**
2268  * Remove an active request from the tracked requests list
2269  *
2270  * This function should be called when a tracked request is completing.
2271  */
2272 static void tracked_request_end(BdrvTrackedRequest *req)
2273 {
2274     if (req->serialising) {
2275         req->bs->serialising_in_flight--;
2276     }
2277 
2278     QLIST_REMOVE(req, list);
2279     qemu_co_queue_restart_all(&req->wait_queue);
2280 }
2281 
2282 /**
2283  * Add an active request to the tracked requests list
2284  */
2285 static void tracked_request_begin(BdrvTrackedRequest *req,
2286                                   BlockDriverState *bs,
2287                                   int64_t offset,
2288                                   unsigned int bytes, bool is_write)
2289 {
2290     *req = (BdrvTrackedRequest){
2291         .bs = bs,
2292         .offset         = offset,
2293         .bytes          = bytes,
2294         .is_write       = is_write,
2295         .co             = qemu_coroutine_self(),
2296         .serialising    = false,
2297         .overlap_offset = offset,
2298         .overlap_bytes  = bytes,
2299     };
2300 
2301     qemu_co_queue_init(&req->wait_queue);
2302 
2303     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2304 }
2305 
2306 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2307 {
2308     int64_t overlap_offset = req->offset & ~(align - 1);
2309     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2310                                - overlap_offset;
2311 
2312     if (!req->serialising) {
2313         req->bs->serialising_in_flight++;
2314         req->serialising = true;
2315     }
2316 
2317     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2318     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2319 }
2320 
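/*
 * Worked example of the rounding above: with align = 4096, a request at
 * offset = 5000 with bytes = 1000 yields overlap_offset = 4096 and
 * overlap_bytes = ROUND_UP(6000, 4096) - 4096 = 4096, i.e. the whole
 * aligned block containing the request is serialised.
 */
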
2321 /**
2322  * Round a region to cluster boundaries
2323  */
2324 void bdrv_round_to_clusters(BlockDriverState *bs,
2325                             int64_t sector_num, int nb_sectors,
2326                             int64_t *cluster_sector_num,
2327                             int *cluster_nb_sectors)
2328 {
2329     BlockDriverInfo bdi;
2330 
2331     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2332         *cluster_sector_num = sector_num;
2333         *cluster_nb_sectors = nb_sectors;
2334     } else {
2335         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2336         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2337         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2338                                             nb_sectors, c);
2339     }
2340 }
2341 
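/*
 * Worked example: with a 64k cluster size, c = 128 sectors, so a request
 * at sector_num = 150 with nb_sectors = 10 is widened to
 * cluster_sector_num = 128 and
 * cluster_nb_sectors = QEMU_ALIGN_UP(150 - 128 + 10, 128) = 128:
 * exactly the one cluster that contains it.
 */
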
2342 static int bdrv_get_cluster_size(BlockDriverState *bs)
2343 {
2344     BlockDriverInfo bdi;
2345     int ret;
2346 
2347     ret = bdrv_get_info(bs, &bdi);
2348     if (ret < 0 || bdi.cluster_size == 0) {
2349         return bs->request_alignment;
2350     } else {
2351         return bdi.cluster_size;
2352     }
2353 }
2354 
2355 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2356                                      int64_t offset, unsigned int bytes)
2357 {
2358     /*        aaaa   bbbb */
2359     if (offset >= req->overlap_offset + req->overlap_bytes) {
2360         return false;
2361     }
2362     /* bbbb   aaaa        */
2363     if (req->overlap_offset >= offset + bytes) {
2364         return false;
2365     }
2366     return true;
2367 }
2368 
2369 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2370 {
2371     BlockDriverState *bs = self->bs;
2372     BdrvTrackedRequest *req;
2373     bool retry;
2374     bool waited = false;
2375 
2376     if (!bs->serialising_in_flight) {
2377         return false;
2378     }
2379 
2380     do {
2381         retry = false;
2382         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2383             if (req == self || (!req->serialising && !self->serialising)) {
2384                 continue;
2385             }
2386             if (tracked_request_overlaps(req, self->overlap_offset,
2387                                          self->overlap_bytes))
2388             {
2389                 /* Hitting this means there was a reentrant request, for
2390                  * example, a block driver issuing nested requests.  This must
2391                  * never happen since it means deadlock.
2392                  */
2393                 assert(qemu_coroutine_self() != req->co);
2394 
2395                 /* If the request is already (indirectly) waiting for us, or
2396                  * will wait for us as soon as it wakes up, then just go on
2397                  * (instead of producing a deadlock in the former case). */
2398                 if (!req->waiting_for) {
2399                     self->waiting_for = req;
2400                     qemu_co_queue_wait(&req->wait_queue);
2401                     self->waiting_for = NULL;
2402                     retry = true;
2403                     waited = true;
2404                     break;
2405                 }
2406             }
2407         }
2408     } while (retry);
2409 
2410     return waited;
2411 }
2412 
2413 /*
2414  * Return values:
2415  * 0        - success
2416  * -EINVAL  - backing format specified, but no file
2417  * -ENOSPC  - can't update the backing file because no space is left in the
2418  *            image file header
2419  * -ENOTSUP - format driver doesn't support changing the backing file
2420  */
2421 int bdrv_change_backing_file(BlockDriverState *bs,
2422     const char *backing_file, const char *backing_fmt)
2423 {
2424     BlockDriver *drv = bs->drv;
2425     int ret;
2426 
2427     /* Backing file format doesn't make sense without a backing file */
2428     if (backing_fmt && !backing_file) {
2429         return -EINVAL;
2430     }
2431 
2432     if (drv->bdrv_change_backing_file != NULL) {
2433         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2434     } else {
2435         ret = -ENOTSUP;
2436     }
2437 
2438     if (ret == 0) {
2439         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2440         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2441     }
2442     return ret;
2443 }
2444 
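/*
 * Call sketch (assumes 'bs' is writable and its format driver implements
 * .bdrv_change_backing_file; the file names are examples only):
 *
 *     ret = bdrv_change_backing_file(bs, "base.qcow2", "qcow2");
 *     if (ret == -ENOTSUP) {
 *         // the format driver cannot rewrite its backing-file header
 *     }
 */
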
2445 /*
2446  * Finds the image layer in the chain that has 'bs' as its backing file.
2447  *
2448  * active is the current topmost image.
2449  *
2450  * Returns NULL if bs is not found in active's image chain,
2451  * or if active == bs.
2452  */
2453 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2454                                     BlockDriverState *bs)
2455 {
2456     BlockDriverState *overlay = NULL;
2457     BlockDriverState *intermediate;
2458 
2459     assert(active != NULL);
2460     assert(bs != NULL);
2461 
2462     /* if bs is the same as active, then by definition it has no overlay
2463      */
2464     if (active == bs) {
2465         return NULL;
2466     }
2467 
2468     intermediate = active;
2469     while (intermediate->backing_hd) {
2470         if (intermediate->backing_hd == bs) {
2471             overlay = intermediate;
2472             break;
2473         }
2474         intermediate = intermediate->backing_hd;
2475     }
2476 
2477     return overlay;
2478 }
2479 
2480 typedef struct BlkIntermediateStates {
2481     BlockDriverState *bs;
2482     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2483 } BlkIntermediateStates;
2484 
2485 
2486 /*
2487  * Drops images above 'base' up to and including 'top', and sets the image
2488  * above 'top' to have base as its backing file.
2489  *
2490  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2491  * information in 'bs' can be properly updated.
2492  *
2493  * E.g., this will convert the following chain:
2494  * bottom <- base <- intermediate <- top <- active
2495  *
2496  * to
2497  *
2498  * bottom <- base <- active
2499  *
2500  * It is allowed for bottom==base, in which case it converts:
2501  *
2502  * base <- intermediate <- top <- active
2503  *
2504  * to
2505  *
2506  * base <- active
2507  *
2508  * Error conditions:
2509  *  if active == top, that is considered an error
2510  *
2511  */
2512 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2513                            BlockDriverState *base)
2514 {
2515     BlockDriverState *intermediate;
2516     BlockDriverState *base_bs = NULL;
2517     BlockDriverState *new_top_bs = NULL;
2518     BlkIntermediateStates *intermediate_state, *next;
2519     int ret = -EIO;
2520 
2521     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2522     QSIMPLEQ_INIT(&states_to_delete);
2523 
2524     if (!top->drv || !base->drv) {
2525         goto exit;
2526     }
2527 
2528     new_top_bs = bdrv_find_overlay(active, top);
2529 
2530     if (new_top_bs == NULL) {
2531         /* we could not find the image above 'top', this is an error */
2532         goto exit;
2533     }
2534 
2535     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2536      * to do, no intermediate images */
2537     if (new_top_bs->backing_hd == base) {
2538         ret = 0;
2539         goto exit;
2540     }
2541 
2542     intermediate = top;
2543 
2544     /* now we will go down through the list, and add each BDS we find
2545      * into our deletion queue, until we hit the 'base'
2546      */
2547     while (intermediate) {
2548         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2549         intermediate_state->bs = intermediate;
2550         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2551 
2552         if (intermediate->backing_hd == base) {
2553             base_bs = intermediate->backing_hd;
2554             break;
2555         }
2556         intermediate = intermediate->backing_hd;
2557     }
2558     if (base_bs == NULL) {
2559         /* something went wrong: we did not end at the base. Safely
2560          * unravel everything, and exit with an error */
2561         goto exit;
2562     }
2563 
2564     /* success - we can delete the intermediate states, and link top->base */
2565     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2566                                    base_bs->drv ? base_bs->drv->format_name : "");
2567     if (ret) {
2568         goto exit;
2569     }
2570     new_top_bs->backing_hd = base_bs;
2571 
2572     bdrv_refresh_limits(new_top_bs);
2573 
2574     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2575         /* so that bdrv_close() does not recursively close the chain */
2576         intermediate_state->bs->backing_hd = NULL;
2577         bdrv_unref(intermediate_state->bs);
2578     }
2579     ret = 0;
2580 
2581 exit:
2582     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2583         g_free(intermediate_state);
2584     }
2585     return ret;
2586 }
2587 
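/*
 * Call sketch for the chain pictured above
 * (bottom <- base <- intermediate <- top <- active):
 *
 *     ret = bdrv_drop_intermediate(active, top, base);
 *     // on success the chain is: bottom <- base <- active
 */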
2588 
2589 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2590                                    size_t size)
2591 {
2592     int64_t len;
2593 
2594     if (size > INT_MAX) {
2595         return -EIO;
2596     }
2597 
2598     if (!bdrv_is_inserted(bs)) {
2599         return -ENOMEDIUM;
2600     }
2601     if (bs->growable) {
2602         return 0;
2603     }
2604     len = bdrv_getlength(bs);
2605     if (offset < 0) {
2606         return -EIO;
2607     }
2608     if ((offset > len) || (len - offset < size)) {
2609         return -EIO;
2610     }
2611 
2612     return 0;
2613 }
2614 
2615 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2616                               int nb_sectors)
2617 {
2618     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2619         return -EIO;
2620     }
2621 
2622     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2623                                    nb_sectors * BDRV_SECTOR_SIZE);
2624 }
2625 
2626 typedef struct RwCo {
2627     BlockDriverState *bs;
2628     int64_t offset;
2629     QEMUIOVector *qiov;
2630     bool is_write;
2631     int ret;
2632     BdrvRequestFlags flags;
2633 } RwCo;
2634 
2635 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2636 {
2637     RwCo *rwco = opaque;
2638 
2639     if (!rwco->is_write) {
2640         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2641                                       rwco->qiov->size, rwco->qiov,
2642                                       rwco->flags);
2643     } else {
2644         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2645                                        rwco->qiov->size, rwco->qiov,
2646                                        rwco->flags);
2647     }
2648 }
2649 
2650 /*
2651  * Process a vectored synchronous request using coroutines
2652  */
2653 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2654                         QEMUIOVector *qiov, bool is_write,
2655                         BdrvRequestFlags flags)
2656 {
2657     Coroutine *co;
2658     RwCo rwco = {
2659         .bs = bs,
2660         .offset = offset,
2661         .qiov = qiov,
2662         .is_write = is_write,
2663         .ret = NOT_DONE,
2664         .flags = flags,
2665     };
2666 
2667     /**
2668      * In the synchronous call context the vcpu is blocked, so the
2669      * throttling timers will not fire; therefore I/O throttling has to be
2670      * disabled here if it has been enabled.
2671      */
2672     if (bs->io_limits_enabled) {
2673         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2674                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2675         bdrv_io_limits_disable(bs);
2676     }
2677 
2678     if (qemu_in_coroutine()) {
2679         /* Fast-path if already in coroutine context */
2680         bdrv_rw_co_entry(&rwco);
2681     } else {
2682         co = qemu_coroutine_create(bdrv_rw_co_entry);
2683         qemu_coroutine_enter(co, &rwco);
2684         while (rwco.ret == NOT_DONE) {
2685             qemu_aio_wait();
2686         }
2687     }
2688     return rwco.ret;
2689 }
2690 
2691 /*
2692  * Process a synchronous request using coroutines
2693  */
2694 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2695                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2696 {
2697     QEMUIOVector qiov;
2698     struct iovec iov = {
2699         .iov_base = (void *)buf,
2700         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2701     };
2702 
2703     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2704         return -EINVAL;
2705     }
2706 
2707     qemu_iovec_init_external(&qiov, &iov, 1);
2708     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2709                         &qiov, is_write, flags);
2710 }
2711 
2712 /* return < 0 if error. See bdrv_write() for the return codes */
2713 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2714               uint8_t *buf, int nb_sectors)
2715 {
2716     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2717 }
2718 
2719 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2720 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2721                           uint8_t *buf, int nb_sectors)
2722 {
2723     bool enabled;
2724     int ret;
2725 
2726     enabled = bs->io_limits_enabled;
2727     bs->io_limits_enabled = false;
2728     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2729     bs->io_limits_enabled = enabled;
2730     return ret;
2731 }
2732 
2733 /* Return < 0 if error. Important errors are:
2734   -EIO         generic I/O error (may happen for all errors)
2735   -ENOMEDIUM   No media inserted.
2736   -EINVAL      Invalid sector number or nb_sectors
2737   -EACCES      Trying to write a read-only device
2738 */
2739 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2740                const uint8_t *buf, int nb_sectors)
2741 {
2742     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2743 }
2744 
2745 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2746                       int nb_sectors, BdrvRequestFlags flags)
2747 {
2748     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2749                       BDRV_REQ_ZERO_WRITE | flags);
2750 }
2751 
2752 /*
2753  * Completely zero out a block device with the help of bdrv_write_zeroes.
2754  * The operation is sped up by checking the block status and only writing
2755  * zeroes to the device if they currently do not return zeroes. Optional
2756  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2757  *
2758  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2759  */
2760 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2761 {
2762     int64_t target_size;
2763     int64_t ret, nb_sectors, sector_num = 0;
2764     int n;
2765 
2766     target_size = bdrv_getlength(bs);
2767     if (target_size < 0) {
2768         return target_size;
2769     }
2770     target_size /= BDRV_SECTOR_SIZE;
2771 
2772     for (;;) {
2773         nb_sectors = target_size - sector_num;
2774         if (nb_sectors <= 0) {
2775             return 0;
2776         }
2777         if (nb_sectors > INT_MAX) {
2778             nb_sectors = INT_MAX;
2779         }
2780         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2781         if (ret < 0) {
2782             error_report("error getting block status at sector %" PRId64 ": %s",
2783                          sector_num, strerror(-ret));
2784             return ret;
2785         }
2786         if (ret & BDRV_BLOCK_ZERO) {
2787             sector_num += n;
2788             continue;
2789         }
2790         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2791         if (ret < 0) {
2792             error_report("error writing zeroes at sector %" PRId64 ": %s",
2793                          sector_num, strerror(-ret));
2794             return ret;
2795         }
2796         sector_num += n;
2797     }
2798 }
2799 
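/*
 * Usage sketch (assumes 'bs' is writable; whether BDRV_REQ_MAY_UNMAP is
 * appropriate depends on the caller's needs):
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         // see bdrv_write() for the error codes
 *     }
 */
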
2800 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2801 {
2802     QEMUIOVector qiov;
2803     struct iovec iov = {
2804         .iov_base = (void *)buf,
2805         .iov_len = bytes,
2806     };
2807     int ret;
2808 
2809     if (bytes < 0) {
2810         return -EINVAL;
2811     }
2812 
2813     qemu_iovec_init_external(&qiov, &iov, 1);
2814     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2815     if (ret < 0) {
2816         return ret;
2817     }
2818 
2819     return bytes;
2820 }
2821 
2822 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2823 {
2824     int ret;
2825 
2826     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2827     if (ret < 0) {
2828         return ret;
2829     }
2830 
2831     return qiov->size;
2832 }
2833 
2834 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2835                 const void *buf, int bytes)
2836 {
2837     QEMUIOVector qiov;
2838     struct iovec iov = {
2839         .iov_base   = (void *) buf,
2840         .iov_len    = bytes,
2841     };
2842 
2843     if (bytes < 0) {
2844         return -EINVAL;
2845     }
2846 
2847     qemu_iovec_init_external(&qiov, &iov, 1);
2848     return bdrv_pwritev(bs, offset, &qiov);
2849 }
2850 
2851 /*
2852  * Writes to the file and ensures that no writes are reordered across this
2853  * request (acts as a barrier)
2854  *
2855  * Returns 0 on success, -errno in error cases.
2856  */
2857 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2858     const void *buf, int count)
2859 {
2860     int ret;
2861 
2862     ret = bdrv_pwrite(bs, offset, buf, count);
2863     if (ret < 0) {
2864         return ret;
2865     }
2866 
2867     /* No flush needed for cache modes that already do it */
2868     if (bs->enable_write_cache) {
2869         bdrv_flush(bs);
2870     }
2871 
2872     return 0;
2873 }
2874 
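/*
 * Typical use of the barrier above: a metadata update that must reach the
 * disk before later writes may depend on it (the 'header' variable here is
 * hypothetical):
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 */
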
2875 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2876         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2877 {
2878     /* Perform I/O through a temporary buffer so that users who scribble over
2879      * their read buffer while the operation is in progress do not end up
2880      * modifying the image file.  This is critical for zero-copy guest I/O
2881      * where anything might happen inside guest memory.
2882      */
2883     void *bounce_buffer;
2884 
2885     BlockDriver *drv = bs->drv;
2886     struct iovec iov;
2887     QEMUIOVector bounce_qiov;
2888     int64_t cluster_sector_num;
2889     int cluster_nb_sectors;
2890     size_t skip_bytes;
2891     int ret;
2892 
2893     /* Cover the entire cluster so no additional backing file I/O is
2894      * required when allocating a cluster in the image file.
2895      */
2896     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2897                            &cluster_sector_num, &cluster_nb_sectors);
2898 
2899     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2900                                    cluster_sector_num, cluster_nb_sectors);
2901 
2902     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2903     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2904     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2905 
2906     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2907                              &bounce_qiov);
2908     if (ret < 0) {
2909         goto err;
2910     }
2911 
2912     if (drv->bdrv_co_write_zeroes &&
2913         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2914         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2915                                       cluster_nb_sectors, 0);
2916     } else {
2917         /* This does not change the data on the disk, so it is not
2918          * necessary to flush even in cache=writethrough mode.
2919          */
2920         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2921                                   &bounce_qiov);
2922     }
2923 
2924     if (ret < 0) {
2925         /* It might be okay to ignore write errors for guest requests.  If this
2926          * is a deliberate copy-on-read then we don't want to ignore the error.
2927          * Simply report it in all cases.
2928          */
2929         goto err;
2930     }
2931 
2932     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2933     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2934                         nb_sectors * BDRV_SECTOR_SIZE);
2935 
2936 err:
2937     qemu_vfree(bounce_buffer);
2938     return ret;
2939 }
2940 
2941 /*
2942  * Forwards an already correctly aligned request to the BlockDriver. This
2943  * handles copy on read and zeroing after EOF; any other features must be
2944  * implemented by the caller.
2945  */
2946 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2947     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2948     int64_t align, QEMUIOVector *qiov, int flags)
2949 {
2950     BlockDriver *drv = bs->drv;
2951     int ret;
2952 
2953     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2954     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2955 
2956     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2957     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2958 
2959     /* Handle Copy on Read and associated serialisation */
2960     if (flags & BDRV_REQ_COPY_ON_READ) {
2961         /* If we touch the same cluster it counts as an overlap.  This
2962          * guarantees that allocating writes will be serialized and not race
2963          * with each other for the same cluster.  For example, in copy-on-read
2964          * it ensures that the CoR read and write operations are atomic and
2965          * guest writes cannot interleave between them. */
2966         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2967     }
2968 
2969     wait_serialising_requests(req);
2970 
2971     if (flags & BDRV_REQ_COPY_ON_READ) {
2972         int pnum;
2973 
2974         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2975         if (ret < 0) {
2976             goto out;
2977         }
2978 
2979         if (!ret || pnum != nb_sectors) {
2980             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2981             goto out;
2982         }
2983     }
2984 
2985     /* Forward the request to the BlockDriver */
2986     if (!(bs->zero_beyond_eof && bs->growable)) {
2987         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2988     } else {
2989         /* Read zeros after EOF of growable BDSes */
2990         int64_t len, total_sectors, max_nb_sectors;
2991 
2992         len = bdrv_getlength(bs);
2993         if (len < 0) {
2994             ret = len;
2995             goto out;
2996         }
2997 
2998         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2999         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3000                                   align >> BDRV_SECTOR_BITS);
3001         if (max_nb_sectors > 0) {
3002             ret = drv->bdrv_co_readv(bs, sector_num,
3003                                      MIN(nb_sectors, max_nb_sectors), qiov);
3004         } else {
3005             ret = 0;
3006         }
3007 
3008         /* Reading beyond end of file is supposed to produce zeroes */
3009         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3010             uint64_t offset = MAX(0, total_sectors - sector_num);
3011             uint64_t bytes = (sector_num + nb_sectors - offset) *
3012                               BDRV_SECTOR_SIZE;
3013             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3014         }
3015     }
3016 
3017 out:
3018     return ret;
3019 }
3020 
3021 /*
3022  * Handle a read request in coroutine context
3023  */
3024 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3025     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3026     BdrvRequestFlags flags)
3027 {
3028     BlockDriver *drv = bs->drv;
3029     BdrvTrackedRequest req;
3030 
3031     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3032     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3033     uint8_t *head_buf = NULL;
3034     uint8_t *tail_buf = NULL;
3035     QEMUIOVector local_qiov;
3036     bool use_local_qiov = false;
3037     int ret;
3038 
3039     if (!drv) {
3040         return -ENOMEDIUM;
3041     }
3042     if (bdrv_check_byte_request(bs, offset, bytes)) {
3043         return -EIO;
3044     }
3045 
3046     if (bs->copy_on_read) {
3047         flags |= BDRV_REQ_COPY_ON_READ;
3048     }
3049 
3050     /* throttling disk I/O */
3051     if (bs->io_limits_enabled) {
3052         bdrv_io_limits_intercept(bs, bytes, false);
3053     }
3054 
3055     /* Align read if necessary by padding qiov */
3056     if (offset & (align - 1)) {
3057         head_buf = qemu_blockalign(bs, align);
3058         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3059         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3060         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3061         use_local_qiov = true;
3062 
3063         bytes += offset & (align - 1);
3064         offset = offset & ~(align - 1);
3065     }
3066 
3067     if ((offset + bytes) & (align - 1)) {
3068         if (!use_local_qiov) {
3069             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3070             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3071             use_local_qiov = true;
3072         }
3073         tail_buf = qemu_blockalign(bs, align);
3074         qemu_iovec_add(&local_qiov, tail_buf,
3075                        align - ((offset + bytes) & (align - 1)));
3076 
3077         bytes = ROUND_UP(bytes, align);
3078     }
3079 
3080     tracked_request_begin(&req, bs, offset, bytes, false);
3081     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3082                               use_local_qiov ? &local_qiov : qiov,
3083                               flags);
3084     tracked_request_end(&req);
3085 
3086     if (use_local_qiov) {
3087         qemu_iovec_destroy(&local_qiov);
3088         qemu_vfree(head_buf);
3089         qemu_vfree(tail_buf);
3090     }
3091 
3092     return ret;
3093 }
3094 
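/*
 * Padding example for the alignment code above: with align = 4096, a
 * 512-byte read at offset 6144 is turned into a 4096-byte request at
 * offset 4096, built from 2048 head-buffer bytes, the caller's 512 bytes,
 * and 1536 tail-buffer bytes.
 */
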
3095 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3096     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3097     BdrvRequestFlags flags)
3098 {
3099     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3100         return -EINVAL;
3101     }
3102 
3103     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3104                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3105 }
3106 
3107 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3108     int nb_sectors, QEMUIOVector *qiov)
3109 {
3110     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3111 
3112     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3113 }
3114 
3115 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3116     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3117 {
3118     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3119 
3120     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3121                             BDRV_REQ_COPY_ON_READ);
3122 }
3123 
3124 /* if no limit is specified in the BlockLimits, use a default
3125  * of 32768 512-byte sectors (16 MiB) per request.
3126  */
3127 #define MAX_WRITE_ZEROES_DEFAULT 32768
3128 
3129 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3130     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3131 {
3132     BlockDriver *drv = bs->drv;
3133     QEMUIOVector qiov;
3134     struct iovec iov = {0};
3135     int ret = 0;
3136 
3137     int max_write_zeroes = bs->bl.max_write_zeroes ?
3138                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3139 
3140     while (nb_sectors > 0 && !ret) {
3141         int num = nb_sectors;
3142 
3143         /* Align request.  Block drivers can expect the "bulk" of the request
3144          * to be aligned.
3145          */
3146         if (bs->bl.write_zeroes_alignment
3147             && num > bs->bl.write_zeroes_alignment) {
3148             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3149                 /* Make a small request up to the first aligned sector.  */
3150                 num = bs->bl.write_zeroes_alignment;
3151                 num -= sector_num % bs->bl.write_zeroes_alignment;
3152             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3153                 /* Shorten the request to the last aligned sector.  num cannot
3154                  * underflow because num > bs->bl.write_zeroes_alignment.
3155                  */
3156                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3157             }
3158         }
3159 
3160         /* limit request size */
3161         if (num > max_write_zeroes) {
3162             num = max_write_zeroes;
3163         }
3164 
3165         ret = -ENOTSUP;
3166         /* First try the efficient write zeroes operation */
3167         if (drv->bdrv_co_write_zeroes) {
3168             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3169         }
3170 
3171         if (ret == -ENOTSUP) {
3172             /* Fall back to bounce buffer if write zeroes is unsupported */
3173             iov.iov_len = num * BDRV_SECTOR_SIZE;
3174             if (iov.iov_base == NULL) {
3175                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3176                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3177             }
3178             qemu_iovec_init_external(&qiov, &iov, 1);
3179 
3180             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3181 
3182             /* Keep the bounce buffer around if it is big enough for
3183              * all future requests.
3184              */
3185             if (num < max_write_zeroes) {
3186                 qemu_vfree(iov.iov_base);
3187                 iov.iov_base = NULL;
3188             }
3189         }
3190 
3191         sector_num += num;
3192         nb_sectors -= num;
3193     }
3194 
3195     qemu_vfree(iov.iov_base);
3196     return ret;
3197 }
3198 
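/*
 * Worked example of the alignment logic above: with
 * write_zeroes_alignment = 8, a request at sector_num = 5 with
 * nb_sectors = 20 is issued as three driver calls of 3, 16 and 1 sectors,
 * so that the middle (bulk) call is fully aligned.
 */
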
3199 /*
3200  * Forwards an already correctly aligned write request to the BlockDriver.
3201  */
3202 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3203     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3204     QEMUIOVector *qiov, int flags)
3205 {
3206     BlockDriver *drv = bs->drv;
3207     bool waited;
3208     int ret;
3209 
3210     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3211     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3212 
3213     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3214     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3215 
3216     waited = wait_serialising_requests(req);
3217     assert(!waited || !req->serialising);
3218     assert(req->overlap_offset <= offset);
3219     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3220 
3221     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3222 
3223     if (ret < 0) {
3224         /* Do nothing, write notifier decided to fail this request */
3225     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3226         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3227         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3228     } else {
3229         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3230         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3231     }
3232     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3233 
3234     if (ret == 0 && !bs->enable_write_cache) {
3235         ret = bdrv_co_flush(bs);
3236     }
3237 
3238     bdrv_set_dirty(bs, sector_num, nb_sectors);
3239 
3240     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3241         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3242     }
3243     if (bs->growable && ret >= 0) {
3244         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3245     }
3246 
3247     return ret;
3248 }
3249 
3250 /*
3251  * Handle a write request in coroutine context
3252  */
3253 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3254     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3255     BdrvRequestFlags flags)
3256 {
3257     BdrvTrackedRequest req;
3258     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3259     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3260     uint8_t *head_buf = NULL;
3261     uint8_t *tail_buf = NULL;
3262     QEMUIOVector local_qiov;
3263     bool use_local_qiov = false;
3264     int ret;
3265 
3266     if (!bs->drv) {
3267         return -ENOMEDIUM;
3268     }
3269     if (bs->read_only) {
3270         return -EACCES;
3271     }
3272     if (bdrv_check_byte_request(bs, offset, bytes)) {
3273         return -EIO;
3274     }
3275 
3276     /* throttling disk I/O */
3277     if (bs->io_limits_enabled) {
3278         bdrv_io_limits_intercept(bs, bytes, true);
3279     }
3280 
3281     /*
3282      * Align write if necessary by performing a read-modify-write cycle.
3283      * Pad qiov with the read parts and be sure to have a tracked request not
3284      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3285      */
3286     tracked_request_begin(&req, bs, offset, bytes, true);
3287 
3288     if (offset & (align - 1)) {
3289         QEMUIOVector head_qiov;
3290         struct iovec head_iov;
3291 
3292         mark_request_serialising(&req, align);
3293         wait_serialising_requests(&req);
3294 
3295         head_buf = qemu_blockalign(bs, align);
3296         head_iov = (struct iovec) {
3297             .iov_base   = head_buf,
3298             .iov_len    = align,
3299         };
3300         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3301 
3302         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3303         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3304                                   align, &head_qiov, 0);
3305         if (ret < 0) {
3306             goto fail;
3307         }
3308         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3309 
3310         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3311         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3312         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3313         use_local_qiov = true;
3314 
3315         bytes += offset & (align - 1);
3316         offset = offset & ~(align - 1);
3317     }
3318 
3319     if ((offset + bytes) & (align - 1)) {
3320         QEMUIOVector tail_qiov;
3321         struct iovec tail_iov;
3322         size_t tail_bytes;
3323         bool waited;
3324 
3325         mark_request_serialising(&req, align);
3326         waited = wait_serialising_requests(&req);
3327         assert(!waited || !use_local_qiov);
3328 
3329         tail_buf = qemu_blockalign(bs, align);
3330         tail_iov = (struct iovec) {
3331             .iov_base   = tail_buf,
3332             .iov_len    = align,
3333         };
3334         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3335 
3336         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3337         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3338                                   align, &tail_qiov, 0);
3339         if (ret < 0) {
3340             goto fail;
3341         }
3342         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3343 
3344         if (!use_local_qiov) {
3345             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3346             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3347             use_local_qiov = true;
3348         }
3349 
3350         tail_bytes = (offset + bytes) & (align - 1);
3351         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3352 
3353         bytes = ROUND_UP(bytes, align);
3354     }
3355 
3356     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3357                                use_local_qiov ? &local_qiov : qiov,
3358                                flags);
3359 
3360 fail:
3361     tracked_request_end(&req);
3362 
3363     if (use_local_qiov) {
3364         qemu_iovec_destroy(&local_qiov);
3365     }
3366     qemu_vfree(head_buf);
3367     qemu_vfree(tail_buf);
3368 
3369     return ret;
3370 }
3371 
3372 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3373     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3374     BdrvRequestFlags flags)
3375 {
3376     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3377         return -EINVAL;
3378     }
3379 
3380     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3381                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3382 }
3383 
3384 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3385     int nb_sectors, QEMUIOVector *qiov)
3386 {
3387     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3388 
3389     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3390 }
3391 
3392 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3393                                       int64_t sector_num, int nb_sectors,
3394                                       BdrvRequestFlags flags)
3395 {
3396     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3397 
3398     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3399         flags &= ~BDRV_REQ_MAY_UNMAP;
3400     }
3401 
3402     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3403                              BDRV_REQ_ZERO_WRITE | flags);
3404 }
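
/*
 * Minimal coroutine-context sketch (hypothetical helper): zero a sector
 * range and let the driver unmap it where possible.  As shown above,
 * BDRV_REQ_MAY_UNMAP is silently dropped when the image was not opened
 * with BDRV_O_UNMAP.
 */
static int coroutine_fn example_zero_range(BlockDriverState *bs,
                                           int64_t sector_num,
                                           int nb_sectors)
{
    return bdrv_co_write_zeroes(bs, sector_num, nb_sectors,
                                BDRV_REQ_MAY_UNMAP);
}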
3405 
3406 /**
3407  * Truncate file to 'offset' bytes (needed only for file protocols)
3408  */
3409 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3410 {
3411     BlockDriver *drv = bs->drv;
3412     int ret;
3413     if (!drv)
3414         return -ENOMEDIUM;
3415     if (!drv->bdrv_truncate)
3416         return -ENOTSUP;
3417     if (bs->read_only)
3418         return -EACCES;
3419     if (bdrv_in_use(bs))
3420         return -EBUSY;
3421     ret = drv->bdrv_truncate(bs, offset);
3422     if (ret == 0) {
3423         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3424         bdrv_dev_resize_cb(bs);
3425     }
3426     return ret;
3427 }
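
/*
 * Usage sketch (assumed caller code, not in the original source): grow
 * an image to a sector-aligned size, surfacing the error cases checked
 * above (-ENOMEDIUM, -ENOTSUP, -EACCES and -EBUSY).
 */
static int example_grow_image(BlockDriverState *bs, int64_t new_size)
{
    return bdrv_truncate(bs, ROUND_UP(new_size, BDRV_SECTOR_SIZE));
}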
3428 
3429 /**
3430  * Length of an allocated file in bytes. Sparse files are counted by their
3431  * actual allocated space. Return < 0 on error or if unknown.
3432  */
3433 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3434 {
3435     BlockDriver *drv = bs->drv;
3436     if (!drv) {
3437         return -ENOMEDIUM;
3438     }
3439     if (drv->bdrv_get_allocated_file_size) {
3440         return drv->bdrv_get_allocated_file_size(bs);
3441     }
3442     if (bs->file) {
3443         return bdrv_get_allocated_file_size(bs->file);
3444     }
3445     return -ENOTSUP;
3446 }
3447 
3448 /**
3449  * Length of a file in bytes. Return < 0 if error or unknown.
3450  */
3451 int64_t bdrv_getlength(BlockDriverState *bs)
3452 {
3453     BlockDriver *drv = bs->drv;
3454     if (!drv)
3455         return -ENOMEDIUM;
3456 
3457     if (drv->has_variable_length) {
3458         int ret = refresh_total_sectors(bs, bs->total_sectors);
3459         if (ret < 0) {
3460             return ret;
3461         }
3462     }
3463     return bs->total_sectors * BDRV_SECTOR_SIZE;
3464 }
3465 
3466 /* Return 0 as the number of sectors if no device is present or on error */
3467 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3468 {
3469     int64_t length;
3470     length = bdrv_getlength(bs);
3471     if (length < 0)
3472         length = 0;
3473     else
3474         length = length >> BDRV_SECTOR_BITS;
3475     *nb_sectors_ptr = length;
3476 }
3477 
3478 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3479                        BlockdevOnError on_write_error)
3480 {
3481     bs->on_read_error = on_read_error;
3482     bs->on_write_error = on_write_error;
3483 }
3484 
3485 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3486 {
3487     return is_read ? bs->on_read_error : bs->on_write_error;
3488 }
3489 
3490 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3491 {
3492     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3493 
3494     switch (on_err) {
3495     case BLOCKDEV_ON_ERROR_ENOSPC:
3496         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3497     case BLOCKDEV_ON_ERROR_STOP:
3498         return BDRV_ACTION_STOP;
3499     case BLOCKDEV_ON_ERROR_REPORT:
3500         return BDRV_ACTION_REPORT;
3501     case BLOCKDEV_ON_ERROR_IGNORE:
3502         return BDRV_ACTION_IGNORE;
3503     default:
3504         abort();
3505     }
3506 }
3507 
3508 /* This is done by device models because, while the block layer knows
3509  * about the error, it does not know whether an operation comes from
3510  * the device or the block layer (from a job, for example).
3511  */
3512 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3513                        bool is_read, int error)
3514 {
3515     assert(error >= 0);
3516     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3517     if (action == BDRV_ACTION_STOP) {
3518         vm_stop(RUN_STATE_IO_ERROR);
3519         bdrv_iostatus_set_err(bs, error);
3520     }
3521 }
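
/*
 * Sketch of how a device model is assumed to combine the two helpers
 * above: map an errno value to a policy decision, then act on it.  The
 * function name is hypothetical.
 */
static void example_handle_rw_error(BlockDriverState *bs, bool is_read,
                                    int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    /* BDRV_ACTION_IGNORE simply completes or retries the request;
     * BDRV_ACTION_STOP additionally halts the VM (see above). */
    bdrv_error_action(bs, action, is_read, error);
}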
3522 
3523 int bdrv_is_read_only(BlockDriverState *bs)
3524 {
3525     return bs->read_only;
3526 }
3527 
3528 int bdrv_is_sg(BlockDriverState *bs)
3529 {
3530     return bs->sg;
3531 }
3532 
3533 int bdrv_enable_write_cache(BlockDriverState *bs)
3534 {
3535     return bs->enable_write_cache;
3536 }
3537 
3538 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3539 {
3540     bs->enable_write_cache = wce;
3541 
3542     /* so a reopen() will preserve wce */
3543     if (wce) {
3544         bs->open_flags |= BDRV_O_CACHE_WB;
3545     } else {
3546         bs->open_flags &= ~BDRV_O_CACHE_WB;
3547     }
3548 }
3549 
3550 int bdrv_is_encrypted(BlockDriverState *bs)
3551 {
3552     if (bs->backing_hd && bs->backing_hd->encrypted)
3553         return 1;
3554     return bs->encrypted;
3555 }
3556 
3557 int bdrv_key_required(BlockDriverState *bs)
3558 {
3559     BlockDriverState *backing_hd = bs->backing_hd;
3560 
3561     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3562         return 1;
3563     return (bs->encrypted && !bs->valid_key);
3564 }
3565 
3566 int bdrv_set_key(BlockDriverState *bs, const char *key)
3567 {
3568     int ret;
3569     if (bs->backing_hd && bs->backing_hd->encrypted) {
3570         ret = bdrv_set_key(bs->backing_hd, key);
3571         if (ret < 0)
3572             return ret;
3573         if (!bs->encrypted)
3574             return 0;
3575     }
3576     if (!bs->encrypted) {
3577         return -EINVAL;
3578     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3579         return -ENOMEDIUM;
3580     }
3581     ret = bs->drv->bdrv_set_key(bs, key);
3582     if (ret < 0) {
3583         bs->valid_key = 0;
3584     } else if (!bs->valid_key) {
3585         bs->valid_key = 1;
3586         /* call the change callback now, we skipped it on open */
3587         bdrv_dev_change_media_cb(bs, true);
3588     }
3589     return ret;
3590 }
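
/*
 * Hypothetical sketch of the open-time key handshake: callers are
 * expected to test bdrv_key_required() first and only then supply a
 * key, covering the backing file as bdrv_set_key() does.
 */
static int example_unlock(BlockDriverState *bs, const char *key)
{
    if (!bdrv_key_required(bs)) {
        return 0;                  /* image is already usable */
    }
    return bdrv_set_key(bs, key);  /* < 0 leaves valid_key cleared */
}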
3591 
3592 const char *bdrv_get_format_name(BlockDriverState *bs)
3593 {
3594     return bs->drv ? bs->drv->format_name : NULL;
3595 }
3596 
3597 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3598                          void *opaque)
3599 {
3600     BlockDriver *drv;
3601 
3602     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3603         it(opaque, drv->format_name);
3604     }
3605 }
3606 
3607 /* Find a block backend (top-level BlockDriverState) by its device name */
3608 BlockDriverState *bdrv_find(const char *name)
3609 {
3610     BlockDriverState *bs;
3611 
3612     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3613         if (!strcmp(name, bs->device_name)) {
3614             return bs;
3615         }
3616     }
3617     return NULL;
3618 }
3619 
3620 /* Find a node in the graph of named BlockDriverStates by its node name */
3621 BlockDriverState *bdrv_find_node(const char *node_name)
3622 {
3623     BlockDriverState *bs;
3624 
3625     assert(node_name);
3626 
3627     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3628         if (!strcmp(node_name, bs->node_name)) {
3629             return bs;
3630         }
3631     }
3632     return NULL;
3633 }
3634 
3635 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3636 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3637 {
3638     BlockDeviceInfoList *list, *entry;
3639     BlockDriverState *bs;
3640 
3641     list = NULL;
3642     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3643         entry = g_malloc0(sizeof(*entry));
3644         entry->value = bdrv_block_device_info(bs);
3645         entry->next = list;
3646         list = entry;
3647     }
3648 
3649     return list;
3650 }
3651 
3652 BlockDriverState *bdrv_lookup_bs(const char *device,
3653                                  const char *node_name,
3654                                  Error **errp)
3655 {
3656     BlockDriverState *bs = NULL;
3657 
3658     if (device) {
3659         bs = bdrv_find(device);
3660 
3661         if (bs) {
3662             return bs;
3663         }
3664     }
3665 
3666     if (node_name) {
3667         bs = bdrv_find_node(node_name);
3668 
3669         if (bs) {
3670             return bs;
3671         }
3672     }
3673 
3674     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3675                      device ? device : "",
3676                      node_name ? node_name : "");
3677     return NULL;
3678 }
3679 
3680 BlockDriverState *bdrv_next(BlockDriverState *bs)
3681 {
3682     if (!bs) {
3683         return QTAILQ_FIRST(&bdrv_states);
3684     }
3685     return QTAILQ_NEXT(bs, device_list);
3686 }
3687 
3688 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3689 {
3690     BlockDriverState *bs;
3691 
3692     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3693         it(opaque, bs);
3694     }
3695 }
3696 
3697 const char *bdrv_get_device_name(BlockDriverState *bs)
3698 {
3699     return bs->device_name;
3700 }
3701 
3702 int bdrv_get_flags(BlockDriverState *bs)
3703 {
3704     return bs->open_flags;
3705 }
3706 
3707 int bdrv_flush_all(void)
3708 {
3709     BlockDriverState *bs;
3710     int result = 0;
3711 
3712     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3713         int ret = bdrv_flush(bs);
3714         if (ret < 0 && !result) {
3715             result = ret;
3716         }
3717     }
3718 
3719     return result;
3720 }
3721 
3722 int bdrv_has_zero_init_1(BlockDriverState *bs)
3723 {
3724     return 1;
3725 }
3726 
3727 int bdrv_has_zero_init(BlockDriverState *bs)
3728 {
3729     assert(bs->drv);
3730 
3731     /* If BS is a copy-on-write image, it is initialized to
3732        the contents of the base image, which may not be zeroes.  */
3733     if (bs->backing_hd) {
3734         return 0;
3735     }
3736     if (bs->drv->bdrv_has_zero_init) {
3737         return bs->drv->bdrv_has_zero_init(bs);
3738     }
3739 
3740     /* safe default */
3741     return 0;
3742 }
3743 
3744 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3745 {
3746     BlockDriverInfo bdi;
3747 
3748     if (bs->backing_hd) {
3749         return false;
3750     }
3751 
3752     if (bdrv_get_info(bs, &bdi) == 0) {
3753         return bdi.unallocated_blocks_are_zero;
3754     }
3755 
3756     return false;
3757 }
3758 
3759 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3760 {
3761     BlockDriverInfo bdi;
3762 
3763     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3764         return false;
3765     }
3766 
3767     if (bdrv_get_info(bs, &bdi) == 0) {
3768         return bdi.can_write_zeroes_with_unmap;
3769     }
3770 
3771     return false;
3772 }
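
/*
 * Decision sketch (assumed helper): an image-creation path could use
 * the predicates above to decide whether explicit zero writes can be
 * skipped entirely.
 */
static bool example_can_skip_zero_write(BlockDriverState *bs)
{
    /* Freshly created image that is guaranteed to read as zeroes */
    if (bdrv_has_zero_init(bs)) {
        return true;
    }
    /* Otherwise unallocated clusters must still read as zeroes */
    return bdrv_unallocated_blocks_are_zero(bs);
}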
3773 
3774 typedef struct BdrvCoGetBlockStatusData {
3775     BlockDriverState *bs;
3776     BlockDriverState *base;
3777     int64_t sector_num;
3778     int nb_sectors;
3779     int *pnum;
3780     int64_t ret;
3781     bool done;
3782 } BdrvCoGetBlockStatusData;
3783 
3784 /*
3785  * Returns the allocation status of the specified sectors as BDRV_BLOCK_*
3786  * flags. Drivers not implementing the functionality are assumed to not
3787  * support backing files, hence all their sectors are reported as allocated.
3788  *
3789  * If 'sector_num' is beyond the end of the disk image the return value is 0
3790  * and 'pnum' is set to 0.
3791  *
3792  * 'pnum' is set to the number of sectors (including and immediately following
3793  * the specified sector) that are known to be in the same
3794  * allocated/unallocated state.
3795  *
3796  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3797  * beyond the end of the disk image it will be clamped.
3798  */
3799 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3800                                                      int64_t sector_num,
3801                                                      int nb_sectors, int *pnum)
3802 {
3803     int64_t length;
3804     int64_t n;
3805     int64_t ret, ret2;
3806 
3807     length = bdrv_getlength(bs);
3808     if (length < 0) {
3809         return length;
3810     }
3811 
3812     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3813         *pnum = 0;
3814         return 0;
3815     }
3816 
3817     n = bs->total_sectors - sector_num;
3818     if (n < nb_sectors) {
3819         nb_sectors = n;
3820     }
3821 
3822     if (!bs->drv->bdrv_co_get_block_status) {
3823         *pnum = nb_sectors;
3824         ret = BDRV_BLOCK_DATA;
3825         if (bs->drv->protocol_name) {
3826             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3827         }
3828         return ret;
3829     }
3830 
3831     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3832     if (ret < 0) {
3833         *pnum = 0;
3834         return ret;
3835     }
3836 
3837     if (ret & BDRV_BLOCK_RAW) {
3838         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3839         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3840                                      *pnum, pnum);
3841     }
3842 
3843     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3844         if (bdrv_unallocated_blocks_are_zero(bs)) {
3845             ret |= BDRV_BLOCK_ZERO;
3846         } else if (bs->backing_hd) {
3847             BlockDriverState *bs2 = bs->backing_hd;
3848             int64_t length2 = bdrv_getlength(bs2);
3849             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3850                 ret |= BDRV_BLOCK_ZERO;
3851             }
3852         }
3853     }
3854 
3855     if (bs->file &&
3856         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3857         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3858         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3859                                         *pnum, pnum);
3860         if (ret2 >= 0) {
3861             /* Ignore errors.  This is just providing extra information, it
3862              * is useful but not necessary.
3863              */
3864             ret |= (ret2 & BDRV_BLOCK_ZERO);
3865         }
3866     }
3867 
3868     return ret;
3869 }
3870 
3871 /* Coroutine wrapper for bdrv_get_block_status() */
3872 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3873 {
3874     BdrvCoGetBlockStatusData *data = opaque;
3875     BlockDriverState *bs = data->bs;
3876 
3877     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3878                                          data->pnum);
3879     data->done = true;
3880 }
3881 
3882 /*
3883  * Synchronous wrapper around bdrv_co_get_block_status().
3884  *
3885  * See bdrv_co_get_block_status() for details.
3886  */
3887 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3888                               int nb_sectors, int *pnum)
3889 {
3890     Coroutine *co;
3891     BdrvCoGetBlockStatusData data = {
3892         .bs = bs,
3893         .sector_num = sector_num,
3894         .nb_sectors = nb_sectors,
3895         .pnum = pnum,
3896         .done = false,
3897     };
3898 
3899     if (qemu_in_coroutine()) {
3900         /* Fast-path if already in coroutine context */
3901         bdrv_get_block_status_co_entry(&data);
3902     } else {
3903         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3904         qemu_coroutine_enter(co, &data);
3905         while (!data.done) {
3906             qemu_aio_wait();
3907         }
3908     }
3909     return data.ret;
3910 }
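
/*
 * Illustrative scan loop (hypothetical helper): walk an image with
 * bdrv_get_block_status() and count the sectors that carry guest data.
 * The 65536-sector chunk size is an arbitrary choice.
 */
static int64_t example_count_data_sectors(BlockDriverState *bs)
{
    int64_t len = bdrv_getlength(bs);
    int64_t sector = 0, data = 0;

    if (len < 0) {
        return len;
    }
    len >>= BDRV_SECTOR_BITS;

    while (sector < len) {
        int pnum;
        int64_t ret = bdrv_get_block_status(bs, sector,
                                            MIN(len - sector, 65536),
                                            &pnum);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_DATA) {
            data += pnum;
        }
        sector += pnum;   /* pnum > 0 while inside the image */
    }
    return data;
}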
3911 
3912 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3913                                    int nb_sectors, int *pnum)
3914 {
3915     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3916     if (ret < 0) {
3917         return ret;
3918     }
3919     return
3920         (ret & BDRV_BLOCK_DATA) ||
3921         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3922 }
3923 
3924 /*
3925  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3926  *
3927  * Return true if the given sector is allocated in any image between
3928  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3929  * sector is allocated in any image of the chain.  Return false otherwise.
3930  *
3931  * 'pnum' is set to the number of sectors (including and immediately following
3932  * the specified sector) that are known to be in the same
3933  * allocated/unallocated state.
3934  *
3935  */
3936 int bdrv_is_allocated_above(BlockDriverState *top,
3937                             BlockDriverState *base,
3938                             int64_t sector_num,
3939                             int nb_sectors, int *pnum)
3940 {
3941     BlockDriverState *intermediate;
3942     int ret, n = nb_sectors;
3943 
3944     intermediate = top;
3945     while (intermediate && intermediate != base) {
3946         int pnum_inter;
3947         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3948                                 &pnum_inter);
3949         if (ret < 0) {
3950             return ret;
3951         } else if (ret) {
3952             *pnum = pnum_inter;
3953             return 1;
3954         }
3955 
3956         /*
3957          * [sector_num, nb_sectors] is unallocated on top but intermediate
3958          * might have
3959          *
3960          * [sector_num+x, nb_sectors] allocated.
3961          */
3962         if (n > pnum_inter &&
3963             (intermediate == top ||
3964              sector_num + pnum_inter < intermediate->total_sectors)) {
3965             n = pnum_inter;
3966         }
3967 
3968         intermediate = intermediate->backing_hd;
3969     }
3970 
3971     *pnum = n;
3972     return 0;
3973 }
3974 
3975 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3976 {
3977     if (bs->backing_hd && bs->backing_hd->encrypted)
3978         return bs->backing_file;
3979     else if (bs->encrypted)
3980         return bs->filename;
3981     else
3982         return NULL;
3983 }
3984 
3985 void bdrv_get_backing_filename(BlockDriverState *bs,
3986                                char *filename, int filename_size)
3987 {
3988     pstrcpy(filename, filename_size, bs->backing_file);
3989 }
3990 
3991 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3992                           const uint8_t *buf, int nb_sectors)
3993 {
3994     BlockDriver *drv = bs->drv;
3995     if (!drv)
3996         return -ENOMEDIUM;
3997     if (!drv->bdrv_write_compressed)
3998         return -ENOTSUP;
3999     if (bdrv_check_request(bs, sector_num, nb_sectors))
4000         return -EIO;
4001 
4002     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4003 
4004     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4005 }
4006 
4007 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4008 {
4009     BlockDriver *drv = bs->drv;
4010     if (!drv)
4011         return -ENOMEDIUM;
4012     if (!drv->bdrv_get_info)
4013         return -ENOTSUP;
4014     memset(bdi, 0, sizeof(*bdi));
4015     return drv->bdrv_get_info(bs, bdi);
4016 }
4017 
4018 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4019 {
4020     BlockDriver *drv = bs->drv;
4021     if (drv && drv->bdrv_get_specific_info) {
4022         return drv->bdrv_get_specific_info(bs);
4023     }
4024     return NULL;
4025 }
4026 
4027 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4028                       int64_t pos, int size)
4029 {
4030     QEMUIOVector qiov;
4031     struct iovec iov = {
4032         .iov_base   = (void *) buf,
4033         .iov_len    = size,
4034     };
4035 
4036     qemu_iovec_init_external(&qiov, &iov, 1);
4037     return bdrv_writev_vmstate(bs, &qiov, pos);
4038 }
4039 
4040 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4041 {
4042     BlockDriver *drv = bs->drv;
4043 
4044     if (!drv) {
4045         return -ENOMEDIUM;
4046     } else if (drv->bdrv_save_vmstate) {
4047         return drv->bdrv_save_vmstate(bs, qiov, pos);
4048     } else if (bs->file) {
4049         return bdrv_writev_vmstate(bs->file, qiov, pos);
4050     }
4051 
4052     return -ENOTSUP;
4053 }
4054 
4055 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4056                       int64_t pos, int size)
4057 {
4058     BlockDriver *drv = bs->drv;
4059     if (!drv)
4060         return -ENOMEDIUM;
4061     if (drv->bdrv_load_vmstate)
4062         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4063     if (bs->file)
4064         return bdrv_load_vmstate(bs->file, buf, pos, size);
4065     return -ENOTSUP;
4066 }
4067 
4068 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4069 {
4070     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4071         return;
4072     }
4073 
4074     bs->drv->bdrv_debug_event(bs, event);
4075 }
4076 
4077 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4078                           const char *tag)
4079 {
4080     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4081         bs = bs->file;
4082     }
4083 
4084     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4085         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4086     }
4087 
4088     return -ENOTSUP;
4089 }
4090 
4091 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4092 {
4093     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4094         bs = bs->file;
4095     }
4096 
4097     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4098         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4099     }
4100 
4101     return -ENOTSUP;
4102 }
4103 
4104 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4105 {
4106     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4107         bs = bs->file;
4108     }
4109 
4110     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4111         return bs->drv->bdrv_debug_resume(bs, tag);
4112     }
4113 
4114     return -ENOTSUP;
4115 }
4116 
4117 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4118 {
4119     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4120         bs = bs->file;
4121     }
4122 
4123     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4124         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4125     }
4126 
4127     return false;
4128 }
4129 
4130 int bdrv_is_snapshot(BlockDriverState *bs)
4131 {
4132     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4133 }
4134 
4135 /* backing_file can be relative or absolute, or a protocol.  If it is
4136  * relative, it must be relative to the chain.  So, passing in bs->filename
4137  * from a BDS as backing_file should not be done, as that may be relative to
4138  * the CWD rather than the chain. */
4139 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4140         const char *backing_file)
4141 {
4142     char *filename_full = NULL;
4143     char *backing_file_full = NULL;
4144     char *filename_tmp = NULL;
4145     int is_protocol = 0;
4146     BlockDriverState *curr_bs = NULL;
4147     BlockDriverState *retval = NULL;
4148 
4149     if (!bs || !bs->drv || !backing_file) {
4150         return NULL;
4151     }
4152 
4153     filename_full     = g_malloc(PATH_MAX);
4154     backing_file_full = g_malloc(PATH_MAX);
4155     filename_tmp      = g_malloc(PATH_MAX);
4156 
4157     is_protocol = path_has_protocol(backing_file);
4158 
4159     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4160 
4161         /* If either of the filename paths is actually a protocol, then
4162          * compare unmodified paths; otherwise make paths relative */
4163         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4164             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4165                 retval = curr_bs->backing_hd;
4166                 break;
4167             }
4168         } else {
4169             /* If not an absolute filename path, make it relative to the current
4170              * image's filename path */
4171             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4172                          backing_file);
4173 
4174             /* We are going to compare absolute pathnames */
4175             if (!realpath(filename_tmp, filename_full)) {
4176                 continue;
4177             }
4178 
4179             /* We need to make sure the backing filename we are comparing against
4180              * is relative to the current image filename (or absolute) */
4181             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4182                          curr_bs->backing_file);
4183 
4184             if (!realpath(filename_tmp, backing_file_full)) {
4185                 continue;
4186             }
4187 
4188             if (strcmp(backing_file_full, filename_full) == 0) {
4189                 retval = curr_bs->backing_hd;
4190                 break;
4191             }
4192         }
4193     }
4194 
4195     g_free(filename_full);
4196     g_free(backing_file_full);
4197     g_free(filename_tmp);
4198     return retval;
4199 }
4200 
4201 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4202 {
4203     if (!bs->drv) {
4204         return 0;
4205     }
4206 
4207     if (!bs->backing_hd) {
4208         return 0;
4209     }
4210 
4211     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4212 }
4213 
4214 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4215 {
4216     BlockDriverState *curr_bs = NULL;
4217 
4218     if (!bs) {
4219         return NULL;
4220     }
4221 
4222     curr_bs = bs;
4223 
4224     while (curr_bs->backing_hd) {
4225         curr_bs = curr_bs->backing_hd;
4226     }
4227     return curr_bs;
4228 }
4229 
4230 /**************************************************************/
4231 /* async I/Os */
4232 
4233 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4234                                  QEMUIOVector *qiov, int nb_sectors,
4235                                  BlockDriverCompletionFunc *cb, void *opaque)
4236 {
4237     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4238 
4239     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4240                                  cb, opaque, false);
4241 }
4242 
4243 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4244                                   QEMUIOVector *qiov, int nb_sectors,
4245                                   BlockDriverCompletionFunc *cb, void *opaque)
4246 {
4247     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4248 
4249     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4250                                  cb, opaque, true);
4251 }
4252 
4253 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4254         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4255         BlockDriverCompletionFunc *cb, void *opaque)
4256 {
4257     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4258 
4259     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4260                                  BDRV_REQ_ZERO_WRITE | flags,
4261                                  cb, opaque, true);
4262 }
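
/*
 * Assumed usage pattern for the AIO entry points above: submit a read
 * and collect the result in a completion callback.  'ExampleCtx' and
 * both function names are hypothetical.
 */
typedef struct ExampleCtx {
    bool done;
    int ret;
} ExampleCtx;

static void example_aio_cb(void *opaque, int ret)
{
    ExampleCtx *ctx = opaque;

    ctx->ret = ret;     /* 0 on success, -errno on failure */
    ctx->done = true;
}

static BlockDriverAIOCB *example_submit_read(BlockDriverState *bs,
                                             int64_t sector_num,
                                             QEMUIOVector *qiov,
                                             int nb_sectors,
                                             ExampleCtx *ctx)
{
    ctx->done = false;
    return bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                          example_aio_cb, ctx);
}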
4263 
4264 
4265 typedef struct MultiwriteCB {
4266     int error;
4267     int num_requests;
4268     int num_callbacks;
4269     struct {
4270         BlockDriverCompletionFunc *cb;
4271         void *opaque;
4272         QEMUIOVector *free_qiov;
4273     } callbacks[];
4274 } MultiwriteCB;
4275 
4276 static void multiwrite_user_cb(MultiwriteCB *mcb)
4277 {
4278     int i;
4279 
4280     for (i = 0; i < mcb->num_callbacks; i++) {
4281         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4282         if (mcb->callbacks[i].free_qiov) {
4283             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4284         }
4285         g_free(mcb->callbacks[i].free_qiov);
4286     }
4287 }
4288 
4289 static void multiwrite_cb(void *opaque, int ret)
4290 {
4291     MultiwriteCB *mcb = opaque;
4292 
4293     trace_multiwrite_cb(mcb, ret);
4294 
4295     if (ret < 0 && !mcb->error) {
4296         mcb->error = ret;
4297     }
4298 
4299     mcb->num_requests--;
4300     if (mcb->num_requests == 0) {
4301         multiwrite_user_cb(mcb);
4302         g_free(mcb);
4303     }
4304 }
4305 
4306 static int multiwrite_req_compare(const void *a, const void *b)
4307 {
4308     const BlockRequest *req1 = a, *req2 = b;
4309 
4310     /*
4311      * Note that we can't simply subtract req2->sector from req1->sector
4312      * here as that could overflow the return value.
4313      */
4314     if (req1->sector > req2->sector) {
4315         return 1;
4316     } else if (req1->sector < req2->sector) {
4317         return -1;
4318     } else {
4319         return 0;
4320     }
4321 }
4322 
4323 /*
4324  * Takes a bunch of requests and tries to merge them. Returns the number of
4325  * requests that remain after merging.
4326  */
4327 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4328     int num_reqs, MultiwriteCB *mcb)
4329 {
4330     int i, outidx;
4331 
4332     // Sort requests by start sector
4333     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4334 
4335     // Check if adjacent requests touch the same clusters. If so, combine them,
4336     // filling up gaps with zero sectors.
4337     outidx = 0;
4338     for (i = 1; i < num_reqs; i++) {
4339         int merge = 0;
4340         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4341 
4342         // Handle exactly sequential writes and overlapping writes.
4343         if (reqs[i].sector <= oldreq_last) {
4344             merge = 1;
4345         }
4346 
4347         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4348             merge = 0;
4349         }
4350 
4351         if (merge) {
4352             size_t size;
4353             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4354             qemu_iovec_init(qiov,
4355                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4356 
4357             // Add the first request to the merged one. If the requests are
4358             // overlapping, drop the last sectors of the first request.
4359             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4360             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4361 
4362             // We shouldn't need to add any zeros between the two requests
4363             assert(reqs[i].sector <= oldreq_last);
4364 
4365             // Add the second request
4366             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4367 
4368             reqs[outidx].nb_sectors = qiov->size >> 9;
4369             reqs[outidx].qiov = qiov;
4370 
4371             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4372         } else {
4373             outidx++;
4374             reqs[outidx].sector     = reqs[i].sector;
4375             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4376             reqs[outidx].qiov       = reqs[i].qiov;
4377         }
4378     }
4379 
4380     return outidx + 1;
4381 }
4382 
4383 /*
4384  * Submit multiple AIO write requests at once.
4385  *
4386  * On success, the function returns 0 and all requests in the reqs array have
4387  * been submitted. In error case this function returns -1, and any of the
4388  * requests may or may not be submitted yet. In particular, this means that the
4389  * callback will be called for some of the requests, for others it won't. The
4390  * caller must check the error field of the BlockRequest to wait for the right
4391  * callbacks (if error != 0, no callback will be called).
4392  *
4393  * The implementation may modify the contents of the reqs array, e.g. to merge
4394  * requests. However, the fields opaque and error are left unmodified as they
4395  * are used to signal failure for a single request to the caller.
4396  */
4397 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4398 {
4399     MultiwriteCB *mcb;
4400     int i;
4401 
4402     /* don't submit writes if we don't have a medium */
4403     if (bs->drv == NULL) {
4404         for (i = 0; i < num_reqs; i++) {
4405             reqs[i].error = -ENOMEDIUM;
4406         }
4407         return -1;
4408     }
4409 
4410     if (num_reqs == 0) {
4411         return 0;
4412     }
4413 
4414     // Create MultiwriteCB structure
4415     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4416     mcb->num_requests = 0;
4417     mcb->num_callbacks = num_reqs;
4418 
4419     for (i = 0; i < num_reqs; i++) {
4420         mcb->callbacks[i].cb = reqs[i].cb;
4421         mcb->callbacks[i].opaque = reqs[i].opaque;
4422     }
4423 
4424     // Check for mergeable requests
4425     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4426 
4427     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4428 
4429     /* Run the aio requests. */
4430     mcb->num_requests = num_reqs;
4431     for (i = 0; i < num_reqs; i++) {
4432         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4433                               reqs[i].nb_sectors, reqs[i].flags,
4434                               multiwrite_cb, mcb,
4435                               true);
4436     }
4437 
4438     return 0;
4439 }
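
/*
 * Sketch of the error contract documented above (assumed caller code):
 * after a failed submission, callbacks only fire for requests whose
 * error field is still 0.
 */
static int example_submit_batch(BlockDriverState *bs, BlockRequest *reqs,
                                int num_reqs)
{
    int ret = bdrv_aio_multiwrite(bs, reqs, num_reqs);

    if (ret < 0) {
        int i;
        for (i = 0; i < num_reqs; i++) {
            if (reqs[i].error) {
                /* failed synchronously; no callback will run for it */
            }
        }
    }
    return ret;
}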
4440 
4441 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4442 {
4443     acb->aiocb_info->cancel(acb);
4444 }
4445 
4446 /**************************************************************/
4447 /* async block device emulation */
4448 
4449 typedef struct BlockDriverAIOCBSync {
4450     BlockDriverAIOCB common;
4451     QEMUBH *bh;
4452     int ret;
4453     /* vector translation state */
4454     QEMUIOVector *qiov;
4455     uint8_t *bounce;
4456     int is_write;
4457 } BlockDriverAIOCBSync;
4458 
4459 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4460 {
4461     BlockDriverAIOCBSync *acb =
4462         container_of(blockacb, BlockDriverAIOCBSync, common);
4463     qemu_bh_delete(acb->bh);
4464     acb->bh = NULL;
4465     qemu_aio_release(acb);
4466 }
4467 
4468 static const AIOCBInfo bdrv_em_aiocb_info = {
4469     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4470     .cancel             = bdrv_aio_cancel_em,
4471 };
4472 
4473 static void bdrv_aio_bh_cb(void *opaque)
4474 {
4475     BlockDriverAIOCBSync *acb = opaque;
4476 
4477     if (!acb->is_write)
4478         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4479     qemu_vfree(acb->bounce);
4480     acb->common.cb(acb->common.opaque, acb->ret);
4481     qemu_bh_delete(acb->bh);
4482     acb->bh = NULL;
4483     qemu_aio_release(acb);
4484 }
4485 
4486 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4487                                             int64_t sector_num,
4488                                             QEMUIOVector *qiov,
4489                                             int nb_sectors,
4490                                             BlockDriverCompletionFunc *cb,
4491                                             void *opaque,
4492                                             int is_write)
4493 {
4495     BlockDriverAIOCBSync *acb;
4496 
4497     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4498     acb->is_write = is_write;
4499     acb->qiov = qiov;
4500     acb->bounce = qemu_blockalign(bs, qiov->size);
4501     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4502 
4503     if (is_write) {
4504         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4505         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4506     } else {
4507         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4508     }
4509 
4510     qemu_bh_schedule(acb->bh);
4511 
4512     return &acb->common;
4513 }
4514 
4515 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4516         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4517         BlockDriverCompletionFunc *cb, void *opaque)
4518 {
4519     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4520 }
4521 
4522 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4523         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4524         BlockDriverCompletionFunc *cb, void *opaque)
4525 {
4526     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4527 }
4528 
4529 
4530 typedef struct BlockDriverAIOCBCoroutine {
4531     BlockDriverAIOCB common;
4532     BlockRequest req;
4533     bool is_write;
4534     bool *done;
4535     QEMUBH *bh;
4536 } BlockDriverAIOCBCoroutine;
4537 
4538 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4539 {
4540     BlockDriverAIOCBCoroutine *acb =
4541         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4542     bool done = false;
4543 
4544     acb->done = &done;
4545     while (!done) {
4546         qemu_aio_wait();
4547     }
4548 }
4549 
4550 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4551     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4552     .cancel             = bdrv_aio_co_cancel_em,
4553 };
4554 
4555 static void bdrv_co_em_bh(void *opaque)
4556 {
4557     BlockDriverAIOCBCoroutine *acb = opaque;
4558 
4559     acb->common.cb(acb->common.opaque, acb->req.error);
4560 
4561     if (acb->done) {
4562         *acb->done = true;
4563     }
4564 
4565     qemu_bh_delete(acb->bh);
4566     qemu_aio_release(acb);
4567 }
4568 
4569 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4570 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4571 {
4572     BlockDriverAIOCBCoroutine *acb = opaque;
4573     BlockDriverState *bs = acb->common.bs;
4574 
4575     if (!acb->is_write) {
4576         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4577             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4578     } else {
4579         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4580             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4581     }
4582 
4583     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4584     qemu_bh_schedule(acb->bh);
4585 }
4586 
4587 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4588                                                int64_t sector_num,
4589                                                QEMUIOVector *qiov,
4590                                                int nb_sectors,
4591                                                BdrvRequestFlags flags,
4592                                                BlockDriverCompletionFunc *cb,
4593                                                void *opaque,
4594                                                bool is_write)
4595 {
4596     Coroutine *co;
4597     BlockDriverAIOCBCoroutine *acb;
4598 
4599     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4600     acb->req.sector = sector_num;
4601     acb->req.nb_sectors = nb_sectors;
4602     acb->req.qiov = qiov;
4603     acb->req.flags = flags;
4604     acb->is_write = is_write;
4605     acb->done = NULL;
4606 
4607     co = qemu_coroutine_create(bdrv_co_do_rw);
4608     qemu_coroutine_enter(co, acb);
4609 
4610     return &acb->common;
4611 }
4612 
4613 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4614 {
4615     BlockDriverAIOCBCoroutine *acb = opaque;
4616     BlockDriverState *bs = acb->common.bs;
4617 
4618     acb->req.error = bdrv_co_flush(bs);
4619     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4620     qemu_bh_schedule(acb->bh);
4621 }
4622 
4623 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4624         BlockDriverCompletionFunc *cb, void *opaque)
4625 {
4626     trace_bdrv_aio_flush(bs, opaque);
4627 
4628     Coroutine *co;
4629     BlockDriverAIOCBCoroutine *acb;
4630 
4631     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4632     acb->done = NULL;
4633 
4634     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4635     qemu_coroutine_enter(co, acb);
4636 
4637     return &acb->common;
4638 }
4639 
4640 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4641 {
4642     BlockDriverAIOCBCoroutine *acb = opaque;
4643     BlockDriverState *bs = acb->common.bs;
4644 
4645     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4646     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4647     qemu_bh_schedule(acb->bh);
4648 }
4649 
4650 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4651         int64_t sector_num, int nb_sectors,
4652         BlockDriverCompletionFunc *cb, void *opaque)
4653 {
4654     Coroutine *co;
4655     BlockDriverAIOCBCoroutine *acb;
4656 
4657     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4658 
4659     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4660     acb->req.sector = sector_num;
4661     acb->req.nb_sectors = nb_sectors;
4662     acb->done = NULL;
4663     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4664     qemu_coroutine_enter(co, acb);
4665 
4666     return &acb->common;
4667 }
4668 
4669 void bdrv_init(void)
4670 {
4671     module_call_init(MODULE_INIT_BLOCK);
4672 }
4673 
4674 void bdrv_init_with_whitelist(void)
4675 {
4676     use_bdrv_whitelist = 1;
4677     bdrv_init();
4678 }
4679 
4680 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4681                    BlockDriverCompletionFunc *cb, void *opaque)
4682 {
4683     BlockDriverAIOCB *acb;
4684 
4685     acb = g_slice_alloc(aiocb_info->aiocb_size);
4686     acb->aiocb_info = aiocb_info;
4687     acb->bs = bs;
4688     acb->cb = cb;
4689     acb->opaque = opaque;
4690     return acb;
4691 }
4692 
4693 void qemu_aio_release(void *p)
4694 {
4695     BlockDriverAIOCB *acb = p;
4696     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4697 }
4698 
4699 /**************************************************************/
4700 /* Coroutine block device emulation */
4701 
4702 typedef struct CoroutineIOCompletion {
4703     Coroutine *coroutine;
4704     int ret;
4705 } CoroutineIOCompletion;
4706 
4707 static void bdrv_co_io_em_complete(void *opaque, int ret)
4708 {
4709     CoroutineIOCompletion *co = opaque;
4710 
4711     co->ret = ret;
4712     qemu_coroutine_enter(co->coroutine, NULL);
4713 }
4714 
4715 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4716                                       int nb_sectors, QEMUIOVector *iov,
4717                                       bool is_write)
4718 {
4719     CoroutineIOCompletion co = {
4720         .coroutine = qemu_coroutine_self(),
4721     };
4722     BlockDriverAIOCB *acb;
4723 
4724     if (is_write) {
4725         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4726                                        bdrv_co_io_em_complete, &co);
4727     } else {
4728         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4729                                       bdrv_co_io_em_complete, &co);
4730     }
4731 
4732     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4733     if (!acb) {
4734         return -EIO;
4735     }
4736     qemu_coroutine_yield();
4737 
4738     return co.ret;
4739 }
4740 
4741 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4742                                          int64_t sector_num, int nb_sectors,
4743                                          QEMUIOVector *iov)
4744 {
4745     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4746 }
4747 
4748 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4749                                          int64_t sector_num, int nb_sectors,
4750                                          QEMUIOVector *iov)
4751 {
4752     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4753 }
4754 
4755 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4756 {
4757     RwCo *rwco = opaque;
4758 
4759     rwco->ret = bdrv_co_flush(rwco->bs);
4760 }
4761 
4762 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4763 {
4764     int ret;
4765 
4766     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4767         return 0;
4768     }
4769 
4770     /* Write back cached data to the OS even with cache=unsafe */
4771     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4772     if (bs->drv->bdrv_co_flush_to_os) {
4773         ret = bs->drv->bdrv_co_flush_to_os(bs);
4774         if (ret < 0) {
4775             return ret;
4776         }
4777     }
4778 
4779     /* But don't actually force it to the disk with cache=unsafe */
4780     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4781         goto flush_parent;
4782     }
4783 
4784     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4785     if (bs->drv->bdrv_co_flush_to_disk) {
4786         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4787     } else if (bs->drv->bdrv_aio_flush) {
4788         BlockDriverAIOCB *acb;
4789         CoroutineIOCompletion co = {
4790             .coroutine = qemu_coroutine_self(),
4791         };
4792 
4793         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4794         if (acb == NULL) {
4795             ret = -EIO;
4796         } else {
4797             qemu_coroutine_yield();
4798             ret = co.ret;
4799         }
4800     } else {
4801         /*
4802          * Some block drivers always operate in either writethrough or unsafe
4803          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4804          * know how the server works (because the behaviour is hardcoded or
4805          * depends on server-side configuration), so we can't ensure that
4806          * everything is safe on disk. Returning an error doesn't work because
4807          * that would break guests even if the server operates in writethrough
4808          * mode.
4809          *
4810          * Let's hope the user knows what he's doing.
4811          */
4812         ret = 0;
4813     }
4814     if (ret < 0) {
4815         return ret;
4816     }
4817 
4818     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4819      * in the case of cache=unsafe, so there are no useless flushes.
4820      */
4821 flush_parent:
4822     return bdrv_co_flush(bs->file);
4823 }
4824 
4825 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4826 {
4827     Error *local_err = NULL;
4828     int ret;
4829 
4830     if (!bs->drv) {
4831         return;
4832     }
4833 
4834     if (bs->drv->bdrv_invalidate_cache) {
4835         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4836     } else if (bs->file) {
4837         bdrv_invalidate_cache(bs->file, &local_err);
4838     }
4839     if (local_err) {
4840         error_propagate(errp, local_err);
4841         return;
4842     }
4843 
4844     ret = refresh_total_sectors(bs, bs->total_sectors);
4845     if (ret < 0) {
4846         error_setg_errno(errp, -ret, "Could not refresh total sector count");
4847         return;
4848     }
4849 }
4850 
4851 void bdrv_invalidate_cache_all(Error **errp)
4852 {
4853     BlockDriverState *bs;
4854     Error *local_err = NULL;
4855 
4856     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4857         bdrv_invalidate_cache(bs, &local_err);
4858         if (local_err) {
4859             error_propagate(errp, local_err);
4860             return;
4861         }
4862     }
4863 }
4864 
4865 void bdrv_clear_incoming_migration_all(void)
4866 {
4867     BlockDriverState *bs;
4868 
4869     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4870         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4871     }
4872 }
4873 
4874 int bdrv_flush(BlockDriverState *bs)
4875 {
4876     Coroutine *co;
4877     RwCo rwco = {
4878         .bs = bs,
4879         .ret = NOT_DONE,
4880     };
4881 
4882     if (qemu_in_coroutine()) {
4883         /* Fast-path if already in coroutine context */
4884         bdrv_flush_co_entry(&rwco);
4885     } else {
4886         co = qemu_coroutine_create(bdrv_flush_co_entry);
4887         qemu_coroutine_enter(co, &rwco);
4888         while (rwco.ret == NOT_DONE) {
4889             qemu_aio_wait();
4890         }
4891     }
4892 
4893     return rwco.ret;
4894 }
4895 
4896 typedef struct DiscardCo {
4897     BlockDriverState *bs;
4898     int64_t sector_num;
4899     int nb_sectors;
4900     int ret;
4901 } DiscardCo;
4902 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4903 {
4904     DiscardCo *rwco = opaque;
4905 
4906     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4907 }
4908 
4909 /* If no limit is specified in the BlockLimits, use a default
4910  * of 32768 512-byte sectors (16 MiB) per request.
4911  */
4912 #define MAX_DISCARD_DEFAULT 32768
4913 
4914 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4915                                  int nb_sectors)
4916 {
4917     int max_discard;
4918 
4919     if (!bs->drv) {
4920         return -ENOMEDIUM;
4921     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4922         return -EIO;
4923     } else if (bs->read_only) {
4924         return -EROFS;
4925     }
4926 
4927     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4928 
4929     /* Do nothing if disabled.  */
4930     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4931         return 0;
4932     }
4933 
4934     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4935         return 0;
4936     }
4937 
4938     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4939     while (nb_sectors > 0) {
4940         int ret;
4941         int num = nb_sectors;
4942 
4943         /* align request */
4944         if (bs->bl.discard_alignment &&
4945             num >= bs->bl.discard_alignment &&
4946             sector_num % bs->bl.discard_alignment) {
4947             if (num > bs->bl.discard_alignment) {
4948                 num = bs->bl.discard_alignment;
4949             }
4950             num -= sector_num % bs->bl.discard_alignment;
4951         }
4952 
4953         /* limit request size */
4954         if (num > max_discard) {
4955             num = max_discard;
4956         }
4957 
4958         if (bs->drv->bdrv_co_discard) {
4959             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4960         } else {
4961             BlockDriverAIOCB *acb;
4962             CoroutineIOCompletion co = {
4963                 .coroutine = qemu_coroutine_self(),
4964             };
4965 
4966             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4967                                             bdrv_co_io_em_complete, &co);
4968             if (acb == NULL) {
4969                 return -EIO;
4970             } else {
4971                 qemu_coroutine_yield();
4972                 ret = co.ret;
4973             }
4974         }
4975         if (ret && ret != -ENOTSUP) {
4976             return ret;
4977         }
4978 
4979         sector_num += num;
4980         nb_sectors -= num;
4981     }
4982     return 0;
4983 }
4984 
4985 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4986 {
4987     Coroutine *co;
4988     DiscardCo rwco = {
4989         .bs = bs,
4990         .sector_num = sector_num,
4991         .nb_sectors = nb_sectors,
4992         .ret = NOT_DONE,
4993     };
4994 
4995     if (qemu_in_coroutine()) {
4996         /* Fast-path if already in coroutine context */
4997         bdrv_discard_co_entry(&rwco);
4998     } else {
4999         co = qemu_coroutine_create(bdrv_discard_co_entry);
5000         qemu_coroutine_enter(co, &rwco);
5001         while (rwco.ret == NOT_DONE) {
5002             qemu_aio_wait();
5003         }
5004     }
5005 
5006     return rwco.ret;
5007 }
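
/*
 * Assumed usage sketch: a guest UNMAP command translated into a sector
 * range.  Note that -ENOTSUP is already swallowed per chunk inside
 * bdrv_co_discard(), so only real errors surface here.
 */
static int example_unmap_range(BlockDriverState *bs, int64_t sector_num,
                               int nb_sectors)
{
    if (bdrv_is_read_only(bs)) {
        return -EROFS;  /* mirrors the check inside bdrv_co_discard() */
    }
    return bdrv_discard(bs, sector_num, nb_sectors);
}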
5008 
5009 /**************************************************************/
5010 /* removable device support */
5011 
5012 /**
5013  * Return TRUE if the media is present
5014  */
5015 int bdrv_is_inserted(BlockDriverState *bs)
5016 {
5017     BlockDriver *drv = bs->drv;
5018 
5019     if (!drv)
5020         return 0;
5021     if (!drv->bdrv_is_inserted)
5022         return 1;
5023     return drv->bdrv_is_inserted(bs);
5024 }
5025 
5026 /**
5027  * Return whether the media changed since the last call to this
5028  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5029  */
5030 int bdrv_media_changed(BlockDriverState *bs)
5031 {
5032     BlockDriver *drv = bs->drv;
5033 
5034     if (drv && drv->bdrv_media_changed) {
5035         return drv->bdrv_media_changed(bs);
5036     }
5037     return -ENOTSUP;
5038 }
5039 
5040 /**
5041  * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
5042  */
5043 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5044 {
5045     BlockDriver *drv = bs->drv;
5046 
5047     if (drv && drv->bdrv_eject) {
5048         drv->bdrv_eject(bs, eject_flag);
5049     }
5050 
5051     if (bs->device_name[0] != '\0') {
5052         bdrv_emit_qmp_eject_event(bs, eject_flag);
5053     }
5054 }
5055 
5056 /**
5057  * Lock or unlock the media (if it is locked, the user won't be able
5058  * to eject it manually).
5059  */
5060 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5061 {
5062     BlockDriver *drv = bs->drv;
5063 
5064     trace_bdrv_lock_medium(bs, locked);
5065 
5066     if (drv && drv->bdrv_lock_medium) {
5067         drv->bdrv_lock_medium(bs, locked);
5068     }
5069 }
5070 
5071 /* needed for generic scsi interface */
5072 
5073 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5074 {
5075     BlockDriver *drv = bs->drv;
5076 
5077     if (drv && drv->bdrv_ioctl)
5078         return drv->bdrv_ioctl(bs, req, buf);
5079     return -ENOTSUP;
5080 }
5081 
5082 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5083         unsigned long int req, void *buf,
5084         BlockDriverCompletionFunc *cb, void *opaque)
5085 {
5086     BlockDriver *drv = bs->drv;
5087 
5088     if (drv && drv->bdrv_aio_ioctl)
5089         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5090     return NULL;
5091 }
5092 
5093 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5094 {
5095     bs->guest_block_size = align;
5096 }
5097 
5098 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5099 {
5100     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5101 }
5102 
5103 /*
5104  * Check if all memory in this vector meets the driver's memory alignment.
5105  */
5106 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5107 {
5108     int i;
5109     size_t alignment = bdrv_opt_mem_align(bs);
5110 
5111     for (i = 0; i < qiov->niov; i++) {
5112         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5113             return false;
5114         }
5115         if (qiov->iov[i].iov_len % alignment) {
5116             return false;
5117         }
5118     }
5119 
5120     return true;
5121 }
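
/*
 * Sketch (hypothetical helper): build a one-element vector that passes
 * bdrv_qiov_is_aligned().  'size' is assumed to be a multiple of
 * bdrv_opt_mem_align(bs); the buffer must later be freed with
 * qemu_vfree().
 */
static void example_init_aligned_qiov(BlockDriverState *bs,
                                      QEMUIOVector *qiov,
                                      struct iovec *iov, size_t size)
{
    iov->iov_base = qemu_blockalign(bs, size);
    iov->iov_len  = size;
    qemu_iovec_init_external(qiov, iov, 1);
}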
5122 
5123 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5124                                           Error **errp)
5125 {
5126     int64_t bitmap_size;
5127     BdrvDirtyBitmap *bitmap;
5128 
5129     assert((granularity & (granularity - 1)) == 0);
5130 
5131     granularity >>= BDRV_SECTOR_BITS;
5132     assert(granularity);
5133     bitmap_size = bdrv_getlength(bs);
5134     if (bitmap_size < 0) {
5135         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5136         errno = -bitmap_size;
5137         return NULL;
5138     }
5139     bitmap_size >>= BDRV_SECTOR_BITS;
5140     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5141     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5142     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5143     return bitmap;
5144 }
5145 
5146 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5147 {
5148     BdrvDirtyBitmap *bm, *next;
5149     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5150         if (bm == bitmap) {
5151             QLIST_REMOVE(bitmap, list);
5152             hbitmap_free(bitmap->bitmap);
5153             g_free(bitmap);
5154             return;
5155         }
5156     }
5157 }
5158 
5159 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5160 {
5161     BdrvDirtyBitmap *bm;
5162     BlockDirtyInfoList *list = NULL;
5163     BlockDirtyInfoList **plist = &list;
5164 
5165     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5166         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5167         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5168         info->count = bdrv_get_dirty_count(bs, bm);
5169         info->granularity =
5170             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5171         entry->value = info;
5172         *plist = entry;
5173         plist = &entry->next;
5174     }
5175 
5176     return list;
5177 }
5178 
5179 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5180 {
5181     if (bitmap) {
5182         return hbitmap_get(bitmap->bitmap, sector);
5183     } else {
5184         return 0;
5185     }
5186 }
5187 
5188 void bdrv_dirty_iter_init(BlockDriverState *bs,
5189                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5190 {
5191     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5192 }
5193 
5194 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5195                     int nr_sectors)
5196 {
5197     BdrvDirtyBitmap *bitmap;
5198     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5199         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5200     }
5201 }
5202 
5203 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5204 {
5205     BdrvDirtyBitmap *bitmap;
5206     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5207         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5208     }
5209 }
5210 
5211 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5212 {
5213     return hbitmap_count(bitmap->bitmap);
5214 }
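
/*
 * Illustrative sketch, not part of the original file: the typical life cycle
 * of a dirty bitmap - create it, let writes mark sectors dirty through
 * bdrv_set_dirty(), walk the dirty sectors with an HBitmapIter, and finally
 * release it. 'example_walk_dirty' and the 64 KiB granularity are
 * hypothetical.
 */
static void example_walk_dirty(BlockDriverState *bs, Error **errp)
{
    BdrvDirtyBitmap *bitmap;
    HBitmapIter hbi;
    int64_t sector;

    bitmap = bdrv_create_dirty_bitmap(bs, 65536, errp);
    if (!bitmap) {
        return;
    }

    /* ... guest writes happen here; bdrv_set_dirty() records them ... */

    bdrv_dirty_iter_init(bs, bitmap, &hbi);
    while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
        /* 'sector' starts a dirty chunk of one granularity unit */
    }

    bdrv_release_dirty_bitmap(bs, bitmap);
}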
5215 
5216 /* Get a reference to bs */
5217 void bdrv_ref(BlockDriverState *bs)
5218 {
5219     bs->refcnt++;
5220 }
5221 
/* Release a previously grabbed reference to bs.
 * If the reference count drops to zero after releasing, the BlockDriverState
 * is deleted. */
5225 void bdrv_unref(BlockDriverState *bs)
5226 {
5227     assert(bs->refcnt > 0);
5228     if (--bs->refcnt == 0) {
5229         bdrv_delete(bs);
5230     }
5231 }
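
/*
 * Illustrative sketch (hypothetical helper): pairing bdrv_ref() with
 * bdrv_unref() to keep a BlockDriverState alive across an operation that
 * might otherwise drop the last reference.
 */
static void example_with_bs_ref(BlockDriverState *bs,
                                void (*fn)(BlockDriverState *))
{
    bdrv_ref(bs);       /* cannot be deleted while fn() runs */
    fn(bs);
    bdrv_unref(bs);     /* deletes bs if this was the last reference */
}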
5232 
5233 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5234 {
5235     assert(bs->in_use != in_use);
5236     bs->in_use = in_use;
5237 }
5238 
5239 int bdrv_in_use(BlockDriverState *bs)
5240 {
5241     return bs->in_use;
5242 }
5243 
5244 void bdrv_iostatus_enable(BlockDriverState *bs)
5245 {
5246     bs->iostatus_enabled = true;
5247     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5248 }
5249 
5250 /* The I/O status is only enabled if the drive explicitly
5251  * enables it _and_ the VM is configured to stop on errors */
5252 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5253 {
5254     return (bs->iostatus_enabled &&
5255            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5256             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5257             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5258 }
5259 
5260 void bdrv_iostatus_disable(BlockDriverState *bs)
5261 {
5262     bs->iostatus_enabled = false;
5263 }
5264 
5265 void bdrv_iostatus_reset(BlockDriverState *bs)
5266 {
5267     if (bdrv_iostatus_is_enabled(bs)) {
5268         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5269         if (bs->job) {
5270             block_job_iostatus_reset(bs->job);
5271         }
5272     }
5273 }
5274 
5275 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5276 {
5277     assert(bdrv_iostatus_is_enabled(bs));
5278     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5279         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5280                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5281     }
5282 }
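
/*
 * Illustrative sketch, not part of the original file: the I/O status life
 * cycle as a device would drive it - enable when the drive is set up, record
 * an error when the VM is configured to stop on it, and reset when the guest
 * resumes. 'example_iostatus_cycle' is a hypothetical name.
 */
static void example_iostatus_cycle(BlockDriverState *bs, int error)
{
    bdrv_iostatus_enable(bs);                /* status starts as OK */

    if (bdrv_iostatus_is_enabled(bs)) {
        bdrv_iostatus_set_err(bs, error);    /* ENOSPC maps to NOSPACE */
    }

    bdrv_iostatus_reset(bs);                 /* back to OK on resume */
}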
5283 
void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
                     int64_t bytes, enum BlockAcctType type)
5287 {
5288     assert(type < BDRV_MAX_IOTYPE);
5289 
5290     cookie->bytes = bytes;
5291     cookie->start_time_ns = get_clock();
5292     cookie->type = type;
5293 }
5294 
void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5297 {
5298     assert(cookie->type < BDRV_MAX_IOTYPE);
5299 
5300     bs->nr_bytes[cookie->type] += cookie->bytes;
5301     bs->nr_ops[cookie->type]++;
5302     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5303 }
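
/*
 * Illustrative sketch (hypothetical wrapper): accounting a synchronous read.
 * bdrv_acct_start() records size, start time and type in the cookie on the
 * caller's stack; the matching bdrv_acct_done() folds them into the
 * per-type statistics of bs.
 */
static int example_accounted_read(BlockDriverState *bs, int64_t sector_num,
                                  uint8_t *buf, int nb_sectors)
{
    BlockAcctCookie cookie;
    int ret;

    bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
                    BDRV_ACCT_READ);
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bdrv_acct_done(bs, &cookie);

    return ret;
}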
5304 
5305 void bdrv_img_create(const char *filename, const char *fmt,
5306                      const char *base_filename, const char *base_fmt,
5307                      char *options, uint64_t img_size, int flags,
5308                      Error **errp, bool quiet)
5309 {
5310     QEMUOptionParameter *param = NULL, *create_options = NULL;
5311     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5312     BlockDriver *drv, *proto_drv;
5313     BlockDriver *backing_drv = NULL;
5314     Error *local_err = NULL;
5315     int ret = 0;
5316 
5317     /* Find driver and parse its options */
5318     drv = bdrv_find_format(fmt);
5319     if (!drv) {
5320         error_setg(errp, "Unknown file format '%s'", fmt);
5321         return;
5322     }
5323 
5324     proto_drv = bdrv_find_protocol(filename, true);
5325     if (!proto_drv) {
5326         error_setg(errp, "Unknown protocol '%s'", filename);
5327         return;
5328     }
5329 
5330     create_options = append_option_parameters(create_options,
5331                                               drv->create_options);
5332     create_options = append_option_parameters(create_options,
5333                                               proto_drv->create_options);
5334 
5335     /* Create parameter list with default values */
5336     param = parse_option_parameters("", create_options, param);
5337 
5338     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5339 
5340     /* Parse -o options */
5341     if (options) {
5342         param = parse_option_parameters(options, create_options, param);
5343         if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
5345             goto out;
5346         }
5347     }
5348 
5349     if (base_filename) {
5350         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5351                                  base_filename)) {
5352             error_setg(errp, "Backing file not supported for file format '%s'",
5353                        fmt);
5354             goto out;
5355         }
5356     }
5357 
5358     if (base_fmt) {
5359         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5360             error_setg(errp, "Backing file format not supported for file "
5361                              "format '%s'", fmt);
5362             goto out;
5363         }
5364     }
5365 
5366     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5367     if (backing_file && backing_file->value.s) {
5368         if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
5371             goto out;
5372         }
5373     }
5374 
5375     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5376     if (backing_fmt && backing_fmt->value.s) {
5377         backing_drv = bdrv_find_format(backing_fmt->value.s);
5378         if (!backing_drv) {
5379             error_setg(errp, "Unknown backing file format '%s'",
5380                        backing_fmt->value.s);
5381             goto out;
5382         }
5383     }
5384 
    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
5387     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5388     if (size && size->value.n == -1) {
5389         if (backing_file && backing_file->value.s) {
5390             BlockDriverState *bs;
            uint64_t backing_size;
5392             char buf[32];
5393             int back_flags;
5394 
5395             /* backing files always opened read-only */
5396             back_flags =
5397                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5398 
5399             bs = NULL;
5400             ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5401                             backing_drv, &local_err);
5402             if (ret < 0) {
5403                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5404                                  backing_file->value.s,
5405                                  error_get_pretty(local_err));
5406                 error_free(local_err);
5407                 local_err = NULL;
5408                 goto out;
5409             }
            bdrv_get_geometry(bs, &backing_size);
            backing_size *= BDRV_SECTOR_SIZE;

            snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
5414             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5415 
5416             bdrv_unref(bs);
5417         } else {
5418             error_setg(errp, "Image creation needs a size parameter");
5419             goto out;
5420         }
5421     }
5422 
5423     if (!quiet) {
5424         printf("Formatting '%s', fmt=%s ", filename, fmt);
5425         print_option_parameters(param);
5426         puts("");
5427     }
5428     ret = bdrv_create(drv, filename, param, &local_err);
5429     if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (which is most probably little more than "image too
         * large"), especially thanks to the cluster_size_hint. */
5433         const char *cluster_size_hint = "";
5434         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5435             cluster_size_hint = " (try using a larger cluster size)";
5436         }
5437         error_setg(errp, "The image size is too large for file format '%s'"
5438                    "%s", fmt, cluster_size_hint);
5439         error_free(local_err);
5440         local_err = NULL;
5441     }
5442 
5443 out:
5444     free_option_parameters(create_options);
5445     free_option_parameters(param);
5446 
5447     if (local_err) {
5448         error_propagate(errp, local_err);
5449     }
5450 }
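
/*
 * Illustrative sketch, not part of the original file: creating a qcow2
 * overlay on top of an existing image. Passing -1 as img_size lets the size
 * be taken from the backing file, as implemented above. The filenames and
 * the function name are hypothetical.
 */
static void example_create_overlay(Error **errp)
{
    bdrv_img_create("overlay.qcow2", "qcow2",
                    "base.img", NULL,   /* backing file, format autodetected */
                    NULL,               /* no extra -o options */
                    (uint64_t)-1,       /* size: inherit from backing file */
                    0, errp, true);     /* flags, errp, quiet */
}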
5451 
5452 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5453 {
5454     /* Currently BlockDriverState always uses the main loop AioContext */
5455     return qemu_get_aio_context();
5456 }
5457 
5458 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5459                                     NotifierWithReturn *notifier)
5460 {
5461     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5462 }
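
/*
 * Illustrative sketch, not part of the original file: registering a
 * before-write notifier. In this tree the notifier fires with the tracked
 * write request as 'data' (block/backup.c uses this to copy-on-write);
 * returning a negative errno fails the write. The example_* names are
 * hypothetical.
 */
static int example_before_write(NotifierWithReturn *notifier, void *data)
{
    /* inspect or back up the about-to-be-written range here */
    return 0;   /* 0 lets the write proceed */
}

static NotifierWithReturn example_write_notifier = {
    .notify = example_before_write,
};

static void example_register_notifier(BlockDriverState *bs)
{
    bdrv_add_before_write_notifier(bs, &example_write_notifier);
}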
5463 
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (!bs->drv || !bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}
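
/*
 * Illustrative sketch (hypothetical flow): amending an option of an open
 * image, e.g. the qcow2 compat level. The option string is parsed against
 * the driver's create_options, much as qemu-img amend does; this assumes bs
 * is open and its driver provides create_options.
 */
static int example_amend_compat(BlockDriverState *bs)
{
    QEMUOptionParameter *opts;
    int ret;

    opts = parse_option_parameters("compat=1.1", bs->drv->create_options,
                                   NULL);
    if (!opts) {
        return -EINVAL;
    }

    ret = bdrv_amend_options(bs, opts);
    free_option_parameters(opts);
    return ret;
}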
5471 
/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test whether the given bs is the candidate, or to recurse
 * further down the node graph.
 */
5477 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5478                                       BlockDriverState *candidate)
5479 {
    /* return false if basic checks fail */
5481     if (!bs || !bs->drv) {
5482         return false;
5483     }
5484 
    /* the code reached a non-filter driver -> check if bs is the same as
     * the candidate. This is the recursion's termination condition.
     */
5488     if (!bs->drv->is_filter) {
5489         return bs == candidate;
5490     }
5491     /* Down this path the driver is a block filter driver */
5492 
5493     /* If the block filter recursion method is defined use it to recurse down
5494      * the node graph.
5495      */
5496     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5497         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5498     }
5499 
    /* the driver is a block filter but does not allow recursion -> return
     * false
     */
5502     return false;
5503 }
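
/*
 * Illustrative sketch, not part of the original file: how a one-child block
 * filter could implement the recursion hook above by forwarding the check to
 * the child it filters. Multi-child filters (e.g. quorum) recurse into each
 * child instead. The function name is hypothetical.
 */
static bool example_filter_recurse_is_first_non_filter(
    BlockDriverState *bs, BlockDriverState *candidate)
{
    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
}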
5504 
/* This function checks if the candidate is the first non-filter bs down its
 * bs chain. Since we don't have pointers to parents, it explores all bs
 * chains from the top. Some filters can choose not to pass down the
 * recursion.
 */
5509 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5510 {
5511     BlockDriverState *bs;
5512 
5513     /* walk down the bs forest recursively */
5514     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5515         bool perm;
5516 
5517         /* try to recurse in this top level bs */
5518         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5519 
        /* candidate is the first non-filter */
5521         if (perm) {
5522             return true;
5523         }
5524     }
5525 
5526     return false;
5527 }
5528