/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

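/* A dirty bitmap records which regions of a BDS have been written to (the
 * actual bits live in the HBitmap), e.g. so that block jobs can find the
 * data they still need to copy. */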
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */

#define COROUTINE_POOL_RESERVATION 64 /* number of coroutines to reserve */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

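/* Illustrative examples: "c:" and "\\.\PhysicalDrive0" (or the "//./" form)
 * count as Windows drives; "c:\foo.img" only matches the drive *prefix*
 * check above. */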
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued, queue
     * this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
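
/* Illustrative examples: "nbd://localhost" and "nbd:unix:/tmp/sock" have a
 * protocol prefix; "/images/disk.qcow2" and "relative/path.img" do not,
 * and on Windows "d:\disk.img" is excluded by the drive checks above. */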

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p) {
            p++;
        } else {
            p = base_path;
        }
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1) {
                p1 = p2;
            }
        }
#endif
        if (p1) {
            p1++;
        } else {
            p1 = base_path;
        }
        if (p1 > p) {
            p = p1;
        }
        len = p - base_path;
        if (len > dest_size - 1) {
            len = dest_size - 1;
        }
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
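
/* Illustrative example: path_combine(buf, sizeof(buf), "/img/base.qcow2",
 * "snap.qcow2") yields "/img/snap.qcow2", while an absolute filename such
 * as "/other/snap.qcow2" is copied through unchanged. */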

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;
    int i;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char *filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
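
/* A typical caller (see bdrv_append_temp_snapshot() below for a concrete
 * instance) looks up a driver with bdrv_find_format(), builds QemuOpts from
 * drv->create_opts, sets at least BLOCK_OPT_SIZE, and then calls
 * bdrv_create(). */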

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
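
/* Illustrative usage: a PATH_MAX-sized buffer is customary here, as in
 * bdrv_append_temp_snapshot() below:
 *
 *     char *tmp = g_malloc0(PATH_MAX + 1);
 *     if (get_tmp_filename(tmp, PATH_MAX + 1) < 0) { ... }
 */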

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1) {
        len = sizeof(protocol) - 1;
    }
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
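
/* Illustrative examples: "nbd:localhost:10809" resolves to the driver whose
 * protocol_name is "nbd"; a plain path such as "/tmp/disk.img" has no
 * protocol prefix and falls back to the "file" driver. */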

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value.
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg) {
        return 0;
    }

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
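
/* Summary of the mapping above:
 *   off/none     -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *   directsync   -> BDRV_O_NOCACHE
 *   writeback    -> BDRV_O_CACHE_WB
 *   unsafe       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *   writethrough -> (no flags; the default)
 */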

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* If bdrv_open() was called directly with a protocol as drv, this layer
     * is already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
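
/* Illustrative example: the pseudo-filename
 *   json:{"driver": "qcow2", "file": {"driver": "file", "filename": "a.img"}}
 * parses into a QDict that qdict_flatten() turns into the dotted keys
 * "driver=qcow2", "file.driver=file" and "file.filename=a.img". */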

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bs->device_name);
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new("", errp);

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("", &error_abort);

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    BlockReopenQueueEntry *bs_entry;

    assert(bs != NULL);

    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
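
/* Illustrative usage: a single-device transaction, exactly as performed by
 * bdrv_reopen() below:
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 */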
1633 
1634 /*
1635  * Reopen multiple BlockDriverStates atomically & transactionally.
1636  *
1637  * The queue passed in (bs_queue) must have been built up previous
1638  * via bdrv_reopen_queue().
1639  *
1640  * Reopens all BDS specified in the queue, with the appropriate
1641  * flags.  All devices are prepared for reopen, and failure of any
1642  * device will cause all device changes to be abandonded, and intermediate
1643  * data cleaned up.
1644  *
1645  * If all devices prepare successfully, then the changes are committed
1646  * to all devices.
1647  *
1648  */
1649 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1650 {
1651     int ret = -1;
1652     BlockReopenQueueEntry *bs_entry, *next;
1653     Error *local_err = NULL;
1654 
1655     assert(bs_queue != NULL);
1656 
1657     bdrv_drain_all();
1658 
1659     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1660         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1661             error_propagate(errp, local_err);
1662             goto cleanup;
1663         }
1664         bs_entry->prepared = true;
1665     }
1666 
1667     /* If we reach this point, we have success and just need to apply the
1668      * changes
1669      */
1670     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1671         bdrv_reopen_commit(&bs_entry->state);
1672     }
1673 
1674     ret = 0;
1675 
1676 cleanup:
1677     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1678         if (ret && bs_entry->prepared) {
1679             bdrv_reopen_abort(&bs_entry->state);
1680         }
1681         g_free(bs_entry);
1682     }
1683     g_free(bs_queue);
1684     return ret;
1685 }
1686 
1687 
1688 /* Reopen a single BlockDriverState with the specified flags. */
1689 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1690 {
1691     int ret = -1;
1692     Error *local_err = NULL;
1693     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1694 
1695     ret = bdrv_reopen_multiple(queue, &local_err);
1696     if (local_err != NULL) {
1697         error_propagate(errp, local_err);
1698     }
1699     return ret;
1700 }
1701 
1702 
1703 /*
1704  * Prepares a BlockDriverState for reopen. All changes are staged in the
1705  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1706  * the block driver layer .bdrv_reopen_prepare()
1707  *
1708  * bs is the BlockDriverState to reopen
1709  * flags are the new open flags
1710  * queue is the reopen queue
1711  *
1712  * Returns 0 on success, non-zero on error.  On error errp will be set
1713  * as well.
1714  *
1715  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1716  * It is the responsibility of the caller to then call bdrv_reopen_abort()
1717  * or bdrv_reopen_commit() for any other BDS left in the prepared state.
1718  *
1719  */
1720 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1721                         Error **errp)
1722 {
1723     int ret = -1;
1724     Error *local_err = NULL;
1725     BlockDriver *drv;
1726 
1727     assert(reopen_state != NULL);
1728     assert(reopen_state->bs->drv != NULL);
1729     drv = reopen_state->bs->drv;
1730 
1731     /* if we are to stay read-only, do not allow permission change
1732      * to r/w */
1733     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1734         reopen_state->flags & BDRV_O_RDWR) {
1735         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1736                   reopen_state->bs->device_name);
1737         goto error;
1738     }
1739 
1740 
1741     ret = bdrv_flush(reopen_state->bs);
1742     if (ret) {
1743         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1744                   strerror(-ret));
1745         goto error;
1746     }
1747 
1748     if (drv->bdrv_reopen_prepare) {
1749         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1750         if (ret) {
1751             if (local_err != NULL) {
1752                 error_propagate(errp, local_err);
1753             } else {
1754                 error_setg(errp, "failed while preparing to reopen image '%s'",
1755                            reopen_state->bs->filename);
1756             }
1757             goto error;
1758         }
1759     } else {
1760         /* It is currently mandatory to have a bdrv_reopen_prepare()
1761          * handler for each supported drv. */
1762         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1763                   drv->format_name, reopen_state->bs->device_name,
1764                  "reopening of file");
1765         ret = -1;
1766         goto error;
1767     }
1768 
1769     ret = 0;
1770 
1771 error:
1772     return ret;
1773 }
1774 
1775 /*
1776  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1777  * makes them final by swapping the staging BlockDriverState contents into
1778  * the active BlockDriverState contents.
1779  */
1780 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1781 {
1782     BlockDriver *drv;
1783 
1784     assert(reopen_state != NULL);
1785     drv = reopen_state->bs->drv;
1786     assert(drv != NULL);
1787 
1788     /* If there are any driver level actions to take */
1789     if (drv->bdrv_reopen_commit) {
1790         drv->bdrv_reopen_commit(reopen_state);
1791     }
1792 
1793     /* set BDS specific flags now */
1794     reopen_state->bs->open_flags         = reopen_state->flags;
1795     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1796                                               BDRV_O_CACHE_WB);
1797     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1798 
1799     bdrv_refresh_limits(reopen_state->bs, NULL);
1800 }
1801 
1802 /*
1803  * Abort the reopen, and delete and free the staged changes in
1804  * reopen_state
1805  */
1806 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1807 {
1808     BlockDriver *drv;
1809 
1810     assert(reopen_state != NULL);
1811     drv = reopen_state->bs->drv;
1812     assert(drv != NULL);
1813 
1814     if (drv->bdrv_reopen_abort) {
1815         drv->bdrv_reopen_abort(reopen_state);
1816     }
1817 }
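
/*
 * Hedged sketch of the driver side of the reopen transaction (illustrative
 * only; ExampleReopenData and the handlers below are hypothetical).
 * prepare() stages everything in reopen_state->opaque and must not touch
 * the live state; commit() swaps the staged state in; abort() throws the
 * staged state away.
 */
typedef struct ExampleReopenData {
    int staged_fd; /* e.g. a descriptor opened with reopen_state->flags */
} ExampleReopenData;

static int example_reopen_prepare(BDRVReopenState *reopen_state,
                                  BlockReopenQueue *queue, Error **errp)
{
    reopen_state->opaque = g_new0(ExampleReopenData, 1);
    /* ... acquire resources according to reopen_state->flags here,
     * returning non-zero (and setting errp) on failure ... */
    return 0;
}

static void example_reopen_commit(BDRVReopenState *reopen_state)
{
    /* ... swap staged resources into reopen_state->bs->opaque ... */
    g_free(reopen_state->opaque);
    reopen_state->opaque = NULL;
}

static void example_reopen_abort(BDRVReopenState *reopen_state)
{
    /* ... release staged resources ... */
    g_free(reopen_state->opaque);
    reopen_state->opaque = NULL;
}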
1818 
1819 
1820 void bdrv_close(BlockDriverState *bs)
1821 {
1822     if (bs->job) {
1823         block_job_cancel_sync(bs->job);
1824     }
1825     bdrv_drain_all(); /* complete I/O */
1826     bdrv_flush(bs);
1827     bdrv_drain_all(); /* in case flush left pending I/O */
1828     notifier_list_notify(&bs->close_notifiers, bs);
1829 
1830     if (bs->drv) {
1831         if (bs->backing_hd) {
1832             BlockDriverState *backing_hd = bs->backing_hd;
1833             bdrv_set_backing_hd(bs, NULL);
1834             bdrv_unref(backing_hd);
1835         }
1836         bs->drv->bdrv_close(bs);
1837         g_free(bs->opaque);
1838         bs->opaque = NULL;
1839         bs->drv = NULL;
1840         bs->copy_on_read = 0;
1841         bs->backing_file[0] = '\0';
1842         bs->backing_format[0] = '\0';
1843         bs->total_sectors = 0;
1844         bs->encrypted = 0;
1845         bs->valid_key = 0;
1846         bs->sg = 0;
1847         bs->growable = 0;
1848         bs->zero_beyond_eof = false;
1849         QDECREF(bs->options);
1850         bs->options = NULL;
1851         QDECREF(bs->full_open_options);
1852         bs->full_open_options = NULL;
1853 
1854         if (bs->file != NULL) {
1855             bdrv_unref(bs->file);
1856             bs->file = NULL;
1857         }
1858     }
1859 
1860     bdrv_dev_change_media_cb(bs, false);
1861 
1862     /*throttling disk I/O limits*/
1863     if (bs->io_limits_enabled) {
1864         bdrv_io_limits_disable(bs);
1865     }
1866 }
1867 
1868 void bdrv_close_all(void)
1869 {
1870     BlockDriverState *bs;
1871 
1872     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1873         AioContext *aio_context = bdrv_get_aio_context(bs);
1874 
1875         aio_context_acquire(aio_context);
1876         bdrv_close(bs);
1877         aio_context_release(aio_context);
1878     }
1879 }
1880 
1881 /* Check if any requests are in-flight (including throttled requests) */
1882 static bool bdrv_requests_pending(BlockDriverState *bs)
1883 {
1884     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1885         return true;
1886     }
1887     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1888         return true;
1889     }
1890     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1891         return true;
1892     }
1893     if (bs->file && bdrv_requests_pending(bs->file)) {
1894         return true;
1895     }
1896     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1897         return true;
1898     }
1899     return false;
1900 }
1901 
1902 /*
1903  * Wait for pending requests to complete across all BlockDriverStates
1904  *
1905  * This function does not flush data to disk, use bdrv_flush_all() for that
1906  * after calling this function.
1907  *
1908  * Note that completion of an asynchronous I/O operation can trigger any
1909  * number of other I/O operations on other devices---for example a coroutine
1910  * can be arbitrarily complex and a constant flow of I/O can come until the
1911  * coroutine is complete.  Because of this, it is not possible to have a
1912  * function to drain a single device's I/O queue.
1913  */
1914 void bdrv_drain_all(void)
1915 {
1916     /* Always run first iteration so any pending completion BHs run */
1917     bool busy = true;
1918     BlockDriverState *bs;
1919 
1920     while (busy) {
1921         busy = false;
1922 
1923         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1924             AioContext *aio_context = bdrv_get_aio_context(bs);
1925             bool bs_busy;
1926 
1927             aio_context_acquire(aio_context);
1928             bdrv_flush_io_queue(bs);
1929             bdrv_start_throttled_reqs(bs);
1930             bs_busy = bdrv_requests_pending(bs);
1931             bs_busy |= aio_poll(aio_context, bs_busy);
1932             aio_context_release(aio_context);
1933 
1934             busy |= bs_busy;
1935         }
1936     }
1937 }
1938 
1939 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
1940  * and graph_bdrv_states lists.  Also empty device_name and node_name to
1941  * prevent a double remove. */
1942 void bdrv_make_anon(BlockDriverState *bs)
1943 {
1944     if (bs->device_name[0] != '\0') {
1945         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1946     }
1947     bs->device_name[0] = '\0';
1948     if (bs->node_name[0] != '\0') {
1949         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1950     }
1951     bs->node_name[0] = '\0';
1952 }
1953 
1954 static void bdrv_rebind(BlockDriverState *bs)
1955 {
1956     if (bs->drv && bs->drv->bdrv_rebind) {
1957         bs->drv->bdrv_rebind(bs);
1958     }
1959 }
1960 
1961 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1962                                      BlockDriverState *bs_src)
1963 {
1964     /* move some fields that need to stay attached to the device */
1965 
1966     /* dev info */
1967     bs_dest->dev_ops            = bs_src->dev_ops;
1968     bs_dest->dev_opaque         = bs_src->dev_opaque;
1969     bs_dest->dev                = bs_src->dev;
1970     bs_dest->guest_block_size   = bs_src->guest_block_size;
1971     bs_dest->copy_on_read       = bs_src->copy_on_read;
1972 
1973     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1974 
1975     /* i/o throttled req */
1976     memcpy(&bs_dest->throttle_state,
1977            &bs_src->throttle_state,
1978            sizeof(ThrottleState));
1979     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1980     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1981     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1982 
1983     /* r/w error */
1984     bs_dest->on_read_error      = bs_src->on_read_error;
1985     bs_dest->on_write_error     = bs_src->on_write_error;
1986 
1987     /* i/o status */
1988     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1989     bs_dest->iostatus           = bs_src->iostatus;
1990 
1991     /* dirty bitmap */
1992     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1993 
1994     /* reference count */
1995     bs_dest->refcnt             = bs_src->refcnt;
1996 
1997     /* job */
1998     bs_dest->job                = bs_src->job;
1999 
2000     /* keep the same entry in bdrv_states */
2001     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
2002             bs_src->device_name);
2003     bs_dest->device_list = bs_src->device_list;
2004     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2005            sizeof(bs_dest->op_blockers));
2006 }
2007 
2008 /*
2009  * Swap bs contents for two image chains while they are live,
2010  * while keeping required fields on the BlockDriverState that is
2011  * actually attached to a device.
2012  *
2013  * This will modify the BlockDriverState fields, and swap contents
2014  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2015  *
2016  * bs_new is required to be anonymous.
2017  *
2018  * This function does not create any image files.
2019  */
2020 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2021 {
2022     BlockDriverState tmp;
2023 
2024     /* The code needs to swap the node_name but simply swapping node_list won't
2025      * work so first remove the nodes from the graph list, do the swap then
2026      * insert them back if needed.
2027      */
2028     if (bs_new->node_name[0] != '\0') {
2029         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2030     }
2031     if (bs_old->node_name[0] != '\0') {
2032         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2033     }
2034 
2035     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2036     assert(bs_new->device_name[0] == '\0');
2037     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2038     assert(bs_new->job == NULL);
2039     assert(bs_new->dev == NULL);
2040     assert(bs_new->io_limits_enabled == false);
2041     assert(!throttle_have_timer(&bs_new->throttle_state));
2042 
2043     tmp = *bs_new;
2044     *bs_new = *bs_old;
2045     *bs_old = tmp;
2046 
2047     /* there are some fields that should not be swapped, move them back */
2048     bdrv_move_feature_fields(&tmp, bs_old);
2049     bdrv_move_feature_fields(bs_old, bs_new);
2050     bdrv_move_feature_fields(bs_new, &tmp);
2051 
2052     /* bs_new shouldn't be in bdrv_states even after the swap!  */
2053     assert(bs_new->device_name[0] == '\0');
2054 
2055     /* Check a few fields that should remain attached to the device */
2056     assert(bs_new->dev == NULL);
2057     assert(bs_new->job == NULL);
2058     assert(bs_new->io_limits_enabled == false);
2059     assert(!throttle_have_timer(&bs_new->throttle_state));
2060 
2061     /* insert the nodes back into the graph node list if needed */
2062     if (bs_new->node_name[0] != '\0') {
2063         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2064     }
2065     if (bs_old->node_name[0] != '\0') {
2066         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2067     }
2068 
2069     bdrv_rebind(bs_new);
2070     bdrv_rebind(bs_old);
2071 }
2072 
2073 /*
2074  * Add new bs contents at the top of an image chain while the chain is
2075  * live, while keeping required fields on the top layer.
2076  *
2077  * This will modify the BlockDriverState fields, and swap contents
2078  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2079  *
2080  * bs_new is required to be anonymous.
2081  *
2082  * This function does not create any image files.
2083  */
2084 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2085 {
2086     bdrv_swap(bs_new, bs_top);
2087 
2088     /* After the swap, bs_top holds the overlay's contents and bs_new
2089      * holds the old top's; link the old top in as bs_top's backing file. */
2090     bdrv_set_backing_hd(bs_top, bs_new);
2091 }
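
/* Hedged usage sketch: live snapshot code opens the new overlay image into
 * an anonymous bs_new and then calls bdrv_append(bs_new, bs_top).  The
 * guest device keeps its bs_top pointer, which now holds the overlay's
 * contents, while the old top image serves as the overlay's backing file. */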
2092 
2093 static void bdrv_delete(BlockDriverState *bs)
2094 {
2095     assert(!bs->dev);
2096     assert(!bs->job);
2097     assert(bdrv_op_blocker_is_empty(bs));
2098     assert(!bs->refcnt);
2099     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2100 
2101     bdrv_close(bs);
2102 
2103     /* remove from list, if necessary */
2104     bdrv_make_anon(bs);
2105 
2106     g_free(bs);
2107 }
2108 
2109 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2110 /* TODO change to DeviceState *dev when all users are qdevified */
2111 {
2112     if (bs->dev) {
2113         return -EBUSY;
2114     }
2115     bs->dev = dev;
2116     bdrv_iostatus_reset(bs);
2117 
2118     /* We're expecting I/O from the device so bump up coroutine pool size */
2119     qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION);
2120     return 0;
2121 }
2122 
2123 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2124 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2125 {
2126     if (bdrv_attach_dev(bs, dev) < 0) {
2127         abort();
2128     }
2129 }
2130 
2131 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2132 /* TODO change to DeviceState *dev when all users are qdevified */
2133 {
2134     assert(bs->dev == dev);
2135     bs->dev = NULL;
2136     bs->dev_ops = NULL;
2137     bs->dev_opaque = NULL;
2138     bs->guest_block_size = 512;
2139     qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION);
2140 }
2141 
2142 /* TODO change to return DeviceState * when all users are qdevified */
2143 void *bdrv_get_attached_dev(BlockDriverState *bs)
2144 {
2145     return bs->dev;
2146 }
2147 
2148 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2149                       void *opaque)
2150 {
2151     bs->dev_ops = ops;
2152     bs->dev_opaque = opaque;
2153 }
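
/*
 * Hedged sketch of a device model registering BlockDevOps callbacks; the
 * ops table and handlers below are hypothetical.  Callback signatures
 * follow the call sites in this file.
 */
static bool example_is_tray_open(void *opaque)
{
    return false; /* this sketch models a device without a tray */
}

static void example_change_media_cb(void *opaque, bool load)
{
    /* e.g. update device registers / raise a guest notification */
}

static const BlockDevOps example_dev_ops = {
    .change_media_cb = example_change_media_cb,
    .is_tray_open    = example_is_tray_open,
};

/* ... during device setup: bdrv_set_dev_ops(bs, &example_dev_ops, dev); */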
2154 
2155 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2156 {
2157     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2158         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2159         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2160         if (tray_was_closed) {
2161             /* tray open */
2162             qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2163                                               true, &error_abort);
2164         }
2165         if (load) {
2166             /* tray close */
2167             qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2168                                               false, &error_abort);
2169         }
2170     }
2171 }
2172 
2173 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2174 {
2175     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2176 }
2177 
2178 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2179 {
2180     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2181         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2182     }
2183 }
2184 
2185 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2186 {
2187     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2188         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2189     }
2190     return false;
2191 }
2192 
2193 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2194 {
2195     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2196         bs->dev_ops->resize_cb(bs->dev_opaque);
2197     }
2198 }
2199 
2200 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2201 {
2202     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2203         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2204     }
2205     return false;
2206 }
2207 
2208 /*
2209  * Run consistency checks on an image
2210  *
2211  * Returns 0 if the check could be completed (it doesn't mean that the image is
2212  * free of errors) or -errno when an internal error occurred. The results of the
2213  * check are stored in res.
2214  */
2215 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2216 {
2217     if (bs->drv == NULL) {
2218         return -ENOMEDIUM;
2219     }
2220     if (bs->drv->bdrv_check == NULL) {
2221         return -ENOTSUP;
2222     }
2223 
2224     memset(res, 0, sizeof(*res));
2225     return bs->drv->bdrv_check(bs, res, fix);
2226 }
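
/*
 * Hedged usage sketch for bdrv_check(); illustrative only.  The res field
 * name is an assumption taken from BdrvCheckResult in this tree's headers.
 */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, 0 /* report only, do not repair */);

    if (ret < 0) {
        return ret; /* -ENOMEDIUM, -ENOTSUP, or an internal error */
    }
    /* ret == 0 only means the check ran; inspect the results */
    return res.corruptions ? -EIO : 0;
}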
2227 
2228 #define COMMIT_BUF_SECTORS 2048
2229 
2230 /* commit COW file into the raw image */
2231 int bdrv_commit(BlockDriverState *bs)
2232 {
2233     BlockDriver *drv = bs->drv;
2234     int64_t sector, total_sectors, length, backing_length;
2235     int n, ro, open_flags;
2236     int ret = 0;
2237     uint8_t *buf = NULL;
2238     char filename[PATH_MAX];
2239 
2240     if (!drv)
2241         return -ENOMEDIUM;
2242 
2243     if (!bs->backing_hd) {
2244         return -ENOTSUP;
2245     }
2246 
2247     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2248         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2249         return -EBUSY;
2250     }
2251 
2252     ro = bs->backing_hd->read_only;
2253     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2254     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2255     open_flags =  bs->backing_hd->open_flags;
2256 
2257     if (ro) {
2258         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2259             return -EACCES;
2260         }
2261     }
2262 
2263     length = bdrv_getlength(bs);
2264     if (length < 0) {
2265         ret = length;
2266         goto ro_cleanup;
2267     }
2268 
2269     backing_length = bdrv_getlength(bs->backing_hd);
2270     if (backing_length < 0) {
2271         ret = backing_length;
2272         goto ro_cleanup;
2273     }
2274 
2275     /* If our top snapshot is larger than the backing file image,
2276      * grow the backing file image if possible.  If not possible,
2277      * we must return an error */
2278     if (length > backing_length) {
2279         ret = bdrv_truncate(bs->backing_hd, length);
2280         if (ret < 0) {
2281             goto ro_cleanup;
2282         }
2283     }
2284 
2285     total_sectors = length >> BDRV_SECTOR_BITS;
2286 
2287     /* qemu_try_blockalign() for bs will choose an alignment that works for
2288      * bs->backing_hd as well, so no need to compare the alignment manually. */
2289     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2290     if (buf == NULL) {
2291         ret = -ENOMEM;
2292         goto ro_cleanup;
2293     }
2294 
2295     for (sector = 0; sector < total_sectors; sector += n) {
2296         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2297         if (ret < 0) {
2298             goto ro_cleanup;
2299         }
2300         if (ret) {
2301             ret = bdrv_read(bs, sector, buf, n);
2302             if (ret < 0) {
2303                 goto ro_cleanup;
2304             }
2305 
2306             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2307             if (ret < 0) {
2308                 goto ro_cleanup;
2309             }
2310         }
2311     }
2312 
2313     if (drv->bdrv_make_empty) {
2314         ret = drv->bdrv_make_empty(bs);
2315         if (ret < 0) {
2316             goto ro_cleanup;
2317         }
2318         bdrv_flush(bs);
2319     }
2320 
2321     /*
2322      * Make sure all data we wrote to the backing device is actually
2323      * stable on disk.
2324      */
2325     if (bs->backing_hd) {
2326         bdrv_flush(bs->backing_hd);
2327     }
2328 
2329     ret = 0;
2330 ro_cleanup:
2331     qemu_vfree(buf);
2332 
2333     if (ro) {
2334         /* ignoring error return here */
2335         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2336     }
2337 
2338     return ret;
2339 }
2340 
2341 int bdrv_commit_all(void)
2342 {
2343     BlockDriverState *bs;
2344 
2345     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2346         AioContext *aio_context = bdrv_get_aio_context(bs);
2347 
2348         aio_context_acquire(aio_context);
2349         if (bs->drv && bs->backing_hd) {
2350             int ret = bdrv_commit(bs);
2351             if (ret < 0) {
2352                 aio_context_release(aio_context);
2353                 return ret;
2354             }
2355         }
2356         aio_context_release(aio_context);
2357     }
2358     return 0;
2359 }
2360 
2361 /**
2362  * Remove an active request from the tracked requests list
2363  *
2364  * This function should be called when a tracked request is completing.
2365  */
2366 static void tracked_request_end(BdrvTrackedRequest *req)
2367 {
2368     if (req->serialising) {
2369         req->bs->serialising_in_flight--;
2370     }
2371 
2372     QLIST_REMOVE(req, list);
2373     qemu_co_queue_restart_all(&req->wait_queue);
2374 }
2375 
2376 /**
2377  * Add an active request to the tracked requests list
2378  */
2379 static void tracked_request_begin(BdrvTrackedRequest *req,
2380                                   BlockDriverState *bs,
2381                                   int64_t offset,
2382                                   unsigned int bytes, bool is_write)
2383 {
2384     *req = (BdrvTrackedRequest){
2385         .bs = bs,
2386         .offset         = offset,
2387         .bytes          = bytes,
2388         .is_write       = is_write,
2389         .co             = qemu_coroutine_self(),
2390         .serialising    = false,
2391         .overlap_offset = offset,
2392         .overlap_bytes  = bytes,
2393     };
2394 
2395     qemu_co_queue_init(&req->wait_queue);
2396 
2397     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2398 }
2399 
2400 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2401 {
2402     int64_t overlap_offset = req->offset & ~(align - 1);
2403     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2404                                - overlap_offset;
2405 
2406     if (!req->serialising) {
2407         req->bs->serialising_in_flight++;
2408         req->serialising = true;
2409     }
2410 
2411     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2412     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2413 }
2414 
2415 /**
2416  * Round a region to cluster boundaries
2417  */
2418 void bdrv_round_to_clusters(BlockDriverState *bs,
2419                             int64_t sector_num, int nb_sectors,
2420                             int64_t *cluster_sector_num,
2421                             int *cluster_nb_sectors)
2422 {
2423     BlockDriverInfo bdi;
2424 
2425     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2426         *cluster_sector_num = sector_num;
2427         *cluster_nb_sectors = nb_sectors;
2428     } else {
2429         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2430         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2431         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2432                                             nb_sectors, c);
2433     }
2434 }
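
/* Worked example (hedged, illustrative): with bdi.cluster_size == 64 KiB,
 * c = 65536 / BDRV_SECTOR_SIZE = 128, so a request for sectors [100, 150)
 * is widened to [0, 256):
 *   *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)        = 0
 *   *cluster_nb_sectors = QEMU_ALIGN_UP(100 - 0 + 50, 128) = 256
 */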
2435 
2436 static int bdrv_get_cluster_size(BlockDriverState *bs)
2437 {
2438     BlockDriverInfo bdi;
2439     int ret;
2440 
2441     ret = bdrv_get_info(bs, &bdi);
2442     if (ret < 0 || bdi.cluster_size == 0) {
2443         return bs->request_alignment;
2444     } else {
2445         return bdi.cluster_size;
2446     }
2447 }
2448 
2449 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2450                                      int64_t offset, unsigned int bytes)
2451 {
2452     /*        aaaa   bbbb */
2453     if (offset >= req->overlap_offset + req->overlap_bytes) {
2454         return false;
2455     }
2456     /* bbbb   aaaa        */
2457     if (req->overlap_offset >= offset + bytes) {
2458         return false;
2459     }
2460     return true;
2461 }
2462 
2463 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2464 {
2465     BlockDriverState *bs = self->bs;
2466     BdrvTrackedRequest *req;
2467     bool retry;
2468     bool waited = false;
2469 
2470     if (!bs->serialising_in_flight) {
2471         return false;
2472     }
2473 
2474     do {
2475         retry = false;
2476         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2477             if (req == self || (!req->serialising && !self->serialising)) {
2478                 continue;
2479             }
2480             if (tracked_request_overlaps(req, self->overlap_offset,
2481                                          self->overlap_bytes))
2482             {
2483                 /* Hitting this means there was a reentrant request, for
2484                  * example, a block driver issuing nested requests.  This must
2485                  * never happen since it means deadlock.
2486                  */
2487                 assert(qemu_coroutine_self() != req->co);
2488 
2489                 /* If the request is already (indirectly) waiting for us, or
2490                  * will wait for us as soon as it wakes up, then just go on
2491                  * (instead of producing a deadlock in the former case). */
2492                 if (!req->waiting_for) {
2493                     self->waiting_for = req;
2494                     qemu_co_queue_wait(&req->wait_queue);
2495                     self->waiting_for = NULL;
2496                     retry = true;
2497                     waited = true;
2498                     break;
2499                 }
2500             }
2501         }
2502     } while (retry);
2503 
2504     return waited;
2505 }
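
/*
 * Hedged sketch (illustrative only) of how the helpers above combine in a
 * request's lifecycle, mirroring bdrv_co_do_preadv/pwritev below.
 */
static int coroutine_fn example_serialised_request(BlockDriverState *bs,
                                                   int64_t offset,
                                                   unsigned int bytes)
{
    BdrvTrackedRequest req;

    tracked_request_begin(&req, bs, offset, bytes, true);
    /* Widen the overlap window to cluster granularity, then block until
     * no overlapping serialising request is in flight */
    mark_request_serialising(&req, bdrv_get_cluster_size(bs));
    wait_serialising_requests(&req);
    /* ... perform the guarded I/O here ... */
    tracked_request_end(&req);
    return 0;
}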
2506 
2507 /*
2508  * Return values:
2509  * 0        - success
2510  * -EINVAL  - backing format specified, but no file
2511  * -ENOSPC  - can't update the backing file because no space is left in the
2512  *            image file header
2513  * -ENOTSUP - format driver doesn't support changing the backing file
2514  */
2515 int bdrv_change_backing_file(BlockDriverState *bs,
2516     const char *backing_file, const char *backing_fmt)
2517 {
2518     BlockDriver *drv = bs->drv;
2519     int ret;
2520 
2521     /* Backing file format doesn't make sense without a backing file */
2522     if (backing_fmt && !backing_file) {
2523         return -EINVAL;
2524     }
2525 
2526     if (drv->bdrv_change_backing_file != NULL) {
2527         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2528     } else {
2529         ret = -ENOTSUP;
2530     }
2531 
2532     if (ret == 0) {
2533         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2534         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2535     }
2536     return ret;
2537 }
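
/* Hedged usage sketch; the file name and format below are hypothetical. */
static int example_retarget_backing_file(BlockDriverState *bs)
{
    int ret = bdrv_change_backing_file(bs, "base.qcow2", "qcow2");
    if (ret == -ENOTSUP) {
        /* the format driver has no .bdrv_change_backing_file handler */
    }
    return ret;
}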
2538 
2539 /*
2540  * Finds the image layer in the chain that has 'bs' as its backing file.
2541  *
2542  * active is the current topmost image.
2543  *
2544  * Returns NULL if bs is not found in active's image chain,
2545  * or if active == bs.
2546  *
2547  * Returns the bottommost base image if bs == NULL.
2548  */
2549 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2550                                     BlockDriverState *bs)
2551 {
2552     while (active && bs != active->backing_hd) {
2553         active = active->backing_hd;
2554     }
2555 
2556     return active;
2557 }
2558 
2559 /* Given a BDS, searches for the base layer. */
2560 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2561 {
2562     return bdrv_find_overlay(bs, NULL);
2563 }
2564 
2565 typedef struct BlkIntermediateStates {
2566     BlockDriverState *bs;
2567     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2568 } BlkIntermediateStates;
2569 
2570 
2571 /*
2572  * Drops images above 'base' up to and including 'top', and sets the image
2573  * above 'top' to have base as its backing file.
2574  *
2575  * Requires that the overlay of 'top' is opened r/w, so that its backing
2576  * file information can be properly updated.
2577  *
2578  * E.g., this will convert the following chain:
2579  * bottom <- base <- intermediate <- top <- active
2580  *
2581  * to
2582  *
2583  * bottom <- base <- active
2584  *
2585  * It is allowed for bottom==base, in which case it converts:
2586  *
2587  * base <- intermediate <- top <- active
2588  *
2589  * to
2590  *
2591  * base <- active
2592  *
2593  * If backing_file_str is non-NULL, it will be used when modifying top's
2594  * overlay image metadata.
2595  *
2596  * Error conditions:
2597  *  if active == top, that is considered an error
2598  *
2599  */
2600 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2601                            BlockDriverState *base, const char *backing_file_str)
2602 {
2603     BlockDriverState *intermediate;
2604     BlockDriverState *base_bs = NULL;
2605     BlockDriverState *new_top_bs = NULL;
2606     BlkIntermediateStates *intermediate_state, *next;
2607     int ret = -EIO;
2608 
2609     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2610     QSIMPLEQ_INIT(&states_to_delete);
2611 
2612     if (!top->drv || !base->drv) {
2613         goto exit;
2614     }
2615 
2616     new_top_bs = bdrv_find_overlay(active, top);
2617 
2618     if (new_top_bs == NULL) {
2619         /* we could not find the image above 'top', this is an error */
2620         goto exit;
2621     }
2622 
2623     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2624      * to do, no intermediate images */
2625     if (new_top_bs->backing_hd == base) {
2626         ret = 0;
2627         goto exit;
2628     }
2629 
2630     intermediate = top;
2631 
2632     /* now we will go down through the list, and add each BDS we find
2633      * into our deletion queue, until we hit the 'base'
2634      */
2635     while (intermediate) {
2636         intermediate_state = g_new0(BlkIntermediateStates, 1);
2637         intermediate_state->bs = intermediate;
2638         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2639 
2640         if (intermediate->backing_hd == base) {
2641             base_bs = intermediate->backing_hd;
2642             break;
2643         }
2644         intermediate = intermediate->backing_hd;
2645     }
2646     if (base_bs == NULL) {
2647         /* Something went wrong: we did not end at the base.  Safely
2648          * unravel everything and exit with an error. */
2649         goto exit;
2650     }
2651 
2652     /* success - we can delete the intermediate states, and link top->base */
2653     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2654     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2655                                    base_bs->drv ? base_bs->drv->format_name : "");
2656     if (ret) {
2657         goto exit;
2658     }
2659     bdrv_set_backing_hd(new_top_bs, base_bs);
2660 
2661     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2662         /* so that bdrv_close() does not recursively close the chain */
2663         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2664         bdrv_unref(intermediate_state->bs);
2665     }
2666     ret = 0;
2667 
2668 exit:
2669     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2670         g_free(intermediate_state);
2671     }
2672     return ret;
2673 }
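
/* Hedged usage sketch: given the chain base <- mid <- top <- active,
 *   bdrv_drop_intermediate(active, top, base, NULL);
 * rewrites active's backing-file metadata to point at base (falling back
 * to base->filename because backing_file_str is NULL) and unrefs mid and
 * top, leaving the chain base <- active. */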
2674 
2675 
2676 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2677                                    size_t size)
2678 {
2679     int64_t len;
2680 
2681     if (size > INT_MAX) {
2682         return -EIO;
2683     }
2684 
2685     if (!bdrv_is_inserted(bs))
2686         return -ENOMEDIUM;
2687 
2688     if (bs->growable)
2689         return 0;
2690 
2691     len = bdrv_getlength(bs);
2692 
2693     if (offset < 0)
2694         return -EIO;
2695 
2696     if ((offset > len) || (len - offset < size))
2697         return -EIO;
2698 
2699     return 0;
2700 }
2701 
2702 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2703                               int nb_sectors)
2704 {
2705     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2706         return -EIO;
2707     }
2708 
2709     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2710                                    nb_sectors * BDRV_SECTOR_SIZE);
2711 }
2712 
2713 typedef struct RwCo {
2714     BlockDriverState *bs;
2715     int64_t offset;
2716     QEMUIOVector *qiov;
2717     bool is_write;
2718     int ret;
2719     BdrvRequestFlags flags;
2720 } RwCo;
2721 
2722 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2723 {
2724     RwCo *rwco = opaque;
2725 
2726     if (!rwco->is_write) {
2727         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2728                                       rwco->qiov->size, rwco->qiov,
2729                                       rwco->flags);
2730     } else {
2731         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2732                                        rwco->qiov->size, rwco->qiov,
2733                                        rwco->flags);
2734     }
2735 }
2736 
2737 /*
2738  * Process a vectored synchronous request using coroutines
2739  */
2740 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2741                         QEMUIOVector *qiov, bool is_write,
2742                         BdrvRequestFlags flags)
2743 {
2744     Coroutine *co;
2745     RwCo rwco = {
2746         .bs = bs,
2747         .offset = offset,
2748         .qiov = qiov,
2749         .is_write = is_write,
2750         .ret = NOT_DONE,
2751         .flags = flags,
2752     };
2753 
2754     /**
2755      * In a synchronous call context the vcpu is blocked, so the throttling
2756      * timer will never fire; therefore I/O throttling has to be disabled
2757      * here if it has been enabled.
2758      */
2759     if (bs->io_limits_enabled) {
2760         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2761                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2762         bdrv_io_limits_disable(bs);
2763     }
2764 
2765     if (qemu_in_coroutine()) {
2766         /* Fast-path if already in coroutine context */
2767         bdrv_rw_co_entry(&rwco);
2768     } else {
2769         AioContext *aio_context = bdrv_get_aio_context(bs);
2770 
2771         co = qemu_coroutine_create(bdrv_rw_co_entry);
2772         qemu_coroutine_enter(co, &rwco);
2773         while (rwco.ret == NOT_DONE) {
2774             aio_poll(aio_context, true);
2775         }
2776     }
2777     return rwco.ret;
2778 }
2779 
2780 /*
2781  * Process a synchronous request using coroutines
2782  */
2783 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2784                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2785 {
2786     QEMUIOVector qiov;
2787     struct iovec iov = {
2788         .iov_base = (void *)buf,
2789         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2790     };
2791 
2792     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2793         return -EINVAL;
2794     }
2795 
2796     qemu_iovec_init_external(&qiov, &iov, 1);
2797     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2798                         &qiov, is_write, flags);
2799 }
2800 
2801 /* return < 0 if error. See bdrv_write() for the return codes */
2802 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2803               uint8_t *buf, int nb_sectors)
2804 {
2805     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2806 }
2807 
2808 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2809 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2810                           uint8_t *buf, int nb_sectors)
2811 {
2812     bool enabled;
2813     int ret;
2814 
2815     enabled = bs->io_limits_enabled;
2816     bs->io_limits_enabled = false;
2817     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2818     bs->io_limits_enabled = enabled;
2819     return ret;
2820 }
2821 
2822 /* Return < 0 if error. Important errors are:
2823   -EIO         generic I/O error (may happen for all errors)
2824   -ENOMEDIUM   No media inserted.
2825   -EINVAL      Invalid sector number or nb_sectors
2826   -EACCES      Trying to write a read-only device
2827 */
2828 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2829                const uint8_t *buf, int nb_sectors)
2830 {
2831     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2832 }
2833 
2834 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2835                       int nb_sectors, BdrvRequestFlags flags)
2836 {
2837     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2838                       BDRV_REQ_ZERO_WRITE | flags);
2839 }
2840 
2841 /*
2842  * Completely zero out a block device with the help of bdrv_write_zeroes.
2843  * The operation is sped up by checking the block status and only writing
2844  * zeroes to regions that do not already read back as zeroes. Optional
2845  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2846  *
2847  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2848  */
2849 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2850 {
2851     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2852     int n;
2853 
2854     target_sectors = bdrv_nb_sectors(bs);
2855     if (target_sectors < 0) {
2856         return target_sectors;
2857     }
2858 
2859     for (;;) {
2860         nb_sectors = target_sectors - sector_num;
2861         if (nb_sectors <= 0) {
2862             return 0;
2863         }
2864         if (nb_sectors > INT_MAX) {
2865             nb_sectors = INT_MAX;
2866         }
2867         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2868         if (ret < 0) {
2869             error_report("error getting block status at sector %" PRId64 ": %s",
2870                          sector_num, strerror(-ret));
2871             return ret;
2872         }
2873         if (ret & BDRV_BLOCK_ZERO) {
2874             sector_num += n;
2875             continue;
2876         }
2877         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2878         if (ret < 0) {
2879             error_report("error writing zeroes at sector %" PRId64 ": %s",
2880                          sector_num, strerror(-ret));
2881             return ret;
2882         }
2883         sector_num += n;
2884     }
2885 }
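
/* Hedged usage sketch: zero a whole device, letting drivers unmap/discard
 * ranges instead of writing literal zeroes where they are able to. */
static int example_zero_device(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}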
2886 
2887 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2888 {
2889     QEMUIOVector qiov;
2890     struct iovec iov = {
2891         .iov_base = (void *)buf,
2892         .iov_len = bytes,
2893     };
2894     int ret;
2895 
2896     if (bytes < 0) {
2897         return -EINVAL;
2898     }
2899 
2900     qemu_iovec_init_external(&qiov, &iov, 1);
2901     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2902     if (ret < 0) {
2903         return ret;
2904     }
2905 
2906     return bytes;
2907 }
2908 
2909 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2910 {
2911     int ret;
2912 
2913     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2914     if (ret < 0) {
2915         return ret;
2916     }
2917 
2918     return qiov->size;
2919 }
2920 
2921 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2922                 const void *buf, int bytes)
2923 {
2924     QEMUIOVector qiov;
2925     struct iovec iov = {
2926         .iov_base   = (void *) buf,
2927         .iov_len    = bytes,
2928     };
2929 
2930     if (bytes < 0) {
2931         return -EINVAL;
2932     }
2933 
2934     qemu_iovec_init_external(&qiov, &iov, 1);
2935     return bdrv_pwritev(bs, offset, &qiov);
2936 }
2937 
2938 /*
2939  * Writes to the file and ensures that no writes are reordered across this
2940  * request (acts as a barrier)
2941  *
2942  * Returns 0 on success, -errno in error cases.
2943  */
2944 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2945     const void *buf, int count)
2946 {
2947     int ret;
2948 
2949     ret = bdrv_pwrite(bs, offset, buf, count);
2950     if (ret < 0) {
2951         return ret;
2952     }
2953 
2954     /* No flush needed for cache modes that already do it */
2955     if (bs->enable_write_cache) {
2956         bdrv_flush(bs);
2957     }
2958 
2959     return 0;
2960 }
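
/*
 * Hedged sketch of a typical bdrv_pwrite_sync() caller: a read-modify-write
 * of a metadata block that must hit the disk before any later write.  The
 * feature-bit update is hypothetical.
 */
static int example_rmw_metadata(BlockDriverState *bs, int64_t offset,
                                uint8_t *buf, int len)
{
    int ret = bdrv_pread(bs, offset, buf, len);
    if (ret < 0) {
        return ret;
    }
    buf[0] |= 0x1; /* flip a hypothetical feature bit */

    /* Barrier semantics: no later write may be reordered before this one */
    return bdrv_pwrite_sync(bs, offset, buf, len);
}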
2961 
2962 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2963         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2964 {
2965     /* Perform I/O through a temporary buffer so that users who scribble over
2966      * their read buffer while the operation is in progress do not end up
2967      * modifying the image file.  This is critical for zero-copy guest I/O
2968      * where anything might happen inside guest memory.
2969      */
2970     void *bounce_buffer;
2971 
2972     BlockDriver *drv = bs->drv;
2973     struct iovec iov;
2974     QEMUIOVector bounce_qiov;
2975     int64_t cluster_sector_num;
2976     int cluster_nb_sectors;
2977     size_t skip_bytes;
2978     int ret;
2979 
2980     /* Cover entire cluster so no additional backing file I/O is required when
2981      * a cluster is allocated in the image file.
2982      */
2983     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2984                            &cluster_sector_num, &cluster_nb_sectors);
2985 
2986     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2987                                    cluster_sector_num, cluster_nb_sectors);
2988 
2989     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2990     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2991     if (bounce_buffer == NULL) {
2992         ret = -ENOMEM;
2993         goto err;
2994     }
2995 
2996     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2997 
2998     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2999                              &bounce_qiov);
3000     if (ret < 0) {
3001         goto err;
3002     }
3003 
3004     if (drv->bdrv_co_write_zeroes &&
3005         buffer_is_zero(bounce_buffer, iov.iov_len)) {
3006         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3007                                       cluster_nb_sectors, 0);
3008     } else {
3009         /* This does not change the data on the disk, so it is not
3010          * necessary to flush even in cache=writethrough mode.
3011          */
3012         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3013                                   &bounce_qiov);
3014     }
3015 
3016     if (ret < 0) {
3017         /* It might be okay to ignore write errors for guest requests.  If this
3018          * is a deliberate copy-on-read then we don't want to ignore the error.
3019          * Simply report it in all cases.
3020          */
3021         goto err;
3022     }
3023 
3024     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3025     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3026                         nb_sectors * BDRV_SECTOR_SIZE);
3027 
3028 err:
3029     qemu_vfree(bounce_buffer);
3030     return ret;
3031 }
3032 
3033 /*
3034  * Forwards an already correctly aligned request to the BlockDriver. This
3035  * handles copy on read and zeroing after EOF; any other features must be
3036  * implemented by the caller.
3037  */
3038 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3039     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3040     int64_t align, QEMUIOVector *qiov, int flags)
3041 {
3042     BlockDriver *drv = bs->drv;
3043     int ret;
3044 
3045     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3046     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3047 
3048     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3049     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3050     assert(!qiov || bytes == qiov->size);
3051 
3052     /* Handle Copy on Read and associated serialisation */
3053     if (flags & BDRV_REQ_COPY_ON_READ) {
3054         /* If we touch the same cluster it counts as an overlap.  This
3055          * guarantees that allocating writes will be serialized and not race
3056          * with each other for the same cluster.  For example, in copy-on-read
3057          * it ensures that the CoR read and write operations are atomic and
3058          * guest writes cannot interleave between them. */
3059         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3060     }
3061 
3062     wait_serialising_requests(req);
3063 
3064     if (flags & BDRV_REQ_COPY_ON_READ) {
3065         int pnum;
3066 
3067         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3068         if (ret < 0) {
3069             goto out;
3070         }
3071 
3072         if (!ret || pnum != nb_sectors) {
3073             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3074             goto out;
3075         }
3076     }
3077 
3078     /* Forward the request to the BlockDriver */
3079     if (!(bs->zero_beyond_eof && bs->growable)) {
3080         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3081     } else {
3082         /* Read zeroes after EOF of growable BDSes */
3083         int64_t total_sectors, max_nb_sectors;
3084 
3085         total_sectors = bdrv_nb_sectors(bs);
3086         if (total_sectors < 0) {
3087             ret = total_sectors;
3088             goto out;
3089         }
3090 
3091         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3092                                   align >> BDRV_SECTOR_BITS);
3093         if (max_nb_sectors > 0) {
3094             QEMUIOVector local_qiov;
3095             size_t local_sectors;
3096 
3097             max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3098             local_sectors = MIN(max_nb_sectors, nb_sectors);
3099 
3100             qemu_iovec_init(&local_qiov, qiov->niov);
3101             qemu_iovec_concat(&local_qiov, qiov, 0,
3102                               local_sectors * BDRV_SECTOR_SIZE);
3103 
3104             ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3105                                      &local_qiov);
3106 
3107             qemu_iovec_destroy(&local_qiov);
3108         } else {
3109             ret = 0;
3110         }
3111 
3112         /* Reading beyond end of file is supposed to produce zeroes */
3113         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3114             uint64_t offset = MAX(0, total_sectors - sector_num);
3115             uint64_t bytes = (sector_num + nb_sectors - offset) *
3116                               BDRV_SECTOR_SIZE;
3117             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3118         }
3119     }
3120 
3121 out:
3122     return ret;
3123 }
3124 
3125 /*
3126  * Handle a read request in coroutine context
3127  */
3128 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3129     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3130     BdrvRequestFlags flags)
3131 {
3132     BlockDriver *drv = bs->drv;
3133     BdrvTrackedRequest req;
3134 
3135     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3136     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3137     uint8_t *head_buf = NULL;
3138     uint8_t *tail_buf = NULL;
3139     QEMUIOVector local_qiov;
3140     bool use_local_qiov = false;
3141     int ret;
3142 
3143     if (!drv) {
3144         return -ENOMEDIUM;
3145     }
3146     if (bdrv_check_byte_request(bs, offset, bytes)) {
3147         return -EIO;
3148     }
3149 
3150     if (bs->copy_on_read) {
3151         flags |= BDRV_REQ_COPY_ON_READ;
3152     }
3153 
3154     /* throttling disk I/O */
3155     if (bs->io_limits_enabled) {
3156         bdrv_io_limits_intercept(bs, bytes, false);
3157     }
3158 
3159     /* Align read if necessary by padding qiov */
3160     if (offset & (align - 1)) {
3161         head_buf = qemu_blockalign(bs, align);
3162         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3163         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3164         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3165         use_local_qiov = true;
3166 
3167         bytes += offset & (align - 1);
3168         offset = offset & ~(align - 1);
3169     }
3170 
3171     if ((offset + bytes) & (align - 1)) {
3172         if (!use_local_qiov) {
3173             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3174             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3175             use_local_qiov = true;
3176         }
3177         tail_buf = qemu_blockalign(bs, align);
3178         qemu_iovec_add(&local_qiov, tail_buf,
3179                        align - ((offset + bytes) & (align - 1)));
3180 
3181         bytes = ROUND_UP(bytes, align);
3182     }
3183 
3184     tracked_request_begin(&req, bs, offset, bytes, false);
3185     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3186                               use_local_qiov ? &local_qiov : qiov,
3187                               flags);
3188     tracked_request_end(&req);
3189 
3190     if (use_local_qiov) {
3191         qemu_iovec_destroy(&local_qiov);
3192         qemu_vfree(head_buf);
3193         qemu_vfree(tail_buf);
3194     }
3195 
3196     return ret;
3197 }
3198 
3199 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3200     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3201     BdrvRequestFlags flags)
3202 {
3203     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3204         return -EINVAL;
3205     }
3206 
3207     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3208                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3209 }
3210 
3211 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3212     int nb_sectors, QEMUIOVector *qiov)
3213 {
3214     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3215 
3216     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3217 }
3218 
3219 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3220     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3221 {
3222     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3223 
3224     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3225                             BDRV_REQ_COPY_ON_READ);
3226 }
3227 
3228 /* if no limit is specified in the BlockLimits use a default
3229  * of 32768 512-byte sectors (16 MiB) per request.
3230  */
3231 #define MAX_WRITE_ZEROES_DEFAULT 32768
3232 
3233 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3234     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3235 {
3236     BlockDriver *drv = bs->drv;
3237     QEMUIOVector qiov;
3238     struct iovec iov = {0};
3239     int ret = 0;
3240 
3241     int max_write_zeroes = bs->bl.max_write_zeroes ?
3242                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3243 
3244     while (nb_sectors > 0 && !ret) {
3245         int num = nb_sectors;
3246 
3247         /* Align request.  Block drivers can expect the "bulk" of the request
3248          * to be aligned.
3249          */
3250         if (bs->bl.write_zeroes_alignment
3251             && num > bs->bl.write_zeroes_alignment) {
3252             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3253                 /* Make a small request up to the first aligned sector.  */
3254                 num = bs->bl.write_zeroes_alignment;
3255                 num -= sector_num % bs->bl.write_zeroes_alignment;
3256             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3257                 /* Shorten the request to the last aligned sector.  num cannot
3258                  * underflow because num > bs->bl.write_zeroes_alignment.
3259                  */
3260                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3261             }
3262         }
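
        /* Worked example (hedged): with write_zeroes_alignment == 8, a
         * request for sectors [5, 29) is issued as [5, 8) (unaligned head),
         * then [8, 24) (aligned bulk), then [24, 29) (unaligned tail). */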
3263 
3264         /* limit request size */
3265         if (num > max_write_zeroes) {
3266             num = max_write_zeroes;
3267         }
3268 
3269         ret = -ENOTSUP;
3270         /* First try the efficient write zeroes operation */
3271         if (drv->bdrv_co_write_zeroes) {
3272             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3273         }
3274 
3275         if (ret == -ENOTSUP) {
3276             /* Fall back to bounce buffer if write zeroes is unsupported */
3277             iov.iov_len = num * BDRV_SECTOR_SIZE;
3278             if (iov.iov_base == NULL) {
3279                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3280                 if (iov.iov_base == NULL) {
3281                     ret = -ENOMEM;
3282                     goto fail;
3283                 }
3284                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3285             }
3286             qemu_iovec_init_external(&qiov, &iov, 1);
3287 
3288             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3289 
3290             /* Keep the bounce buffer around if it is big enough for
3291              * all future requests.
3292              */
3293             if (num < max_write_zeroes) {
3294                 qemu_vfree(iov.iov_base);
3295                 iov.iov_base = NULL;
3296             }
3297         }
3298 
3299         sector_num += num;
3300         nb_sectors -= num;
3301     }
3302 
3303 fail:
3304     qemu_vfree(iov.iov_base);
3305     return ret;
3306 }
3307 
3308 /*
3309  * Forwards an already correctly aligned write request to the BlockDriver.
3310  */
3311 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3312     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3313     QEMUIOVector *qiov, int flags)
3314 {
3315     BlockDriver *drv = bs->drv;
3316     bool waited;
3317     int ret;
3318 
3319     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3320     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3321 
3322     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3323     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3324     assert(!qiov || bytes == qiov->size);
3325 
3326     waited = wait_serialising_requests(req);
3327     assert(!waited || !req->serialising);
3328     assert(req->overlap_offset <= offset);
3329     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3330 
3331     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3332 
3333     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3334         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3335         qemu_iovec_is_zero(qiov)) {
3336         flags |= BDRV_REQ_ZERO_WRITE;
3337         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3338             flags |= BDRV_REQ_MAY_UNMAP;
3339         }
3340     }
3341 
3342     if (ret < 0) {
3343         /* Do nothing, write notifier decided to fail this request */
3344     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3345         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3346         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3347     } else {
3348         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3349         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3350     }
3351     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3352 
3353     if (ret == 0 && !bs->enable_write_cache) {
3354         ret = bdrv_co_flush(bs);
3355     }
3356 
3357     bdrv_set_dirty(bs, sector_num, nb_sectors);
3358 
3359     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3360         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3361     }
3362     if (bs->growable && ret >= 0) {
3363         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3364     }
3365 
3366     return ret;
3367 }
3368 
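/* Illustrative sketch, not part of this file: how a client (e.g. a backup
 * job) would hook the before_write_notifiers list that
 * bdrv_aligned_pwritev() fires above.  The example_* names are
 * hypothetical; NotifierWithReturn, BdrvTrackedRequest and
 * bdrv_add_before_write_notifier() are the real interfaces. */
typedef struct ExampleWriteGuard {
    NotifierWithReturn notifier; /* embedded so container_of() could be used
                                  * to recover the enclosing state */
} ExampleWriteGuard;

static int coroutine_fn example_before_write_notify(NotifierWithReturn *notifier,
                                                    void *opaque)
{
    BdrvTrackedRequest *req = opaque;

    /* A real client would inspect the tracked request here, e.g. to copy
     * out old data before it is overwritten.  Returning a negative errno
     * makes bdrv_aligned_pwritev() fail the request before it reaches the
     * driver; returning 0 lets it proceed. */
    (void)req;
    return 0;
}

static void example_install_write_guard(BlockDriverState *bs,
                                        ExampleWriteGuard *guard)
{
    guard->notifier.notify = example_before_write_notify;
    bdrv_add_before_write_notifier(bs, &guard->notifier);
}
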
3369 /*
3370  * Handle a write request in coroutine context
3371  */
3372 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3373     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3374     BdrvRequestFlags flags)
3375 {
3376     BdrvTrackedRequest req;
3377     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3378     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3379     uint8_t *head_buf = NULL;
3380     uint8_t *tail_buf = NULL;
3381     QEMUIOVector local_qiov;
3382     bool use_local_qiov = false;
3383     int ret;
3384 
3385     if (!bs->drv) {
3386         return -ENOMEDIUM;
3387     }
3388     if (bs->read_only) {
3389         return -EACCES;
3390     }
3391     if (bdrv_check_byte_request(bs, offset, bytes)) {
3392         return -EIO;
3393     }
3394 
3395     /* throttling disk I/O */
3396     if (bs->io_limits_enabled) {
3397         bdrv_io_limits_intercept(bs, bytes, true);
3398     }
3399 
3400     /*
3401      * Align write if necessary by performing a read-modify-write cycle.
3402      * Pad qiov with the read parts and be sure to have a tracked request not
3403      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3404      */
3405     tracked_request_begin(&req, bs, offset, bytes, true);
3406 
3407     if (offset & (align - 1)) {
3408         QEMUIOVector head_qiov;
3409         struct iovec head_iov;
3410 
3411         mark_request_serialising(&req, align);
3412         wait_serialising_requests(&req);
3413 
3414         head_buf = qemu_blockalign(bs, align);
3415         head_iov = (struct iovec) {
3416             .iov_base   = head_buf,
3417             .iov_len    = align,
3418         };
3419         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3420 
3421         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3422         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3423                                   align, &head_qiov, 0);
3424         if (ret < 0) {
3425             goto fail;
3426         }
3427         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3428 
3429         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3430         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3431         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3432         use_local_qiov = true;
3433 
3434         bytes += offset & (align - 1);
3435         offset = offset & ~(align - 1);
3436     }
3437 
3438     if ((offset + bytes) & (align - 1)) {
3439         QEMUIOVector tail_qiov;
3440         struct iovec tail_iov;
3441         size_t tail_bytes;
3442         bool waited;
3443 
3444         mark_request_serialising(&req, align);
3445         waited = wait_serialising_requests(&req);
3446         assert(!waited || !use_local_qiov);
3447 
3448         tail_buf = qemu_blockalign(bs, align);
3449         tail_iov = (struct iovec) {
3450             .iov_base   = tail_buf,
3451             .iov_len    = align,
3452         };
3453         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3454 
3455         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3456         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3457                                   align, &tail_qiov, 0);
3458         if (ret < 0) {
3459             goto fail;
3460         }
3461         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3462 
3463         if (!use_local_qiov) {
3464             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3465             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3466             use_local_qiov = true;
3467         }
3468 
3469         tail_bytes = (offset + bytes) & (align - 1);
3470         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3471 
3472         bytes = ROUND_UP(bytes, align);
3473     }
3474 
3475     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3476                                use_local_qiov ? &local_qiov : qiov,
3477                                flags);
3478 
3479 fail:
3480     tracked_request_end(&req);
3481 
3482     if (use_local_qiov) {
3483         qemu_iovec_destroy(&local_qiov);
3484     }
3485     qemu_vfree(head_buf);
3486     qemu_vfree(tail_buf);
3487 
3488     return ret;
3489 }
3490 
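/* Illustrative note, not part of the original logic: how the RMW padding
 * above plays out.  Assuming align == 4096, a 3000-byte write at offset
 * 1000 is widened as follows:
 *
 *   head padding:  offset & (align - 1)          = 1000 bytes read first
 *   new offset:    offset & ~(align - 1)         = 0
 *   grown bytes:   1000 + 3000                   = 4000
 *   tail padding:  align - (4000 & (align - 1))  = 96 bytes read and added
 *   final bytes:   ROUND_UP(4000, align)         = 4096
 *
 * so the driver receives one aligned 4096-byte write whose head and tail
 * carry the data read back during the RMW cycle. */
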
3491 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3492     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3493     BdrvRequestFlags flags)
3494 {
3495     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3496         return -EINVAL;
3497     }
3498 
3499     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3500                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3501 }
3502 
3503 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3504     int nb_sectors, QEMUIOVector *qiov)
3505 {
3506     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3507 
3508     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3509 }
3510 
3511 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3512                                       int64_t sector_num, int nb_sectors,
3513                                       BdrvRequestFlags flags)
3514 {
3515     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3516 
3517     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3518         flags &= ~BDRV_REQ_MAY_UNMAP;
3519     }
3520 
3521     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3522                              BDRV_REQ_ZERO_WRITE | flags);
3523 }
3524 
3525 /**
3526  * Truncate file to 'offset' bytes (needed only for file protocols)
3527  */
3528 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3529 {
3530     BlockDriver *drv = bs->drv;
3531     int ret;
3532     if (!drv)
3533         return -ENOMEDIUM;
3534     if (!drv->bdrv_truncate)
3535         return -ENOTSUP;
3536     if (bs->read_only)
3537         return -EACCES;
3538 
3539     ret = drv->bdrv_truncate(bs, offset);
3540     if (ret == 0) {
3541         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3542         bdrv_dev_resize_cb(bs);
3543     }
3544     return ret;
3545 }
3546 
3547 /**
3548  * Length of an allocated file in bytes. Sparse files are counted by the
3549  * actual allocated space. Returns < 0 on error or if unknown.
3550  */
3551 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3552 {
3553     BlockDriver *drv = bs->drv;
3554     if (!drv) {
3555         return -ENOMEDIUM;
3556     }
3557     if (drv->bdrv_get_allocated_file_size) {
3558         return drv->bdrv_get_allocated_file_size(bs);
3559     }
3560     if (bs->file) {
3561         return bdrv_get_allocated_file_size(bs->file);
3562     }
3563     return -ENOTSUP;
3564 }
3565 
3566 /**
3567  * Return number of sectors on success, -errno on error.
3568  */
3569 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3570 {
3571     BlockDriver *drv = bs->drv;
3572 
3573     if (!drv)
3574         return -ENOMEDIUM;
3575 
3576     if (drv->has_variable_length) {
3577         int ret = refresh_total_sectors(bs, bs->total_sectors);
3578         if (ret < 0) {
3579             return ret;
3580         }
3581     }
3582     return bs->total_sectors;
3583 }
3584 
3585 /**
3586  * Return length in bytes on success, -errno on error.
3587  * The length is always a multiple of BDRV_SECTOR_SIZE.
3588  */
3589 int64_t bdrv_getlength(BlockDriverState *bs)
3590 {
3591     int64_t ret = bdrv_nb_sectors(bs);
3592 
3593     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3594 }
3595 
3596 /* Return 0 as the number of sectors if no device is present or on error */
3597 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3598 {
3599     int64_t nb_sectors = bdrv_nb_sectors(bs);
3600 
3601     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3602 }
3603 
3604 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3605                        BlockdevOnError on_write_error)
3606 {
3607     bs->on_read_error = on_read_error;
3608     bs->on_write_error = on_write_error;
3609 }
3610 
3611 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3612 {
3613     return is_read ? bs->on_read_error : bs->on_write_error;
3614 }
3615 
3616 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3617 {
3618     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3619 
3620     switch (on_err) {
3621     case BLOCKDEV_ON_ERROR_ENOSPC:
3622         return (error == ENOSPC) ?
3623                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3624     case BLOCKDEV_ON_ERROR_STOP:
3625         return BLOCK_ERROR_ACTION_STOP;
3626     case BLOCKDEV_ON_ERROR_REPORT:
3627         return BLOCK_ERROR_ACTION_REPORT;
3628     case BLOCKDEV_ON_ERROR_IGNORE:
3629         return BLOCK_ERROR_ACTION_IGNORE;
3630     default:
3631         abort();
3632     }
3633 }
3634 
3635 /* This is done by device models because, while the block layer knows
3636  * about the error, it does not know whether an operation comes from
3637  * the device or the block layer (from a job, for example).
3638  */
3639 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3640                        bool is_read, int error)
3641 {
3642     assert(error >= 0);
3643 
3644     if (action == BLOCK_ERROR_ACTION_STOP) {
3645         /* First set the iostatus, so that "info block" returns an iostatus
3646          * that matches the events raised so far (an additional error iostatus
3647          * is fine, but not a lost one).
3648          */
3649         bdrv_iostatus_set_err(bs, error);
3650 
3651         /* Then raise the request to stop the VM and the event.
3652          * qemu_system_vmstop_request_prepare has two effects.  First,
3653          * it ensures that the STOP event always comes after the
3654          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3655          * can observe the STOP event and do a "cont" before the STOP
3656          * event is issued, the VM will not stop.  In this case, vm_start()
3657          * also ensures that the STOP/RESUME pair of events is emitted.
3658          */
3659         qemu_system_vmstop_request_prepare();
3660         qapi_event_send_block_io_error(bdrv_get_device_name(bs),
3661                                        is_read ? IO_OPERATION_TYPE_READ :
3662                                        IO_OPERATION_TYPE_WRITE,
3663                                        action, &error_abort);
3664         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3665     } else {
3666         qapi_event_send_block_io_error(bdrv_get_device_name(bs),
3667                                        is_read ? IO_OPERATION_TYPE_READ :
3668                                        IO_OPERATION_TYPE_WRITE,
3669                                        action, &error_abort);
3670     }
3671 }
3672 
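/* Illustrative sketch, not part of this file: the calling convention the
 * comment above describes, as a device model would use it.  The function
 * name is hypothetical; the two bdrv_* calls are the real API. */
static bool example_handle_io_error(BlockDriverState *bs, bool is_read,
                                    int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* a real device would queue the failed request here so that it can
         * be retried once the VM is resumed */
    }
    bdrv_error_action(bs, action, is_read, error);
    return action != BLOCK_ERROR_ACTION_IGNORE;
}
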
3673 int bdrv_is_read_only(BlockDriverState *bs)
3674 {
3675     return bs->read_only;
3676 }
3677 
3678 int bdrv_is_sg(BlockDriverState *bs)
3679 {
3680     return bs->sg;
3681 }
3682 
3683 int bdrv_enable_write_cache(BlockDriverState *bs)
3684 {
3685     return bs->enable_write_cache;
3686 }
3687 
3688 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3689 {
3690     bs->enable_write_cache = wce;
3691 
3692     /* so a reopen() will preserve wce */
3693     if (wce) {
3694         bs->open_flags |= BDRV_O_CACHE_WB;
3695     } else {
3696         bs->open_flags &= ~BDRV_O_CACHE_WB;
3697     }
3698 }
3699 
3700 int bdrv_is_encrypted(BlockDriverState *bs)
3701 {
3702     if (bs->backing_hd && bs->backing_hd->encrypted)
3703         return 1;
3704     return bs->encrypted;
3705 }
3706 
3707 int bdrv_key_required(BlockDriverState *bs)
3708 {
3709     BlockDriverState *backing_hd = bs->backing_hd;
3710 
3711     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3712         return 1;
3713     return (bs->encrypted && !bs->valid_key);
3714 }
3715 
3716 int bdrv_set_key(BlockDriverState *bs, const char *key)
3717 {
3718     int ret;
3719     if (bs->backing_hd && bs->backing_hd->encrypted) {
3720         ret = bdrv_set_key(bs->backing_hd, key);
3721         if (ret < 0)
3722             return ret;
3723         if (!bs->encrypted)
3724             return 0;
3725     }
3726     if (!bs->encrypted) {
3727         return -EINVAL;
3728     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3729         return -ENOMEDIUM;
3730     }
3731     ret = bs->drv->bdrv_set_key(bs, key);
3732     if (ret < 0) {
3733         bs->valid_key = 0;
3734     } else if (!bs->valid_key) {
3735         bs->valid_key = 1;
3736         /* call the change callback now, we skipped it on open */
3737         bdrv_dev_change_media_cb(bs, true);
3738     }
3739     return ret;
3740 }
3741 
3742 const char *bdrv_get_format_name(BlockDriverState *bs)
3743 {
3744     return bs->drv ? bs->drv->format_name : NULL;
3745 }
3746 
3747 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3748                          void *opaque)
3749 {
3750     BlockDriver *drv;
3751     int count = 0;
3752     const char **formats = NULL;
3753 
3754     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3755         if (drv->format_name) {
3756             bool found = false;
3757             int i = count;
3758             while (formats && i && !found) {
3759                 found = !strcmp(formats[--i], drv->format_name);
3760             }
3761 
3762             if (!found) {
3763                 formats = g_renew(const char *, formats, count + 1);
3764                 formats[count++] = drv->format_name;
3765                 it(opaque, drv->format_name);
3766             }
3767         }
3768     }
3769     g_free(formats);
3770 }
3771 
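/* Illustrative sketch, not part of this file: a minimal caller of
 * bdrv_iterate_format().  The example_* names are hypothetical. */
static void example_print_format(void *opaque, const char *name)
{
    fprintf(opaque, " %s", name);
}

static void example_list_formats(FILE *out)
{
    fprintf(out, "Supported formats:");
    bdrv_iterate_format(example_print_format, out);
    fprintf(out, "\n");
}
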
3772 /* Find a block backend by its device name */
3773 BlockDriverState *bdrv_find(const char *name)
3774 {
3775     BlockDriverState *bs;
3776 
3777     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3778         if (!strcmp(name, bs->device_name)) {
3779             return bs;
3780         }
3781     }
3782     return NULL;
3783 }
3784 
3785 /* Find a named node in the graph of BlockDriverStates */
3786 BlockDriverState *bdrv_find_node(const char *node_name)
3787 {
3788     BlockDriverState *bs;
3789 
3790     assert(node_name);
3791 
3792     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3793         if (!strcmp(node_name, bs->node_name)) {
3794             return bs;
3795         }
3796     }
3797     return NULL;
3798 }
3799 
3800 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3801 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3802 {
3803     BlockDeviceInfoList *list, *entry;
3804     BlockDriverState *bs;
3805 
3806     list = NULL;
3807     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3808         entry = g_malloc0(sizeof(*entry));
3809         entry->value = bdrv_block_device_info(bs);
3810         entry->next = list;
3811         list = entry;
3812     }
3813 
3814     return list;
3815 }
3816 
3817 BlockDriverState *bdrv_lookup_bs(const char *device,
3818                                  const char *node_name,
3819                                  Error **errp)
3820 {
3821     BlockDriverState *bs = NULL;
3822 
3823     if (device) {
3824         bs = bdrv_find(device);
3825 
3826         if (bs) {
3827             return bs;
3828         }
3829     }
3830 
3831     if (node_name) {
3832         bs = bdrv_find_node(node_name);
3833 
3834         if (bs) {
3835             return bs;
3836         }
3837     }
3838 
3839     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3840                      device ? device : "",
3841                      node_name ? node_name : "");
3842     return NULL;
3843 }
3844 
3845 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3846  * return false.  If either argument is NULL, return false. */
3847 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3848 {
3849     while (top && top != base) {
3850         top = top->backing_hd;
3851     }
3852 
3853     return top != NULL;
3854 }
3855 
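/* Illustrative note: given the chain base <- mid <- top (top->backing_hd is
 * mid, mid->backing_hd is base), bdrv_chain_contains(top, base) and
 * bdrv_chain_contains(top, mid) return true, while
 * bdrv_chain_contains(mid, top) returns false, because the walk only
 * follows backing_hd links downwards. */
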
3856 BlockDriverState *bdrv_next(BlockDriverState *bs)
3857 {
3858     if (!bs) {
3859         return QTAILQ_FIRST(&bdrv_states);
3860     }
3861     return QTAILQ_NEXT(bs, device_list);
3862 }
3863 
3864 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3865 {
3866     BlockDriverState *bs;
3867 
3868     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3869         it(opaque, bs);
3870     }
3871 }
3872 
3873 const char *bdrv_get_device_name(BlockDriverState *bs)
3874 {
3875     return bs->device_name;
3876 }
3877 
3878 int bdrv_get_flags(BlockDriverState *bs)
3879 {
3880     return bs->open_flags;
3881 }
3882 
3883 int bdrv_flush_all(void)
3884 {
3885     BlockDriverState *bs;
3886     int result = 0;
3887 
3888     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3889         AioContext *aio_context = bdrv_get_aio_context(bs);
3890         int ret;
3891 
3892         aio_context_acquire(aio_context);
3893         ret = bdrv_flush(bs);
3894         if (ret < 0 && !result) {
3895             result = ret;
3896         }
3897         aio_context_release(aio_context);
3898     }
3899 
3900     return result;
3901 }
3902 
3903 int bdrv_has_zero_init_1(BlockDriverState *bs)
3904 {
3905     return 1;
3906 }
3907 
3908 int bdrv_has_zero_init(BlockDriverState *bs)
3909 {
3910     assert(bs->drv);
3911 
3912     /* If BS is a copy-on-write image, it is initialized to
3913        the contents of the base image, which may not be zeroes.  */
3914     if (bs->backing_hd) {
3915         return 0;
3916     }
3917     if (bs->drv->bdrv_has_zero_init) {
3918         return bs->drv->bdrv_has_zero_init(bs);
3919     }
3920 
3921     /* safe default */
3922     return 0;
3923 }
3924 
3925 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3926 {
3927     BlockDriverInfo bdi;
3928 
3929     if (bs->backing_hd) {
3930         return false;
3931     }
3932 
3933     if (bdrv_get_info(bs, &bdi) == 0) {
3934         return bdi.unallocated_blocks_are_zero;
3935     }
3936 
3937     return false;
3938 }
3939 
3940 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3941 {
3942     BlockDriverInfo bdi;
3943 
3944     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3945         return false;
3946     }
3947 
3948     if (bdrv_get_info(bs, &bdi) == 0) {
3949         return bdi.can_write_zeroes_with_unmap;
3950     }
3951 
3952     return false;
3953 }
3954 
3955 typedef struct BdrvCoGetBlockStatusData {
3956     BlockDriverState *bs;
3957     BlockDriverState *base;
3958     int64_t sector_num;
3959     int nb_sectors;
3960     int *pnum;
3961     int64_t ret;
3962     bool done;
3963 } BdrvCoGetBlockStatusData;
3964 
3965 /*
3966  * Returns the allocation status of the specified sector. Drivers that do
3967  * not implement the functionality are assumed not to support backing files,
3968  * hence all their sectors are reported as allocated.
3969  *
3970  * If 'sector_num' is beyond the end of the disk image the return value is 0
3971  * and 'pnum' is set to 0.
3972  *
3973  * 'pnum' is set to the number of sectors (including and immediately following
3974  * the specified sector) that are known to be in the same
3975  * allocated/unallocated state.
3976  *
3977  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3978  * beyond the end of the disk image it will be clamped.
3979  */
3980 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3981                                                      int64_t sector_num,
3982                                                      int nb_sectors, int *pnum)
3983 {
3984     int64_t total_sectors;
3985     int64_t n;
3986     int64_t ret, ret2;
3987 
3988     total_sectors = bdrv_nb_sectors(bs);
3989     if (total_sectors < 0) {
3990         return total_sectors;
3991     }
3992 
3993     if (sector_num >= total_sectors) {
3994         *pnum = 0;
3995         return 0;
3996     }
3997 
3998     n = total_sectors - sector_num;
3999     if (n < nb_sectors) {
4000         nb_sectors = n;
4001     }
4002 
4003     if (!bs->drv->bdrv_co_get_block_status) {
4004         *pnum = nb_sectors;
4005         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
4006         if (bs->drv->protocol_name) {
4007             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4008         }
4009         return ret;
4010     }
4011 
4012     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4013     if (ret < 0) {
4014         *pnum = 0;
4015         return ret;
4016     }
4017 
4018     if (ret & BDRV_BLOCK_RAW) {
4019         assert(ret & BDRV_BLOCK_OFFSET_VALID);
4020         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4021                                      *pnum, pnum);
4022     }
4023 
4024     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4025         ret |= BDRV_BLOCK_ALLOCATED;
4026     }
4027 
4028     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4029         if (bdrv_unallocated_blocks_are_zero(bs)) {
4030             ret |= BDRV_BLOCK_ZERO;
4031         } else if (bs->backing_hd) {
4032             BlockDriverState *bs2 = bs->backing_hd;
4033             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4034             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4035                 ret |= BDRV_BLOCK_ZERO;
4036             }
4037         }
4038     }
4039 
4040     if (bs->file &&
4041         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4042         (ret & BDRV_BLOCK_OFFSET_VALID)) {
4043         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4044                                         *pnum, pnum);
4045         if (ret2 >= 0) {
4046             /* Ignore errors.  This is just providing extra information;
4047              * it is useful but not necessary.
4048              */
4049             ret |= (ret2 & BDRV_BLOCK_ZERO);
4050         }
4051     }
4052 
4053     return ret;
4054 }
4055 
4056 /* Coroutine wrapper for bdrv_get_block_status() */
4057 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4058 {
4059     BdrvCoGetBlockStatusData *data = opaque;
4060     BlockDriverState *bs = data->bs;
4061 
4062     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4063                                          data->pnum);
4064     data->done = true;
4065 }
4066 
4067 /*
4068  * Synchronous wrapper around bdrv_co_get_block_status().
4069  *
4070  * See bdrv_co_get_block_status() for details.
4071  */
4072 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4073                               int nb_sectors, int *pnum)
4074 {
4075     Coroutine *co;
4076     BdrvCoGetBlockStatusData data = {
4077         .bs = bs,
4078         .sector_num = sector_num,
4079         .nb_sectors = nb_sectors,
4080         .pnum = pnum,
4081         .done = false,
4082     };
4083 
4084     if (qemu_in_coroutine()) {
4085         /* Fast-path if already in coroutine context */
4086         bdrv_get_block_status_co_entry(&data);
4087     } else {
4088         AioContext *aio_context = bdrv_get_aio_context(bs);
4089 
4090         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4091         qemu_coroutine_enter(co, &data);
4092         while (!data.done) {
4093             aio_poll(aio_context, true);
4094         }
4095     }
4096     return data.ret;
4097 }
4098 
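/* Illustrative sketch, not part of this file: walking an image's allocation
 * map with the synchronous wrapper above.  The function name is
 * hypothetical and error handling is deliberately minimal. */
static void example_dump_block_status(BlockDriverState *bs)
{
    int64_t total = bdrv_nb_sectors(bs);
    int64_t sector_num = 0;
    int pnum;

    while (total >= 0 && sector_num < total) {
        int64_t n = total - sector_num;
        int64_t ret = bdrv_get_block_status(bs, sector_num,
                                            n > INT_MAX ? INT_MAX : n, &pnum);
        if (ret < 0 || pnum == 0) {
            break;
        }
        printf("%10" PRId64 ": %s%s\n", sector_num,
               (ret & BDRV_BLOCK_ALLOCATED) ? "allocated" : "unallocated",
               (ret & BDRV_BLOCK_ZERO) ? " (reads as zero)" : "");
        sector_num += pnum;
    }
}
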
4099 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4100                                    int nb_sectors, int *pnum)
4101 {
4102     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4103     if (ret < 0) {
4104         return ret;
4105     }
4106     return !!(ret & BDRV_BLOCK_ALLOCATED);
4107 }
4108 
4109 /*
4110  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4111  *
4112  * Return true if the given sector is allocated in any image between
4113  * BASE and TOP (inclusive of TOP, exclusive of BASE).  BASE can be NULL
4114  * to check the whole chain.  Return false otherwise.
4115  *
4116  * 'pnum' is set to the number of sectors (including and immediately following
4117  *  the specified sector) that are known to be in the same
4118  *  allocated/unallocated state.
4119  *
4120  */
4121 int bdrv_is_allocated_above(BlockDriverState *top,
4122                             BlockDriverState *base,
4123                             int64_t sector_num,
4124                             int nb_sectors, int *pnum)
4125 {
4126     BlockDriverState *intermediate;
4127     int ret, n = nb_sectors;
4128 
4129     intermediate = top;
4130     while (intermediate && intermediate != base) {
4131         int pnum_inter;
4132         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4133                                 &pnum_inter);
4134         if (ret < 0) {
4135             return ret;
4136         } else if (ret) {
4137             *pnum = pnum_inter;
4138             return 1;
4139         }
4140 
4141         /*
4142          * [sector_num, nb_sectors] is unallocated on top but an intermediate
4143          * image might have
4144          *
4145          * [sector_num+x, nb_sectors] allocated.
4146          */
4147         if (n > pnum_inter &&
4148             (intermediate == top ||
4149              sector_num + pnum_inter < intermediate->total_sectors)) {
4150             n = pnum_inter;
4151         }
4152 
4153         intermediate = intermediate->backing_hd;
4154     }
4155 
4156     *pnum = n;
4157     return 0;
4158 }
4159 
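/* Illustrative note: a commit-style operation copying data from TOP down
 * towards BASE would typically drive the helper above like this:
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num, n, &pnum);
 *     if (ret < 0)  { ... fail ... }
 *     else if (ret) { ... the next pnum sectors must be copied ... }
 *     else          { ... the next pnum sectors can be skipped ... }
 *     sector_num += pnum;
 */
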
4160 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4161 {
4162     if (bs->backing_hd && bs->backing_hd->encrypted)
4163         return bs->backing_file;
4164     else if (bs->encrypted)
4165         return bs->filename;
4166     else
4167         return NULL;
4168 }
4169 
4170 void bdrv_get_backing_filename(BlockDriverState *bs,
4171                                char *filename, int filename_size)
4172 {
4173     pstrcpy(filename, filename_size, bs->backing_file);
4174 }
4175 
4176 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4177                           const uint8_t *buf, int nb_sectors)
4178 {
4179     BlockDriver *drv = bs->drv;
4180     if (!drv)
4181         return -ENOMEDIUM;
4182     if (!drv->bdrv_write_compressed)
4183         return -ENOTSUP;
4184     if (bdrv_check_request(bs, sector_num, nb_sectors))
4185         return -EIO;
4186 
4187     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4188 
4189     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4190 }
4191 
4192 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4193 {
4194     BlockDriver *drv = bs->drv;
4195     if (!drv)
4196         return -ENOMEDIUM;
4197     if (!drv->bdrv_get_info)
4198         return -ENOTSUP;
4199     memset(bdi, 0, sizeof(*bdi));
4200     return drv->bdrv_get_info(bs, bdi);
4201 }
4202 
4203 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4204 {
4205     BlockDriver *drv = bs->drv;
4206     if (drv && drv->bdrv_get_specific_info) {
4207         return drv->bdrv_get_specific_info(bs);
4208     }
4209     return NULL;
4210 }
4211 
4212 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4213                       int64_t pos, int size)
4214 {
4215     QEMUIOVector qiov;
4216     struct iovec iov = {
4217         .iov_base   = (void *) buf,
4218         .iov_len    = size,
4219     };
4220 
4221     qemu_iovec_init_external(&qiov, &iov, 1);
4222     return bdrv_writev_vmstate(bs, &qiov, pos);
4223 }
4224 
4225 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4226 {
4227     BlockDriver *drv = bs->drv;
4228 
4229     if (!drv) {
4230         return -ENOMEDIUM;
4231     } else if (drv->bdrv_save_vmstate) {
4232         return drv->bdrv_save_vmstate(bs, qiov, pos);
4233     } else if (bs->file) {
4234         return bdrv_writev_vmstate(bs->file, qiov, pos);
4235     }
4236 
4237     return -ENOTSUP;
4238 }
4239 
4240 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4241                       int64_t pos, int size)
4242 {
4243     BlockDriver *drv = bs->drv;
4244     if (!drv)
4245         return -ENOMEDIUM;
4246     if (drv->bdrv_load_vmstate)
4247         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4248     if (bs->file)
4249         return bdrv_load_vmstate(bs->file, buf, pos, size);
4250     return -ENOTSUP;
4251 }
4252 
4253 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4254 {
4255     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4256         return;
4257     }
4258 
4259     bs->drv->bdrv_debug_event(bs, event);
4260 }
4261 
4262 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4263                           const char *tag)
4264 {
4265     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4266         bs = bs->file;
4267     }
4268 
4269     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4270         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4271     }
4272 
4273     return -ENOTSUP;
4274 }
4275 
4276 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4277 {
4278     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4279         bs = bs->file;
4280     }
4281 
4282     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4283         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4284     }
4285 
4286     return -ENOTSUP;
4287 }
4288 
4289 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4290 {
4291     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4292         bs = bs->file;
4293     }
4294 
4295     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4296         return bs->drv->bdrv_debug_resume(bs, tag);
4297     }
4298 
4299     return -ENOTSUP;
4300 }
4301 
4302 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4303 {
4304     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4305         bs = bs->file;
4306     }
4307 
4308     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4309         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4310     }
4311 
4312     return false;
4313 }
4314 
4315 int bdrv_is_snapshot(BlockDriverState *bs)
4316 {
4317     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4318 }
4319 
4320 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4321  * relative, it must be relative to the chain.  So, passing in bs->filename
4322  * from a BDS as backing_file should not be done, as that may be relative to
4323  * the CWD rather than the chain. */
4324 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4325         const char *backing_file)
4326 {
4327     char *filename_full = NULL;
4328     char *backing_file_full = NULL;
4329     char *filename_tmp = NULL;
4330     int is_protocol = 0;
4331     BlockDriverState *curr_bs = NULL;
4332     BlockDriverState *retval = NULL;
4333 
4334     if (!bs || !bs->drv || !backing_file) {
4335         return NULL;
4336     }
4337 
4338     filename_full     = g_malloc(PATH_MAX);
4339     backing_file_full = g_malloc(PATH_MAX);
4340     filename_tmp      = g_malloc(PATH_MAX);
4341 
4342     is_protocol = path_has_protocol(backing_file);
4343 
4344     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4345 
4346         /* If either of the filename paths is actually a protocol, then
4347          * compare unmodified paths; otherwise make paths relative */
4348         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4349             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4350                 retval = curr_bs->backing_hd;
4351                 break;
4352             }
4353         } else {
4354             /* If not an absolute filename path, make it relative to the current
4355              * image's filename path */
4356             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4357                          backing_file);
4358 
4359             /* We are going to compare absolute pathnames */
4360             if (!realpath(filename_tmp, filename_full)) {
4361                 continue;
4362             }
4363 
4364             /* We need to make sure the backing filename we are comparing against
4365              * is relative to the current image filename (or absolute) */
4366             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4367                          curr_bs->backing_file);
4368 
4369             if (!realpath(filename_tmp, backing_file_full)) {
4370                 continue;
4371             }
4372 
4373             if (strcmp(backing_file_full, filename_full) == 0) {
4374                 retval = curr_bs->backing_hd;
4375                 break;
4376             }
4377         }
4378     }
4379 
4380     g_free(filename_full);
4381     g_free(backing_file_full);
4382     g_free(filename_tmp);
4383     return retval;
4384 }
4385 
4386 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4387 {
4388     if (!bs->drv) {
4389         return 0;
4390     }
4391 
4392     if (!bs->backing_hd) {
4393         return 0;
4394     }
4395 
4396     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4397 }
4398 
4399 /**************************************************************/
4400 /* async I/Os */
4401 
4402 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4403                                  QEMUIOVector *qiov, int nb_sectors,
4404                                  BlockDriverCompletionFunc *cb, void *opaque)
4405 {
4406     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4407 
4408     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4409                                  cb, opaque, false);
4410 }
4411 
4412 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4413                                   QEMUIOVector *qiov, int nb_sectors,
4414                                   BlockDriverCompletionFunc *cb, void *opaque)
4415 {
4416     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4417 
4418     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4419                                  cb, opaque, true);
4420 }
4421 
4422 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4423         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4424         BlockDriverCompletionFunc *cb, void *opaque)
4425 {
4426     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4427 
4428     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4429                                  BDRV_REQ_ZERO_WRITE | flags,
4430                                  cb, opaque, true);
4431 }
4432 
4433 
4434 typedef struct MultiwriteCB {
4435     int error;
4436     int num_requests;
4437     int num_callbacks;
4438     struct {
4439         BlockDriverCompletionFunc *cb;
4440         void *opaque;
4441         QEMUIOVector *free_qiov;
4442     } callbacks[];
4443 } MultiwriteCB;
4444 
4445 static void multiwrite_user_cb(MultiwriteCB *mcb)
4446 {
4447     int i;
4448 
4449     for (i = 0; i < mcb->num_callbacks; i++) {
4450         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4451         if (mcb->callbacks[i].free_qiov) {
4452             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4453         }
4454         g_free(mcb->callbacks[i].free_qiov);
4455     }
4456 }
4457 
4458 static void multiwrite_cb(void *opaque, int ret)
4459 {
4460     MultiwriteCB *mcb = opaque;
4461 
4462     trace_multiwrite_cb(mcb, ret);
4463 
4464     if (ret < 0 && !mcb->error) {
4465         mcb->error = ret;
4466     }
4467 
4468     mcb->num_requests--;
4469     if (mcb->num_requests == 0) {
4470         multiwrite_user_cb(mcb);
4471         g_free(mcb);
4472     }
4473 }
4474 
4475 static int multiwrite_req_compare(const void *a, const void *b)
4476 {
4477     const BlockRequest *req1 = a, *req2 = b;
4478 
4479     /*
4480      * Note that we can't simply subtract req2->sector from req1->sector
4481      * here as that could overflow the return value.
4482      */
4483     if (req1->sector > req2->sector) {
4484         return 1;
4485     } else if (req1->sector < req2->sector) {
4486         return -1;
4487     } else {
4488         return 0;
4489     }
4490 }
4491 
4492 /*
4493  * Takes a bunch of requests and tries to merge them. Returns the number of
4494  * requests that remain after merging.
4495  */
4496 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4497     int num_reqs, MultiwriteCB *mcb)
4498 {
4499     int i, outidx;
4500 
4501     // Sort requests by start sector
4502     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4503 
4504     // Check if adjacent requests touch or overlap the same sectors. If so,
4505     // combine them into a single request.
4506     outidx = 0;
4507     for (i = 1; i < num_reqs; i++) {
4508         int merge = 0;
4509         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4510 
4511         // Handle exactly sequential writes and overlapping writes.
4512         if (reqs[i].sector <= oldreq_last) {
4513             merge = 1;
4514         }
4515 
4516         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4517             merge = 0;
4518         }
4519 
4520         if (merge) {
4521             size_t size;
4522             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4523             qemu_iovec_init(qiov,
4524                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4525 
4526             // Add the first request to the merged one. If the requests are
4527             // overlapping, drop the last sectors of the first request.
4528             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4529             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4530 
4531             // We shouldn't need to add any zeros between the two requests
4532             assert(reqs[i].sector <= oldreq_last);
4533 
4534             // Add the second request
4535             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4536 
4537             reqs[outidx].nb_sectors = qiov->size >> 9;
4538             reqs[outidx].qiov = qiov;
4539 
4540             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4541         } else {
4542             outidx++;
4543             reqs[outidx].sector     = reqs[i].sector;
4544             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4545             reqs[outidx].qiov       = reqs[i].qiov;
4546         }
4547     }
4548 
4549     return outidx + 1;
4550 }
4551 
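/* Illustrative note, not part of the original logic: with two sorted
 * requests covering sectors [0, 8) and [8, 16), oldreq_last is 8 and
 * reqs[1].sector == 8 <= 8, so they merge into one 16-sector request whose
 * qiov concatenates both vectors.  For an overlapping pair [0, 8) and
 * [4, 12), only the first 4 sectors of the first request are kept
 * (size = (4 - 0) << 9) before the second request is appended, yielding a
 * 12-sector merged request. */
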
4552 /*
4553  * Submit multiple AIO write requests at once.
4554  *
4555  * On success, the function returns 0 and all requests in the reqs array have
4556  * been submitted. In the error case this function returns -1, and any of the
4557  * requests may or may not be submitted yet. In particular, this means that the
4558  * callback will be called for some of the requests, for others it won't. The
4559  * caller must check the error field of the BlockRequest to wait for the right
4560  * callbacks (if error != 0, no callback will be called).
4561  *
4562  * The implementation may modify the contents of the reqs array, e.g. to merge
4563  * requests. However, the fields opaque and error are left unmodified as they
4564  * are used to signal failure for a single request to the caller.
4565  */
4566 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4567 {
4568     MultiwriteCB *mcb;
4569     int i;
4570 
4571     /* don't submit writes if we don't have a medium */
4572     if (bs->drv == NULL) {
4573         for (i = 0; i < num_reqs; i++) {
4574             reqs[i].error = -ENOMEDIUM;
4575         }
4576         return -1;
4577     }
4578 
4579     if (num_reqs == 0) {
4580         return 0;
4581     }
4582 
4583     // Create MultiwriteCB structure
4584     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4585     mcb->num_requests = 0;
4586     mcb->num_callbacks = num_reqs;
4587 
4588     for (i = 0; i < num_reqs; i++) {
4589         mcb->callbacks[i].cb = reqs[i].cb;
4590         mcb->callbacks[i].opaque = reqs[i].opaque;
4591     }
4592 
4593     // Check for mergeable requests
4594     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4595 
4596     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4597 
4598     /* Run the aio requests. */
4599     mcb->num_requests = num_reqs;
4600     for (i = 0; i < num_reqs; i++) {
4601         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4602                               reqs[i].nb_sectors, reqs[i].flags,
4603                               multiwrite_cb, mcb,
4604                               true);
4605     }
4606 
4607     return 0;
4608 }
4609 
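/* Illustrative sketch, not part of this file: batching two writes through
 * bdrv_aio_multiwrite() under the contract documented above.  The function
 * name and the fixed sector offsets are hypothetical. */
static int example_submit_two_writes(BlockDriverState *bs,
                                     QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                                     BlockDriverCompletionFunc *cb,
                                     void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0,   .nb_sectors = qiov0->size >> BDRV_SECTOR_BITS,
          .qiov = qiov0, .cb = cb, .opaque = opaque },
        { .sector = 128, .nb_sectors = qiov1->size >> BDRV_SECTOR_BITS,
          .qiov = qiov1, .cb = cb, .opaque = opaque },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* per the contract above, cb only fires for requests whose error
         * field is still 0; the others already failed */
        return reqs[0].error ? reqs[0].error : reqs[1].error;
    }
    return 0;
}
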
4610 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4611 {
4612     acb->aiocb_info->cancel(acb);
4613 }
4614 
4615 /**************************************************************/
4616 /* async block device emulation */
4617 
4618 typedef struct BlockDriverAIOCBSync {
4619     BlockDriverAIOCB common;
4620     QEMUBH *bh;
4621     int ret;
4622     /* vector translation state */
4623     QEMUIOVector *qiov;
4624     uint8_t *bounce;
4625     int is_write;
4626 } BlockDriverAIOCBSync;
4627 
4628 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4629 {
4630     BlockDriverAIOCBSync *acb =
4631         container_of(blockacb, BlockDriverAIOCBSync, common);
4632     qemu_bh_delete(acb->bh);
4633     acb->bh = NULL;
4634     qemu_aio_release(acb);
4635 }
4636 
4637 static const AIOCBInfo bdrv_em_aiocb_info = {
4638     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4639     .cancel             = bdrv_aio_cancel_em,
4640 };
4641 
4642 static void bdrv_aio_bh_cb(void *opaque)
4643 {
4644     BlockDriverAIOCBSync *acb = opaque;
4645 
4646     if (!acb->is_write && acb->ret >= 0) {
4647         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4648     }
4649     qemu_vfree(acb->bounce);
4650     acb->common.cb(acb->common.opaque, acb->ret);
4651     qemu_bh_delete(acb->bh);
4652     acb->bh = NULL;
4653     qemu_aio_release(acb);
4654 }
4655 
4656 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4657                                             int64_t sector_num,
4658                                             QEMUIOVector *qiov,
4659                                             int nb_sectors,
4660                                             BlockDriverCompletionFunc *cb,
4661                                             void *opaque,
4662                                             int is_write)
4664 {
4665     BlockDriverAIOCBSync *acb;
4666 
4667     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4668     acb->is_write = is_write;
4669     acb->qiov = qiov;
4670     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4671     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4672 
4673     if (acb->bounce == NULL) {
4674         acb->ret = -ENOMEM;
4675     } else if (is_write) {
4676         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4677         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4678     } else {
4679         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4680     }
4681 
4682     qemu_bh_schedule(acb->bh);
4683 
4684     return &acb->common;
4685 }
4686 
4687 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4688         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4689         BlockDriverCompletionFunc *cb, void *opaque)
4690 {
4691     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4692 }
4693 
4694 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4695         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4696         BlockDriverCompletionFunc *cb, void *opaque)
4697 {
4698     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4699 }
4700 
4701 
4702 typedef struct BlockDriverAIOCBCoroutine {
4703     BlockDriverAIOCB common;
4704     BlockRequest req;
4705     bool is_write;
4706     bool *done;
4707     QEMUBH* bh;
4708 } BlockDriverAIOCBCoroutine;
4709 
4710 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4711 {
4712     AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
4713     BlockDriverAIOCBCoroutine *acb =
4714         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4715     bool done = false;
4716 
4717     acb->done = &done;
4718     while (!done) {
4719         aio_poll(aio_context, true);
4720     }
4721 }
4722 
4723 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4724     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4725     .cancel             = bdrv_aio_co_cancel_em,
4726 };
4727 
4728 static void bdrv_co_em_bh(void *opaque)
4729 {
4730     BlockDriverAIOCBCoroutine *acb = opaque;
4731 
4732     acb->common.cb(acb->common.opaque, acb->req.error);
4733 
4734     if (acb->done) {
4735         *acb->done = true;
4736     }
4737 
4738     qemu_bh_delete(acb->bh);
4739     qemu_aio_release(acb);
4740 }
4741 
4742 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4743 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4744 {
4745     BlockDriverAIOCBCoroutine *acb = opaque;
4746     BlockDriverState *bs = acb->common.bs;
4747 
4748     if (!acb->is_write) {
4749         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4750             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4751     } else {
4752         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4753             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4754     }
4755 
4756     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4757     qemu_bh_schedule(acb->bh);
4758 }
4759 
4760 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4761                                                int64_t sector_num,
4762                                                QEMUIOVector *qiov,
4763                                                int nb_sectors,
4764                                                BdrvRequestFlags flags,
4765                                                BlockDriverCompletionFunc *cb,
4766                                                void *opaque,
4767                                                bool is_write)
4768 {
4769     Coroutine *co;
4770     BlockDriverAIOCBCoroutine *acb;
4771 
4772     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4773     acb->req.sector = sector_num;
4774     acb->req.nb_sectors = nb_sectors;
4775     acb->req.qiov = qiov;
4776     acb->req.flags = flags;
4777     acb->is_write = is_write;
4778     acb->done = NULL;
4779 
4780     co = qemu_coroutine_create(bdrv_co_do_rw);
4781     qemu_coroutine_enter(co, acb);
4782 
4783     return &acb->common;
4784 }
4785 
4786 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4787 {
4788     BlockDriverAIOCBCoroutine *acb = opaque;
4789     BlockDriverState *bs = acb->common.bs;
4790 
4791     acb->req.error = bdrv_co_flush(bs);
4792     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4793     qemu_bh_schedule(acb->bh);
4794 }
4795 
4796 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4797         BlockDriverCompletionFunc *cb, void *opaque)
4798 {
4799     trace_bdrv_aio_flush(bs, opaque);
4800 
4801     Coroutine *co;
4802     BlockDriverAIOCBCoroutine *acb;
4803 
4804     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4805     acb->done = NULL;
4806 
4807     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4808     qemu_coroutine_enter(co, acb);
4809 
4810     return &acb->common;
4811 }
4812 
4813 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4814 {
4815     BlockDriverAIOCBCoroutine *acb = opaque;
4816     BlockDriverState *bs = acb->common.bs;
4817 
4818     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4819     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4820     qemu_bh_schedule(acb->bh);
4821 }
4822 
4823 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4824         int64_t sector_num, int nb_sectors,
4825         BlockDriverCompletionFunc *cb, void *opaque)
4826 {
4827     Coroutine *co;
4828     BlockDriverAIOCBCoroutine *acb;
4829 
4830     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4831 
4832     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4833     acb->req.sector = sector_num;
4834     acb->req.nb_sectors = nb_sectors;
4835     acb->done = NULL;
4836     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4837     qemu_coroutine_enter(co, acb);
4838 
4839     return &acb->common;
4840 }
4841 
4842 void bdrv_init(void)
4843 {
4844     module_call_init(MODULE_INIT_BLOCK);
4845 }
4846 
4847 void bdrv_init_with_whitelist(void)
4848 {
4849     use_bdrv_whitelist = 1;
4850     bdrv_init();
4851 }
4852 
4853 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4854                    BlockDriverCompletionFunc *cb, void *opaque)
4855 {
4856     BlockDriverAIOCB *acb;
4857 
4858     acb = g_slice_alloc(aiocb_info->aiocb_size);
4859     acb->aiocb_info = aiocb_info;
4860     acb->bs = bs;
4861     acb->cb = cb;
4862     acb->opaque = opaque;
4863     return acb;
4864 }
4865 
4866 void qemu_aio_release(void *p)
4867 {
4868     BlockDriverAIOCB *acb = p;
4869     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4870 }
4871 
4872 /**************************************************************/
4873 /* Coroutine block device emulation */
4874 
4875 typedef struct CoroutineIOCompletion {
4876     Coroutine *coroutine;
4877     int ret;
4878 } CoroutineIOCompletion;
4879 
4880 static void bdrv_co_io_em_complete(void *opaque, int ret)
4881 {
4882     CoroutineIOCompletion *co = opaque;
4883 
4884     co->ret = ret;
4885     qemu_coroutine_enter(co->coroutine, NULL);
4886 }
4887 
4888 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4889                                       int nb_sectors, QEMUIOVector *iov,
4890                                       bool is_write)
4891 {
4892     CoroutineIOCompletion co = {
4893         .coroutine = qemu_coroutine_self(),
4894     };
4895     BlockDriverAIOCB *acb;
4896 
4897     if (is_write) {
4898         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4899                                        bdrv_co_io_em_complete, &co);
4900     } else {
4901         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4902                                       bdrv_co_io_em_complete, &co);
4903     }
4904 
4905     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4906     if (!acb) {
4907         return -EIO;
4908     }
4909     qemu_coroutine_yield();
4910 
4911     return co.ret;
4912 }
4913 
4914 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4915                                          int64_t sector_num, int nb_sectors,
4916                                          QEMUIOVector *iov)
4917 {
4918     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4919 }
4920 
4921 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4922                                          int64_t sector_num, int nb_sectors,
4923                                          QEMUIOVector *iov)
4924 {
4925     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4926 }
4927 
4928 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4929 {
4930     RwCo *rwco = opaque;
4931 
4932     rwco->ret = bdrv_co_flush(rwco->bs);
4933 }
4934 
4935 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4936 {
4937     int ret;
4938 
4939     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4940         return 0;
4941     }
4942 
4943     /* Write back cached data to the OS even with cache=unsafe */
4944     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4945     if (bs->drv->bdrv_co_flush_to_os) {
4946         ret = bs->drv->bdrv_co_flush_to_os(bs);
4947         if (ret < 0) {
4948             return ret;
4949         }
4950     }
4951 
4952     /* But don't actually force it to the disk with cache=unsafe */
4953     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4954         goto flush_parent;
4955     }
4956 
4957     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4958     if (bs->drv->bdrv_co_flush_to_disk) {
4959         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4960     } else if (bs->drv->bdrv_aio_flush) {
4961         BlockDriverAIOCB *acb;
4962         CoroutineIOCompletion co = {
4963             .coroutine = qemu_coroutine_self(),
4964         };
4965 
4966         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4967         if (acb == NULL) {
4968             ret = -EIO;
4969         } else {
4970             qemu_coroutine_yield();
4971             ret = co.ret;
4972         }
4973     } else {
4974         /*
4975          * Some block drivers always operate in either writethrough or unsafe
4976          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4977          * know how the server works (because the behaviour is hardcoded or
4978          * depends on server-side configuration), so we can't ensure that
4979          * everything is safe on disk. Returning an error doesn't work because
4980          * that would break guests even if the server operates in writethrough
4981          * mode.
4982          *
4983          * Let's hope the user knows what he's doing.
4984          * Let's hope the user knows what they're doing.
4985         ret = 0;
4986     }
4987     if (ret < 0) {
4988         return ret;
4989     }
4990 
4991     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4992      * in the case of cache=unsafe, so there are no useless flushes.
4993      */
4994 flush_parent:
4995     return bdrv_co_flush(bs->file);
4996 }
4997 
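/* Illustrative note, hedged: for a typical format-over-protocol chain such
 * as qcow2 on top of a POSIX file, the cascade above roughly means that
 * the format driver first writes back its cached metadata (flush_to_os),
 * then forces its own data out if it implements flush_to_disk or
 * aio_flush, and finally the same procedure recurses into bs->file, where
 * the protocol driver typically issues the actual fdatasync(). */
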
4998 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4999 {
5000     Error *local_err = NULL;
5001     int ret;
5002 
5003     if (!bs->drv)  {
5004         return;
5005     }
5006 
5007     if (bs->drv->bdrv_invalidate_cache) {
5008         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5009     } else if (bs->file) {
5010         bdrv_invalidate_cache(bs->file, &local_err);
5011     }
5012     if (local_err) {
5013         error_propagate(errp, local_err);
5014         return;
5015     }
5016 
5017     ret = refresh_total_sectors(bs, bs->total_sectors);
5018     if (ret < 0) {
5019         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5020         return;
5021     }
5022 }
5023 
5024 void bdrv_invalidate_cache_all(Error **errp)
5025 {
5026     BlockDriverState *bs;
5027     Error *local_err = NULL;
5028 
5029     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5030         AioContext *aio_context = bdrv_get_aio_context(bs);
5031 
5032         aio_context_acquire(aio_context);
5033         bdrv_invalidate_cache(bs, &local_err);
5034         aio_context_release(aio_context);
5035         if (local_err) {
5036             error_propagate(errp, local_err);
5037             return;
5038         }
5039     }
5040 }
5041 
5042 void bdrv_clear_incoming_migration_all(void)
5043 {
5044     BlockDriverState *bs;
5045 
5046     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5047         AioContext *aio_context = bdrv_get_aio_context(bs);
5048 
5049         aio_context_acquire(aio_context);
5050         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
5051         aio_context_release(aio_context);
5052     }
5053 }
5054 
5055 int bdrv_flush(BlockDriverState *bs)
5056 {
5057     Coroutine *co;
5058     RwCo rwco = {
5059         .bs = bs,
5060         .ret = NOT_DONE,
5061     };
5062 
5063     if (qemu_in_coroutine()) {
5064         /* Fast-path if already in coroutine context */
5065         bdrv_flush_co_entry(&rwco);
5066     } else {
5067         AioContext *aio_context = bdrv_get_aio_context(bs);
5068 
5069         co = qemu_coroutine_create(bdrv_flush_co_entry);
5070         qemu_coroutine_enter(co, &rwco);
5071         while (rwco.ret == NOT_DONE) {
5072             aio_poll(aio_context, true);
5073         }
5074     }
5075 
5076     return rwco.ret;
5077 }
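
/*
 * bdrv_flush() shows the synchronous wrapper pattern used throughout this
 * file (bdrv_discard() below is built the same way): when already in
 * coroutine context, call the coroutine entry directly; otherwise spawn a
 * coroutine and poll its AioContext until the NOT_DONE sentinel has been
 * replaced with a real return value.  A minimal usage sketch from
 * non-coroutine context, assuming bs is an open BlockDriverState:
 *
 *     int ret = bdrv_flush(bs);
 *     if (ret < 0) {
 *         fprintf(stderr, "flush failed: %s\n", strerror(-ret));
 *     }
 */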
5078 
5079 typedef struct DiscardCo {
5080     BlockDriverState *bs;
5081     int64_t sector_num;
5082     int nb_sectors;
5083     int ret;
5084 } DiscardCo;
5085 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5086 {
5087     DiscardCo *rwco = opaque;
5088 
5089     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5090 }
5091 
5092 /* If no limit is specified in the BlockLimits, use a default
5093  * of 32768 512-byte sectors (16 MiB) per request.
5094  */
5095 #define MAX_DISCARD_DEFAULT 32768
5096 
5097 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5098                                  int nb_sectors)
5099 {
5100     int max_discard;
5101 
5102     if (!bs->drv) {
5103         return -ENOMEDIUM;
5104     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5105         return -EIO;
5106     } else if (bs->read_only) {
5107         return -EROFS;
5108     }
5109 
5110     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5111 
5112     /* Do nothing if disabled.  */
5113     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5114         return 0;
5115     }
5116 
5117     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5118         return 0;
5119     }
5120 
5121     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5122     while (nb_sectors > 0) {
5123         int ret;
5124         int num = nb_sectors;
5125 
5126         /* align request */
5127         if (bs->bl.discard_alignment &&
5128             num >= bs->bl.discard_alignment &&
5129             sector_num % bs->bl.discard_alignment) {
5130             if (num > bs->bl.discard_alignment) {
5131                 num = bs->bl.discard_alignment;
5132             }
5133             num -= sector_num % bs->bl.discard_alignment;
5134         }
5135 
5136         /* limit request size */
5137         if (num > max_discard) {
5138             num = max_discard;
5139         }
5140 
5141         if (bs->drv->bdrv_co_discard) {
5142             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5143         } else {
5144             BlockDriverAIOCB *acb;
5145             CoroutineIOCompletion co = {
5146                 .coroutine = qemu_coroutine_self(),
5147             };
5148 
5149             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5150                                             bdrv_co_io_em_complete, &co);
5151             if (acb == NULL) {
5152                 return -EIO;
5153             } else {
5154                 qemu_coroutine_yield();
5155                 ret = co.ret;
5156             }
5157         }
5158         if (ret && ret != -ENOTSUP) {
5159             return ret;
5160         }
5161 
5162         sector_num += num;
5163         nb_sectors -= num;
5164     }
5165     return 0;
5166 }
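
/*
 * A worked example of the splitting loop above, assuming
 * bs->bl.discard_alignment == 8 and bs->bl.max_discard == 0 (so
 * MAX_DISCARD_DEFAULT applies): for sector_num == 5 and nb_sectors == 100,
 * the first iteration clamps num to the alignment (8) and subtracts
 * 5 % 8 == 5, issuing a 3-sector discard that advances sector_num to the
 * aligned value 8; the second iteration then discards the remaining 97
 * sectors in a single aligned request, well under the 32768-sector default.
 */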
5167 
5168 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5169 {
5170     Coroutine *co;
5171     DiscardCo rwco = {
5172         .bs = bs,
5173         .sector_num = sector_num,
5174         .nb_sectors = nb_sectors,
5175         .ret = NOT_DONE,
5176     };
5177 
5178     if (qemu_in_coroutine()) {
5179         /* Fast-path if already in coroutine context */
5180         bdrv_discard_co_entry(&rwco);
5181     } else {
5182         AioContext *aio_context = bdrv_get_aio_context(bs);
5183 
5184         co = qemu_coroutine_create(bdrv_discard_co_entry);
5185         qemu_coroutine_enter(co, &rwco);
5186         while (rwco.ret == NOT_DONE) {
5187             aio_poll(aio_context, true);
5188         }
5189     }
5190 
5191     return rwco.ret;
5192 }
5193 
5194 /**************************************************************/
5195 /* removable device support */
5196 
5197 /**
5198  * Return TRUE if the media is present
5199  */
5200 int bdrv_is_inserted(BlockDriverState *bs)
5201 {
5202     BlockDriver *drv = bs->drv;
5203 
5204     if (!drv)
5205         return 0;
5206     if (!drv->bdrv_is_inserted)
5207         return 1;
5208     return drv->bdrv_is_inserted(bs);
5209 }
5210 
5211 /**
5212  * Return whether the media changed since the last call to this
5213  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5214  */
5215 int bdrv_media_changed(BlockDriverState *bs)
5216 {
5217     BlockDriver *drv = bs->drv;
5218 
5219     if (drv && drv->bdrv_media_changed) {
5220         return drv->bdrv_media_changed(bs);
5221     }
5222     return -ENOTSUP;
5223 }
5224 
5225 /**
5226  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5227  */
5228 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5229 {
5230     BlockDriver *drv = bs->drv;
5231 
5232     if (drv && drv->bdrv_eject) {
5233         drv->bdrv_eject(bs, eject_flag);
5234     }
5235 
5236     if (bs->device_name[0] != '\0') {
5237         qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
5238                                           eject_flag, &error_abort);
5239     }
5240 }
5241 
5242 /**
5243  * Lock or unlock the media (if it is locked, the user won't be able
5244  * to eject it manually).
5245  */
5246 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5247 {
5248     BlockDriver *drv = bs->drv;
5249 
5250     trace_bdrv_lock_medium(bs, locked);
5251 
5252     if (drv && drv->bdrv_lock_medium) {
5253         drv->bdrv_lock_medium(bs, locked);
5254     }
5255 }
5256 
5257 /* needed for generic scsi interface */
5258 
5259 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5260 {
5261     BlockDriver *drv = bs->drv;
5262 
5263     if (drv && drv->bdrv_ioctl)
5264         return drv->bdrv_ioctl(bs, req, buf);
5265     return -ENOTSUP;
5266 }
5267 
5268 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5269         unsigned long int req, void *buf,
5270         BlockDriverCompletionFunc *cb, void *opaque)
5271 {
5272     BlockDriver *drv = bs->drv;
5273 
5274     if (drv && drv->bdrv_aio_ioctl)
5275         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5276     return NULL;
5277 }
5278 
5279 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5280 {
5281     bs->guest_block_size = align;
5282 }
5283 
5284 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5285 {
5286     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5287 }
5288 
5289 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5290 {
5291     size_t align = bdrv_opt_mem_align(bs);
5292 
5293     /* Ensure that NULL is never returned on success */
5294     assert(align > 0);
5295     if (size == 0) {
5296         size = align;
5297     }
5298 
5299     return qemu_try_memalign(align, size);
5300 }
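
/*
 * A minimal usage sketch: qemu_blockalign() aborts on allocation failure,
 * while qemu_try_blockalign() returns NULL so a caller can degrade
 * gracefully, e.g. by failing one request with -ENOMEM instead of killing
 * the whole process:
 *
 *     uint8_t *buf = qemu_try_blockalign(bs, len);
 *     if (buf == NULL) {
 *         return -ENOMEM;
 *     }
 *     ...use buf for O_DIRECT-safe I/O...
 *     qemu_vfree(buf);
 */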
5301 
5302 /*
5303  * Check if all memory in this vector is sector aligned.
5304  */
5305 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5306 {
5307     int i;
5308     size_t alignment = bdrv_opt_mem_align(bs);
5309 
5310     for (i = 0; i < qiov->niov; i++) {
5311         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5312             return false;
5313         }
5314         if (qiov->iov[i].iov_len % alignment) {
5315             return false;
5316         }
5317     }
5318 
5319     return true;
5320 }
5321 
5322 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5323                                           Error **errp)
5324 {
5325     int64_t bitmap_size;
5326     BdrvDirtyBitmap *bitmap;
5327 
5328     assert((granularity & (granularity - 1)) == 0);
5329 
5330     granularity >>= BDRV_SECTOR_BITS;
5331     assert(granularity);
5332     bitmap_size = bdrv_nb_sectors(bs);
5333     if (bitmap_size < 0) {
5334         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5335         errno = -bitmap_size;
5336         return NULL;
5337     }
5338     bitmap = g_new0(BdrvDirtyBitmap, 1);
5339     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5340     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5341     return bitmap;
5342 }
5343 
5344 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5345 {
5346     BdrvDirtyBitmap *bm, *next;
5347     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5348         if (bm == bitmap) {
5349             QLIST_REMOVE(bitmap, list);
5350             hbitmap_free(bitmap->bitmap);
5351             g_free(bitmap);
5352             return;
5353         }
5354     }
5355 }
5356 
5357 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5358 {
5359     BdrvDirtyBitmap *bm;
5360     BlockDirtyInfoList *list = NULL;
5361     BlockDirtyInfoList **plist = &list;
5362 
5363     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5364         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5365         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5366         info->count = bdrv_get_dirty_count(bs, bm);
5367         info->granularity =
5368             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5369         entry->value = info;
5370         *plist = entry;
5371         plist = &entry->next;
5372     }
5373 
5374     return list;
5375 }
5376 
5377 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5378 {
5379     if (bitmap) {
5380         return hbitmap_get(bitmap->bitmap, sector);
5381     } else {
5382         return 0;
5383     }
5384 }
5385 
5386 void bdrv_dirty_iter_init(BlockDriverState *bs,
5387                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5388 {
5389     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5390 }
5391 
5392 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5393                     int nr_sectors)
5394 {
5395     BdrvDirtyBitmap *bitmap;
5396     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5397         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5398     }
5399 }
5400 
5401 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5402 {
5403     BdrvDirtyBitmap *bitmap;
5404     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5405         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5406     }
5407 }
5408 
5409 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5410 {
5411     return hbitmap_count(bitmap->bitmap);
5412 }
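
/*
 * A hedged usage sketch for the dirty bitmap API above, roughly what a
 * mirror-style job does (granularity is in bytes and must be a power of
 * two; guest writes mark sectors via bdrv_set_dirty() in the write path):
 *
 *     Error *err = NULL;
 *     BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, &err);
 *
 *     if (bitmap == NULL) {
 *         ...handle err...
 *     }
 *     ...later, for each interesting sector...
 *     if (bdrv_get_dirty(bs, bitmap, sector)) {
 *         ...copy the data out...
 *         bdrv_reset_dirty(bs, sector, 1);
 *     }
 *     bdrv_release_dirty_bitmap(bs, bitmap);
 */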
5413 
5414 /* Get a reference to bs */
5415 void bdrv_ref(BlockDriverState *bs)
5416 {
5417     bs->refcnt++;
5418 }
5419 
5420 /* Release a previously grabbed reference to bs.
5421  * If after releasing, reference count is zero, the BlockDriverState is
5422  * deleted. */
5423 void bdrv_unref(BlockDriverState *bs)
5424 {
5425     if (!bs) {
5426         return;
5427     }
5428     assert(bs->refcnt > 0);
5429     if (--bs->refcnt == 0) {
5430         bdrv_delete(bs);
5431     }
5432 }
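
/*
 * Typical use: take a reference before an operation that may drop the last
 * other reference (and thus bdrv_delete() the state), and release it
 * afterwards; bdrv_unref(NULL) is a no-op, which keeps cleanup paths simple:
 *
 *     bdrv_ref(bs);
 *     ...operation that might unref bs indirectly...
 *     bdrv_unref(bs);
 */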
5433 
5434 struct BdrvOpBlocker {
5435     Error *reason;
5436     QLIST_ENTRY(BdrvOpBlocker) list;
5437 };
5438 
5439 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5440 {
5441     BdrvOpBlocker *blocker;
5442     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5443     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5444         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5445         if (errp) {
5446             error_setg(errp, "Device '%s' is busy: %s",
5447                        bs->device_name, error_get_pretty(blocker->reason));
5448         }
5449         return true;
5450     }
5451     return false;
5452 }
5453 
5454 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5455 {
5456     BdrvOpBlocker *blocker;
5457     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5458 
5459     blocker = g_new0(BdrvOpBlocker, 1);
5460     blocker->reason = reason;
5461     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5462 }
5463 
5464 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5465 {
5466     BdrvOpBlocker *blocker, *next;
5467     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5468     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5469         if (blocker->reason == reason) {
5470             QLIST_REMOVE(blocker, list);
5471             g_free(blocker);
5472         }
5473     }
5474 }
5475 
5476 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5477 {
5478     int i;
5479     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5480         bdrv_op_block(bs, i, reason);
5481     }
5482 }
5483 
5484 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5485 {
5486     int i;
5487     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5488         bdrv_op_unblock(bs, i, reason);
5489     }
5490 }
5491 
5492 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5493 {
5494     int i;
5495 
5496     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5497         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5498             return false;
5499         }
5500     }
5501     return true;
5502 }
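
/*
 * A hedged usage sketch for the op blocker API: a block job blocks
 * conflicting operations with a single Error object as the reason, and
 * unblocks with the same pointer when it finishes:
 *
 *     Error *blocker = NULL;
 *
 *     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
 *         return;
 *     }
 *     error_setg(&blocker, "block device is in use by a job");
 *     bdrv_op_block_all(bs, blocker);
 *     ...run the job...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */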
5503 
5504 void bdrv_iostatus_enable(BlockDriverState *bs)
5505 {
5506     bs->iostatus_enabled = true;
5507     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5508 }
5509 
5510 /* The I/O status is only enabled if the drive explicitly
5511  * enables it _and_ the VM is configured to stop on errors */
5512 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5513 {
5514     return (bs->iostatus_enabled &&
5515            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5516             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5517             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5518 }
5519 
5520 void bdrv_iostatus_disable(BlockDriverState *bs)
5521 {
5522     bs->iostatus_enabled = false;
5523 }
5524 
5525 void bdrv_iostatus_reset(BlockDriverState *bs)
5526 {
5527     if (bdrv_iostatus_is_enabled(bs)) {
5528         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5529         if (bs->job) {
5530             block_job_iostatus_reset(bs->job);
5531         }
5532     }
5533 }
5534 
5535 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5536 {
5537     assert(bdrv_iostatus_is_enabled(bs));
5538     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5539         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5540                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5541     }
5542 }
5543 
5544 void
5545 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5546         enum BlockAcctType type)
5547 {
5548     assert(type < BDRV_MAX_IOTYPE);
5549 
5550     cookie->bytes = bytes;
5551     cookie->start_time_ns = get_clock();
5552     cookie->type = type;
5553 }
5554 
5555 void
5556 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5557 {
5558     assert(cookie->type < BDRV_MAX_IOTYPE);
5559 
5560     bs->nr_bytes[cookie->type] += cookie->bytes;
5561     bs->nr_ops[cookie->type]++;
5562     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5563 }
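
/*
 * A minimal sketch of the accounting cookie pattern as used by device
 * models around one guest request; the cookie lives on the request so that
 * start and completion can be matched up:
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, bytes, BDRV_ACCT_READ);
 *     ...issue the read and wait for completion...
 *     bdrv_acct_done(bs, &cookie);
 */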
5564 
5565 void bdrv_img_create(const char *filename, const char *fmt,
5566                      const char *base_filename, const char *base_fmt,
5567                      char *options, uint64_t img_size, int flags,
5568                      Error **errp, bool quiet)
5569 {
5570     QemuOptsList *create_opts = NULL;
5571     QemuOpts *opts = NULL;
5572     const char *backing_fmt, *backing_file;
5573     int64_t size;
5574     BlockDriver *drv, *proto_drv;
5575     BlockDriver *backing_drv = NULL;
5576     Error *local_err = NULL;
5577     int ret = 0;
5578 
5579     /* Find driver and parse its options */
5580     drv = bdrv_find_format(fmt);
5581     if (!drv) {
5582         error_setg(errp, "Unknown file format '%s'", fmt);
5583         return;
5584     }
5585 
5586     proto_drv = bdrv_find_protocol(filename, true);
5587     if (!proto_drv) {
5588         error_setg(errp, "Unknown protocol '%s'", filename);
5589         return;
5590     }
5591 
5592     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5593     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5594 
5595     /* Create parameter list with default values */
5596     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5597     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5598 
5599     /* Parse -o options */
5600     if (options) {
5601         if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5602             error_setg(errp, "Invalid options for file format '%s'", fmt);
5603             goto out;
5604         }
5605     }
5606 
5607     if (base_filename) {
5608         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5609             error_setg(errp, "Backing file not supported for file format '%s'",
5610                        fmt);
5611             goto out;
5612         }
5613     }
5614 
5615     if (base_fmt) {
5616         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5617             error_setg(errp, "Backing file format not supported for file "
5618                              "format '%s'", fmt);
5619             goto out;
5620         }
5621     }
5622 
5623     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5624     if (backing_file) {
5625         if (!strcmp(filename, backing_file)) {
5626             error_setg(errp, "Trying to create an image with the "
5627                              "same filename as the backing file");
5628             goto out;
5629         }
5630     }
5631 
5632     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5633     if (backing_fmt) {
5634         backing_drv = bdrv_find_format(backing_fmt);
5635         if (!backing_drv) {
5636             error_setg(errp, "Unknown backing file format '%s'",
5637                        backing_fmt);
5638             goto out;
5639         }
5640     }
5641 
5642     /* The size for the image must always be specified, with one exception:
5643      * if we are using a backing file, we can obtain the size from there. */
5644     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5645     if (size == -1) {
5646         if (backing_file) {
5647             BlockDriverState *bs;
5648             int64_t backing_size;
5649             int back_flags;
5650 
5651             /* backing files always opened read-only */
5652             back_flags =
5653                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5654 
5655             bs = NULL;
5656             ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5657                             backing_drv, &local_err);
5658             if (ret < 0) {
5659                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5660                                  backing_file,
5661                                  error_get_pretty(local_err));
5662                 error_free(local_err);
5663                 local_err = NULL;
5664                 goto out;
5665             }
5666             backing_size = bdrv_getlength(bs);
5667             if (backing_size < 0) {
5668                 error_setg_errno(errp, -backing_size,
5669                                  "Could not get size of '%s'", backing_file);
5670                 bdrv_unref(bs);
5671                 goto out;
5672             }
5673 
5674             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, backing_size);
5675 
5676             bdrv_unref(bs);
5677         } else {
5678             error_setg(errp, "Image creation needs a size parameter");
5679             goto out;
5680         }
5681     }
5682 
5683     if (!quiet) {
5684         printf("Formatting '%s', fmt=%s ", filename, fmt);
5685         qemu_opts_print(opts);
5686         puts("");
5687     }
5688 
5689     ret = bdrv_create(drv, filename, opts, &local_err);
5690 
5691     if (ret == -EFBIG) {
5692         /* This is generally a better message than whatever the driver would
5693          * deliver (especially because of the cluster_size_hint), since that
5694          * is most probably not much different from "image too large". */
5695         const char *cluster_size_hint = "";
5696         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5697             cluster_size_hint = " (try using a larger cluster size)";
5698         }
5699         error_setg(errp, "The image size is too large for file format '%s'"
5700                    "%s", fmt, cluster_size_hint);
5701         error_free(local_err);
5702         local_err = NULL;
5703     }
5704 
5705 out:
5706     qemu_opts_del(opts);
5707     qemu_opts_free(create_opts);
5708     if (local_err) {
5709         error_propagate(errp, local_err);
5710     }
5711 }
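
/*
 * A hedged usage sketch mirroring what qemu-img create does: create a
 * 10 GiB qcow2 overlay backed by base.img (paths here are illustrative):
 *
 *     Error *local_err = NULL;
 *
 *     bdrv_img_create("overlay.qcow2", "qcow2", "base.img", NULL, NULL,
 *                     (uint64_t)10 << 30, 0, &local_err, false);
 *     if (local_err) {
 *         ...report and free local_err...
 *     }
 */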
5712 
5713 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5714 {
5715     return bs->aio_context;
5716 }
5717 
5718 void bdrv_detach_aio_context(BlockDriverState *bs)
5719 {
5720     if (!bs->drv) {
5721         return;
5722     }
5723 
5724     if (bs->io_limits_enabled) {
5725         throttle_detach_aio_context(&bs->throttle_state);
5726     }
5727     if (bs->drv->bdrv_detach_aio_context) {
5728         bs->drv->bdrv_detach_aio_context(bs);
5729     }
5730     if (bs->file) {
5731         bdrv_detach_aio_context(bs->file);
5732     }
5733     if (bs->backing_hd) {
5734         bdrv_detach_aio_context(bs->backing_hd);
5735     }
5736 
5737     bs->aio_context = NULL;
5738 }
5739 
5740 void bdrv_attach_aio_context(BlockDriverState *bs,
5741                              AioContext *new_context)
5742 {
5743     if (!bs->drv) {
5744         return;
5745     }
5746 
5747     bs->aio_context = new_context;
5748 
5749     if (bs->backing_hd) {
5750         bdrv_attach_aio_context(bs->backing_hd, new_context);
5751     }
5752     if (bs->file) {
5753         bdrv_attach_aio_context(bs->file, new_context);
5754     }
5755     if (bs->drv->bdrv_attach_aio_context) {
5756         bs->drv->bdrv_attach_aio_context(bs, new_context);
5757     }
5758     if (bs->io_limits_enabled) {
5759         throttle_attach_aio_context(&bs->throttle_state, new_context);
5760     }
5761 }
5762 
5763 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5764 {
5765     bdrv_drain_all(); /* ensure there are no in-flight requests */
5766 
5767     bdrv_detach_aio_context(bs);
5768 
5769     /* This function executes in the old AioContext so acquire the new one in
5770      * case it runs in a different thread.
5771      */
5772     aio_context_acquire(new_context);
5773     bdrv_attach_aio_context(bs, new_context);
5774     aio_context_release(new_context);
5775 }
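
/*
 * A hedged sketch of handing a BDS over to an IOThread for dataplane use,
 * assuming an iothread object obtained through the iothread API:
 *
 *     AioContext *ctx = iothread_get_aio_context(iothread);
 *
 *     bdrv_set_aio_context(bs, ctx);
 *
 * From then on, completion callbacks for bs run in the IOThread, so other
 * threads must wrap accesses in aio_context_acquire()/aio_context_release().
 */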
5776 
5777 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5778                                     NotifierWithReturn *notifier)
5779 {
5780     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5781 }
5782 
5783 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
5784 {
5785     if (!bs->drv->bdrv_amend_options) {
5786         return -ENOTSUP;
5787     }
5788     return bs->drv->bdrv_amend_options(bs, opts);
5789 }
5790 
5791 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5792  * of block filters and by bdrv_is_first_non_filter.
5793  * It is used to test whether the given bs is the candidate or to recurse
5794  * further down the node graph.
5795  */
5796 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5797                                       BlockDriverState *candidate)
5798 {
5799     /* return false if basic checks fail */
5800     if (!bs || !bs->drv) {
5801         return false;
5802     }
5803 
5804     /* the code reached a driver that is not a block filter -> check if the
5805      * bs is the same as the candidate. This is the recursion termination
5806      * condition. */
5807     if (!bs->drv->is_filter) {
5808         return bs == candidate;
5809     }
5810     /* Down this path the driver is a block filter driver */
5811 
5812     /* If the block filter recursion method is defined use it to recurse down
5813      * the node graph.
5814      */
5815     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5816         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5817     }
5818 
5819     /* the driver is a block filter but does not allow recursion
5820      * -> return false */
5821     return false;
5822 }
5823 
5824 /* This function checks if the candidate is the first non-filter bs down its
5825  * bs chain. Since we don't have pointers to parents, it explores all bs
5826  * chains from the top. Some filters can choose not to pass down the recursion.
5827  */
5828 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5829 {
5830     BlockDriverState *bs;
5831 
5832     /* walk down the bs forest recursively */
5833     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5834         bool perm;
5835 
5836         /* try to recurse in this top level bs */
5837         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5838 
5839         /* candidate is the first non filter */
5840         if (perm) {
5841             return true;
5842         }
5843     }
5844 
5845     return false;
5846 }
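
/*
 * Example: with a Quorum filter as the top level BDS, Quorum's
 * bdrv_recurse_is_first_non_filter callback recurses into each child, so a
 * quorum child can be accepted as the first non filter and therefore be
 * replaced; this is what drive-mirror's 'replaces' option relies on via
 * check_to_replace_node() below.
 */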
5847 
5848 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5849 {
5850     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5851     if (!to_replace_bs) {
5852         error_setg(errp, "Node name '%s' not found", node_name);
5853         return NULL;
5854     }
5855 
5856     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5857         return NULL;
5858     }
5859 
5860     /* To prevent data corruption, we don't want an arbitrary node of the BDS
5861      * chain to be replaced, only the topmost non-filter node.
5862      * Another benefit is that this test excludes backing files, which are
5863      * blocked by the backing blockers.
5864      */
5865     if (!bdrv_is_first_non_filter(to_replace_bs)) {
5866         error_setg(errp, "Only the topmost non-filter node can be replaced");
5867         return NULL;
5868     }
5869 
5870     return to_replace_bs;
5871 }
5872 
5873 void bdrv_io_plug(BlockDriverState *bs)
5874 {
5875     BlockDriver *drv = bs->drv;
5876     if (drv && drv->bdrv_io_plug) {
5877         drv->bdrv_io_plug(bs);
5878     } else if (bs->file) {
5879         bdrv_io_plug(bs->file);
5880     }
5881 }
5882 
5883 void bdrv_io_unplug(BlockDriverState *bs)
5884 {
5885     BlockDriver *drv = bs->drv;
5886     if (drv && drv->bdrv_io_unplug) {
5887         drv->bdrv_io_unplug(bs);
5888     } else if (bs->file) {
5889         bdrv_io_unplug(bs->file);
5890     }
5891 }
5892 
5893 void bdrv_flush_io_queue(BlockDriverState *bs)
5894 {
5895     BlockDriver *drv = bs->drv;
5896     if (drv && drv->bdrv_flush_io_queue) {
5897         drv->bdrv_flush_io_queue(bs);
5898     } else if (bs->file) {
5899         bdrv_flush_io_queue(bs->file);
5900     }
5901 }
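
/*
 * A minimal sketch of request batching with the plug interface, as a device
 * model might do while draining a virtqueue:
 *
 *     bdrv_io_plug(bs);
 *     ...queue several bdrv_aio_readv()/bdrv_aio_writev() requests...
 *     bdrv_io_unplug(bs);
 *
 * Drivers that support plugging (e.g. linux-aio) can then submit the queued
 * requests to the host kernel in one batch.
 */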
5902 
5903 static bool append_open_options(QDict *d, BlockDriverState *bs)
5904 {
5905     const QDictEntry *entry;
5906     bool found_any = false;
5907 
5908     for (entry = qdict_first(bs->options); entry;
5909          entry = qdict_next(bs->options, entry))
5910     {
5911         /* Only take options for this level and exclude all non-driver-specific
5912          * options */
5913         if (!strchr(qdict_entry_key(entry), '.') &&
5914             strcmp(qdict_entry_key(entry), "node-name"))
5915         {
5916             qobject_incref(qdict_entry_value(entry));
5917             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5918             found_any = true;
5919         }
5920     }
5921 
5922     return found_any;
5923 }
5924 
5925 /* Updates the following BDS fields:
5926  *  - exact_filename: A filename which may be used for opening a block device
5927  *                    which (mostly) equals the given BDS (even without any
5928  *                    other options; so reading and writing must return the same
5929  *                    results, but caching etc. may be different)
5930  *  - full_open_options: Options which, when given when opening a block device
5931  *                       (without a filename), result in a BDS (mostly)
5932  *                       equalling the given one
5933  *  - filename: If exact_filename is set, it is copied here. Otherwise,
5934  *              full_open_options is converted to a JSON object, prefixed with
5935  *              "json:" (for use through the JSON pseudo protocol) and put here.
5936  */
5937 void bdrv_refresh_filename(BlockDriverState *bs)
5938 {
5939     BlockDriver *drv = bs->drv;
5940     QDict *opts;
5941 
5942     if (!drv) {
5943         return;
5944     }
5945 
5946     /* This BDS's file name will most probably depend on its file's name, so
5947      * refresh that first */
5948     if (bs->file) {
5949         bdrv_refresh_filename(bs->file);
5950     }
5951 
5952     if (drv->bdrv_refresh_filename) {
5953         /* Obsolete information is of no use here, so drop the old file name
5954          * information before refreshing it */
5955         bs->exact_filename[0] = '\0';
5956         if (bs->full_open_options) {
5957             QDECREF(bs->full_open_options);
5958             bs->full_open_options = NULL;
5959         }
5960 
5961         drv->bdrv_refresh_filename(bs);
5962     } else if (bs->file) {
5963         /* Try to reconstruct valid information from the underlying file */
5964         bool has_open_options;
5965 
5966         bs->exact_filename[0] = '\0';
5967         if (bs->full_open_options) {
5968             QDECREF(bs->full_open_options);
5969             bs->full_open_options = NULL;
5970         }
5971 
5972         opts = qdict_new();
5973         has_open_options = append_open_options(opts, bs);
5974 
5975         /* If no specific options have been given for this BDS, the filename of
5976          * the underlying file should suffice for this one as well */
5977         if (bs->file->exact_filename[0] && !has_open_options) {
5978             strcpy(bs->exact_filename, bs->file->exact_filename);
5979         }
5980         /* Reconstructing the full options QDict is simple for most format block
5981          * drivers, as long as the full options are known for the underlying
5982          * file BDS. The full options QDict of that file BDS should somehow
5983          * contain a representation of the filename; therefore, the following
5984          * suffices without querying the (exact_)filename of this BDS. */
5985         if (bs->file->full_open_options) {
5986             qdict_put_obj(opts, "driver",
5987                           QOBJECT(qstring_from_str(drv->format_name)));
5988             QINCREF(bs->file->full_open_options);
5989             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
5990 
5991             bs->full_open_options = opts;
5992         } else {
5993             QDECREF(opts);
5994         }
5995     } else if (!bs->full_open_options && qdict_size(bs->options)) {
5996         /* There is no underlying file BDS (at least referenced by BDS.file),
5997          * so the full options QDict should be equal to the options given
5998          * specifically for this block device when it was opened (plus the
5999          * driver specification).
6000          * Because those options don't change, there is no need to update
6001          * full_open_options when it's already set. */
6002 
6003         opts = qdict_new();
6004         append_open_options(opts, bs);
6005         qdict_put_obj(opts, "driver",
6006                       QOBJECT(qstring_from_str(drv->format_name)));
6007 
6008         if (bs->exact_filename[0]) {
6009             /* This may not work for all block protocol drivers (some may
6010              * require this filename to be parsed), but we have to find some
6011              * default solution here, so just include it. If some block driver
6012              * does not support pure options without any filename at all or
6013              * needs some special format of the options QDict, it needs to
6014              * implement the driver-specific bdrv_refresh_filename() function.
6015              */
6016             qdict_put_obj(opts, "filename",
6017                           QOBJECT(qstring_from_str(bs->exact_filename)));
6018         }
6019 
6020         bs->full_open_options = opts;
6021     }
6022 
6023     if (bs->exact_filename[0]) {
6024         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6025     } else if (bs->full_open_options) {
6026         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6027         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6028                  qstring_get_str(json));
6029         QDECREF(json);
6030     }
6031 }
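
/*
 * For example, a qcow2 image over a local file typically keeps a usable
 * exact_filename, while a BDS opened with extra options may end up with a
 * generated name along these lines (illustrative; the exact contents depend
 * on the options given):
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "/tmp/test.qcow2"}}
 */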
6032