xref: /qemu/block/io.c (revision 3d100d0f)
1 /*
2  * Block layer I/O functions
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "trace.h"
27 #include "sysemu/block-backend.h"
28 #include "block/blockjob.h"
29 #include "block/block_int.h"
30 #include "block/throttle-groups.h"
31 #include "qemu/cutils.h"
32 #include "qapi/error.h"
33 #include "qemu/error-report.h"
34 
35 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
36 
37 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
38         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
39         BlockCompletionFunc *cb, void *opaque);
40 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
41         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
42         BlockCompletionFunc *cb, void *opaque);
43 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
44                                          int64_t sector_num, int nb_sectors,
45                                          QEMUIOVector *iov);
46 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
47                                          int64_t sector_num, int nb_sectors,
48                                          QEMUIOVector *iov);
49 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
50                                          int64_t sector_num,
51                                          QEMUIOVector *qiov,
52                                          int nb_sectors,
53                                          BdrvRequestFlags flags,
54                                          BlockCompletionFunc *cb,
55                                          void *opaque,
56                                          bool is_write);
57 static void coroutine_fn bdrv_co_do_rw(void *opaque);
58 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
59     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
60 
61 /* throttling disk I/O limits */
62 void bdrv_set_io_limits(BlockDriverState *bs,
63                         ThrottleConfig *cfg)
64 {
65     int i;
66 
67     throttle_group_config(bs, cfg);
68 
69     for (i = 0; i < 2; i++) {
70         qemu_co_enter_next(&bs->throttled_reqs[i]);
71     }
72 }
73 
74 /* this function drains all the throttled I/Os */
75 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
76 {
77     bool drained = false;
78     bool enabled = bs->io_limits_enabled;
79     int i;
80 
81     bs->io_limits_enabled = false;
82 
83     for (i = 0; i < 2; i++) {
84         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
85             drained = true;
86         }
87     }
88 
89     bs->io_limits_enabled = enabled;
90 
91     return drained;
92 }
93 
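/*
 * Disable I/O throttling for @bs: let any queued throttled requests run and
 * remove the BDS from its throttle group.
 */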
94 void bdrv_io_limits_disable(BlockDriverState *bs)
95 {
96     bs->io_limits_enabled = false;
97     bdrv_start_throttled_reqs(bs);
98     throttle_group_unregister_bs(bs);
99 }
100 
101 /* should be called before bdrv_set_io_limits if a limit is set */
102 void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
103 {
104     assert(!bs->io_limits_enabled);
105     throttle_group_register_bs(bs, group);
106     bs->io_limits_enabled = true;
107 }
108 
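/*
 * Move @bs to the throttle group named @group.  Does nothing if @bs is not
 * throttled or is already a member of that group.
 */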
109 void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
110 {
111     /* this bs is not part of any group */
112     if (!bs->throttle_state) {
113         return;
114     }
115 
116     /* this bs is part of the same group as the one we want */
117     if (!g_strcmp0(throttle_group_get_name(bs), group)) {
118         return;
119     }
120 
121     /* need to change the group this bs belongs to */
122     bdrv_io_limits_disable(bs);
123     bdrv_io_limits_enable(bs, group);
124 }
125 
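/*
 * Fill in any missing coroutine read/write callbacks of @bdrv with emulation
 * wrappers, adding AIO emulation as well if needed.
 */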
126 void bdrv_setup_io_funcs(BlockDriver *bdrv)
127 {
128     /* Block drivers without coroutine functions need emulation */
129     if (!bdrv->bdrv_co_readv) {
130         bdrv->bdrv_co_readv = bdrv_co_readv_em;
131         bdrv->bdrv_co_writev = bdrv_co_writev_em;
132 
133         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
134          * the block driver lacks aio we need to emulate that too.
135          */
136         if (!bdrv->bdrv_aio_readv) {
137             /* add AIO emulation layer */
138             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
139             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
140         }
141     }
142 }
143 
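/*
 * Recompute bs->bl from scratch: inherit defaults from bs->file, merge in the
 * limits of bs->backing, then let the driver's own callback override them.
 */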
144 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
145 {
146     BlockDriver *drv = bs->drv;
147     Error *local_err = NULL;
148 
149     memset(&bs->bl, 0, sizeof(bs->bl));
150 
151     if (!drv) {
152         return;
153     }
154 
155     /* Take some limits from the children as a default */
156     if (bs->file) {
157         bdrv_refresh_limits(bs->file->bs, &local_err);
158         if (local_err) {
159             error_propagate(errp, local_err);
160             return;
161         }
162         bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
163         bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
164         bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
165         bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
166         bs->bl.max_iov = bs->file->bs->bl.max_iov;
167     } else {
168         bs->bl.min_mem_alignment = 512;
169         bs->bl.opt_mem_alignment = getpagesize();
170 
171         /* Safe default since most protocols use readv()/writev()/etc */
172         bs->bl.max_iov = IOV_MAX;
173     }
174 
175     if (bs->backing) {
176         bdrv_refresh_limits(bs->backing->bs, &local_err);
177         if (local_err) {
178             error_propagate(errp, local_err);
179             return;
180         }
181         bs->bl.opt_transfer_length =
182             MAX(bs->bl.opt_transfer_length,
183                 bs->backing->bs->bl.opt_transfer_length);
184         bs->bl.max_transfer_length =
185             MIN_NON_ZERO(bs->bl.max_transfer_length,
186                          bs->backing->bs->bl.max_transfer_length);
187         bs->bl.opt_mem_alignment =
188             MAX(bs->bl.opt_mem_alignment,
189                 bs->backing->bs->bl.opt_mem_alignment);
190         bs->bl.min_mem_alignment =
191             MAX(bs->bl.min_mem_alignment,
192                 bs->backing->bs->bl.min_mem_alignment);
193         bs->bl.max_iov =
194             MIN(bs->bl.max_iov,
195                 bs->backing->bs->bl.max_iov);
196     }
197 
198     /* Then let the driver override it */
199     if (drv->bdrv_refresh_limits) {
200         drv->bdrv_refresh_limits(bs, errp);
201     }
202 }
203 
204 /**
205  * The copy-on-read flag is actually a reference count so multiple users may
206  * use the feature without worrying about clobbering its previous state.
207  * Copy-on-read stays enabled until all users have disabled it again.
208  */
209 void bdrv_enable_copy_on_read(BlockDriverState *bs)
210 {
211     bs->copy_on_read++;
212 }
213 
214 void bdrv_disable_copy_on_read(BlockDriverState *bs)
215 {
216     assert(bs->copy_on_read > 0);
217     bs->copy_on_read--;
218 }
219 
220 /* Check if any requests are in-flight (including throttled requests) */
221 bool bdrv_requests_pending(BlockDriverState *bs)
222 {
223     BdrvChild *child;
224 
225     if (!QLIST_EMPTY(&bs->tracked_requests)) {
226         return true;
227     }
228     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
229         return true;
230     }
231     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
232         return true;
233     }
234 
235     QLIST_FOREACH(child, &bs->children, next) {
236         if (bdrv_requests_pending(child->bs)) {
237             return true;
238         }
239     }
240 
241     return false;
242 }
243 
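/*
 * Call the driver's .bdrv_drain callback, if any, on @bs and recursively on
 * all of its children.
 */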
244 static void bdrv_drain_recurse(BlockDriverState *bs)
245 {
246     BdrvChild *child;
247 
248     if (bs->drv && bs->drv->bdrv_drain) {
249         bs->drv->bdrv_drain(bs);
250     }
251     QLIST_FOREACH(child, &bs->children, next) {
252         bdrv_drain_recurse(child->bs);
253     }
254 }
255 
256 /*
257  * Wait for pending requests to complete on a single BlockDriverState subtree,
258  * and suspend the block driver's internal I/O until the next request arrives.
259  *
260  * Note that unlike bdrv_drain_all(), the caller must hold the AioContext of
261  * the BlockDriverState.
262  *
263  * Only this BlockDriverState's AioContext is run, so in-flight requests must
264  * not depend on events in other AioContexts.  If they do, use
265  * bdrv_drain_all() instead.
266  */
267 void bdrv_drain(BlockDriverState *bs)
268 {
269     bool busy = true;
270 
271     bdrv_drain_recurse(bs);
272     while (busy) {
273         /* Keep iterating */
274         bdrv_flush_io_queue(bs);
275         busy = bdrv_requests_pending(bs);
276         busy |= aio_poll(bdrv_get_aio_context(bs), busy);
277     }
278 }
279 
280 /*
281  * Wait for pending requests to complete across all BlockDriverStates
282  *
283  * This function does not flush data to disk, use bdrv_flush_all() for that
284  * after calling this function.
285  */
286 void bdrv_drain_all(void)
287 {
288     /* Always run first iteration so any pending completion BHs run */
289     bool busy = true;
290     BlockDriverState *bs = NULL;
291     GSList *aio_ctxs = NULL, *ctx;
292 
293     while ((bs = bdrv_next(bs))) {
294         AioContext *aio_context = bdrv_get_aio_context(bs);
295 
296         aio_context_acquire(aio_context);
297         if (bs->job) {
298             block_job_pause(bs->job);
299         }
300         bdrv_drain_recurse(bs);
301         aio_context_release(aio_context);
302 
303         if (!g_slist_find(aio_ctxs, aio_context)) {
304             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
305         }
306     }
307 
308     /* Note that completion of an asynchronous I/O operation can trigger any
309      * number of other I/O operations on other devices---for example a
310      * coroutine can submit an I/O request to another device in response to
311      * request completion.  Therefore we must keep looping until there is no
312      * more activity rather than simply draining each device independently.
313      */
314     while (busy) {
315         busy = false;
316 
317         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
318             AioContext *aio_context = ctx->data;
319             bs = NULL;
320 
321             aio_context_acquire(aio_context);
322             while ((bs = bdrv_next(bs))) {
323                 if (aio_context == bdrv_get_aio_context(bs)) {
324                     bdrv_flush_io_queue(bs);
325                     if (bdrv_requests_pending(bs)) {
326                         busy = true;
327                         aio_poll(aio_context, busy);
328                     }
329                 }
330             }
331             busy |= aio_poll(aio_context, false);
332             aio_context_release(aio_context);
333         }
334     }
335 
336     bs = NULL;
337     while ((bs = bdrv_next(bs))) {
338         AioContext *aio_context = bdrv_get_aio_context(bs);
339 
340         aio_context_acquire(aio_context);
341         if (bs->job) {
342             block_job_resume(bs->job);
343         }
344         aio_context_release(aio_context);
345     }
346     g_slist_free(aio_ctxs);
347 }
348 
349 /**
350  * Remove an active request from the tracked requests list
351  *
352  * This function should be called when a tracked request is completing.
353  */
354 static void tracked_request_end(BdrvTrackedRequest *req)
355 {
356     if (req->serialising) {
357         req->bs->serialising_in_flight--;
358     }
359 
360     QLIST_REMOVE(req, list);
361     qemu_co_queue_restart_all(&req->wait_queue);
362 }
363 
364 /**
365  * Add an active request to the tracked requests list
366  */
367 static void tracked_request_begin(BdrvTrackedRequest *req,
368                                   BlockDriverState *bs,
369                                   int64_t offset,
370                                   unsigned int bytes,
371                                   enum BdrvTrackedRequestType type)
372 {
373     *req = (BdrvTrackedRequest){
374         .bs = bs,
375         .offset         = offset,
376         .bytes          = bytes,
377         .type           = type,
378         .co             = qemu_coroutine_self(),
379         .serialising    = false,
380         .overlap_offset = offset,
381         .overlap_bytes  = bytes,
382     };
383 
384     qemu_co_queue_init(&req->wait_queue);
385 
386     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
387 }
388 
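/*
 * Mark @req as serialising and widen its overlap window to @align boundaries
 * (@align must be a power of two).
 */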
389 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
390 {
391     int64_t overlap_offset = req->offset & ~(align - 1);
392     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
393                                - overlap_offset;
394 
395     if (!req->serialising) {
396         req->bs->serialising_in_flight++;
397         req->serialising = true;
398     }
399 
400     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
401     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
402 }
403 
404 /**
405  * Round a region to cluster boundaries
406  */
407 void bdrv_round_to_clusters(BlockDriverState *bs,
408                             int64_t sector_num, int nb_sectors,
409                             int64_t *cluster_sector_num,
410                             int *cluster_nb_sectors)
411 {
412     BlockDriverInfo bdi;
413 
414     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
415         *cluster_sector_num = sector_num;
416         *cluster_nb_sectors = nb_sectors;
417     } else {
418         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
419         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
420         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
421                                             nb_sectors, c);
422     }
423 }
424 
425 static int bdrv_get_cluster_size(BlockDriverState *bs)
426 {
427     BlockDriverInfo bdi;
428     int ret;
429 
430     ret = bdrv_get_info(bs, &bdi);
431     if (ret < 0 || bdi.cluster_size == 0) {
432         return bs->request_alignment;
433     } else {
434         return bdi.cluster_size;
435     }
436 }
437 
438 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
439                                      int64_t offset, unsigned int bytes)
440 {
441     /*        aaaa   bbbb */
442     if (offset >= req->overlap_offset + req->overlap_bytes) {
443         return false;
444     }
445     /* bbbb   aaaa        */
446     if (req->overlap_offset >= offset + bytes) {
447         return false;
448     }
449     return true;
450 }
451 
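/*
 * Wait for tracked requests that overlap with @self while either side is
 * marked serialising.  Returns true if this coroutine had to wait.
 */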
452 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
453 {
454     BlockDriverState *bs = self->bs;
455     BdrvTrackedRequest *req;
456     bool retry;
457     bool waited = false;
458 
459     if (!bs->serialising_in_flight) {
460         return false;
461     }
462 
463     do {
464         retry = false;
465         QLIST_FOREACH(req, &bs->tracked_requests, list) {
466             if (req == self || (!req->serialising && !self->serialising)) {
467                 continue;
468             }
469             if (tracked_request_overlaps(req, self->overlap_offset,
470                                          self->overlap_bytes))
471             {
472                 /* Hitting this means there was a reentrant request, for
473                  * example, a block driver issuing nested requests.  This must
474                  * never happen since it means deadlock.
475                  */
476                 assert(qemu_coroutine_self() != req->co);
477 
478                 /* If the request is already (indirectly) waiting for us, or
479                  * will wait for us as soon as it wakes up, then just go on
480                  * (instead of producing a deadlock in the former case). */
481                 if (!req->waiting_for) {
482                     self->waiting_for = req;
483                     qemu_co_queue_wait(&req->wait_queue);
484                     self->waiting_for = NULL;
485                     retry = true;
486                     waited = true;
487                     break;
488                 }
489             }
490         }
491     } while (retry);
492 
493     return waited;
494 }
495 
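/*
 * Basic sanity checks for a byte range request: reject oversized requests,
 * missing media and negative offsets.  Returns 0 or a negative errno.
 */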
496 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
497                                    size_t size)
498 {
499     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
500         return -EIO;
501     }
502 
503     if (!bdrv_is_inserted(bs)) {
504         return -ENOMEDIUM;
505     }
506 
507     if (offset < 0) {
508         return -EIO;
509     }
510 
511     return 0;
512 }
513 
514 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
515                               int nb_sectors)
516 {
517     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
518         return -EIO;
519     }
520 
521     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
522                                    nb_sectors * BDRV_SECTOR_SIZE);
523 }
524 
525 typedef struct RwCo {
526     BlockDriverState *bs;
527     int64_t offset;
528     QEMUIOVector *qiov;
529     bool is_write;
530     int ret;
531     BdrvRequestFlags flags;
532 } RwCo;
533 
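/*
 * Coroutine entry point used by bdrv_prwv_co(): dispatch to the read or write
 * path and store the result in the RwCo.
 */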
534 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
535 {
536     RwCo *rwco = opaque;
537 
538     if (!rwco->is_write) {
539         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
540                                       rwco->qiov->size, rwco->qiov,
541                                       rwco->flags);
542     } else {
543         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
544                                        rwco->qiov->size, rwco->qiov,
545                                        rwco->flags);
546     }
547 }
548 
549 /*
550  * Process a vectored synchronous request using coroutines
551  */
552 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
553                         QEMUIOVector *qiov, bool is_write,
554                         BdrvRequestFlags flags)
555 {
556     Coroutine *co;
557     RwCo rwco = {
558         .bs = bs,
559         .offset = offset,
560         .qiov = qiov,
561         .is_write = is_write,
562         .ret = NOT_DONE,
563         .flags = flags,
564     };
565 
566     /**
567      * In a synchronous call context, when the vCPU is blocked, the throttling
568      * timers will not fire; so I/O throttling has to be disabled here if it
569      * has been enabled.
570      */
571     if (bs->io_limits_enabled) {
572         fprintf(stderr, "Disabling I/O throttling on '%s' due "
573                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
574         bdrv_io_limits_disable(bs);
575     }
576 
577     if (qemu_in_coroutine()) {
578         /* Fast-path if already in coroutine context */
579         bdrv_rw_co_entry(&rwco);
580     } else {
581         AioContext *aio_context = bdrv_get_aio_context(bs);
582 
583         co = qemu_coroutine_create(bdrv_rw_co_entry);
584         qemu_coroutine_enter(co, &rwco);
585         while (rwco.ret == NOT_DONE) {
586             aio_poll(aio_context, true);
587         }
588     }
589     return rwco.ret;
590 }
591 
592 /*
593  * Process a synchronous request using coroutines
594  */
595 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
596                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
597 {
598     QEMUIOVector qiov;
599     struct iovec iov = {
600         .iov_base = (void *)buf,
601         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
602     };
603 
604     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
605         return -EINVAL;
606     }
607 
608     qemu_iovec_init_external(&qiov, &iov, 1);
609     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
610                         &qiov, is_write, flags);
611 }
612 
613 /* Return < 0 on error. See bdrv_write() for the return codes */
614 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
615               uint8_t *buf, int nb_sectors)
616 {
617     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
618 }
619 
620 /* Return < 0 on error. Important errors are:
621   -EIO         generic I/O error (may happen for all errors)
622   -ENOMEDIUM   No media inserted.
623   -EINVAL      Invalid sector number or nb_sectors
624   -EACCES      Trying to write to a read-only device
625 */
626 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
627                const uint8_t *buf, int nb_sectors)
628 {
629     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
630 }
631 
632 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
633                       int nb_sectors, BdrvRequestFlags flags)
634 {
635     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
636                       BDRV_REQ_ZERO_WRITE | flags);
637 }
638 
639 /*
640  * Completely zero out a block device with the help of bdrv_write_zeroes.
641  * The operation is sped up by checking the block status and only writing
642  * zeroes to sectors that do not already read back as zeroes. Optional
643  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
644  *
645  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
646  */
647 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
648 {
649     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
650     BlockDriverState *file;
651     int n;
652 
653     target_sectors = bdrv_nb_sectors(bs);
654     if (target_sectors < 0) {
655         return target_sectors;
656     }
657 
658     for (;;) {
659         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
660         if (nb_sectors <= 0) {
661             return 0;
662         }
663         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
664         if (ret < 0) {
665             error_report("error getting block status at sector %" PRId64 ": %s",
666                          sector_num, strerror(-ret));
667             return ret;
668         }
669         if (ret & BDRV_BLOCK_ZERO) {
670             sector_num += n;
671             continue;
672         }
673         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
674         if (ret < 0) {
675             error_report("error writing zeroes at sector %" PRId64 ": %s",
676                          sector_num, strerror(-ret));
677             return ret;
678         }
679         sector_num += n;
680     }
681 }
682 
683 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
684 {
685     QEMUIOVector qiov;
686     struct iovec iov = {
687         .iov_base = (void *)buf,
688         .iov_len = bytes,
689     };
690     int ret;
691 
692     if (bytes < 0) {
693         return -EINVAL;
694     }
695 
696     qemu_iovec_init_external(&qiov, &iov, 1);
697     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
698     if (ret < 0) {
699         return ret;
700     }
701 
702     return bytes;
703 }
704 
705 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
706 {
707     int ret;
708 
709     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
710     if (ret < 0) {
711         return ret;
712     }
713 
714     return qiov->size;
715 }
716 
717 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
718                 const void *buf, int bytes)
719 {
720     QEMUIOVector qiov;
721     struct iovec iov = {
722         .iov_base   = (void *) buf,
723         .iov_len    = bytes,
724     };
725 
726     if (bytes < 0) {
727         return -EINVAL;
728     }
729 
730     qemu_iovec_init_external(&qiov, &iov, 1);
731     return bdrv_pwritev(bs, offset, &qiov);
732 }
733 
734 /*
735  * Writes to the file and ensures that no writes are reordered across this
736  * request (acts as a barrier)
737  *
738  * Returns 0 on success, -errno in error cases.
739  */
740 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
741     const void *buf, int count)
742 {
743     int ret;
744 
745     ret = bdrv_pwrite(bs, offset, buf, count);
746     if (ret < 0) {
747         return ret;
748     }
749 
750     ret = bdrv_flush(bs);
751     if (ret < 0) {
752         return ret;
753     }
754 
755     return 0;
756 }
757 
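/*
 * Perform the copy-on-read operation: read the whole cluster through a bounce
 * buffer and write it back into the image file.
 */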
758 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
759         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
760 {
761     /* Perform I/O through a temporary buffer so that users who scribble over
762      * their read buffer while the operation is in progress do not end up
763      * modifying the image file.  This is critical for zero-copy guest I/O
764      * where anything might happen inside guest memory.
765      */
766     void *bounce_buffer;
767 
768     BlockDriver *drv = bs->drv;
769     struct iovec iov;
770     QEMUIOVector bounce_qiov;
771     int64_t cluster_sector_num;
772     int cluster_nb_sectors;
773     size_t skip_bytes;
774     int ret;
775 
776     /* Cover the entire cluster so no additional backing file I/O is required
777      * when allocating the cluster in the image file.
778      */
779     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
780                            &cluster_sector_num, &cluster_nb_sectors);
781 
782     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
783                                    cluster_sector_num, cluster_nb_sectors);
784 
785     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
786     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
787     if (bounce_buffer == NULL) {
788         ret = -ENOMEM;
789         goto err;
790     }
791 
792     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
793 
794     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
795                              &bounce_qiov);
796     if (ret < 0) {
797         goto err;
798     }
799 
800     if (drv->bdrv_co_write_zeroes &&
801         buffer_is_zero(bounce_buffer, iov.iov_len)) {
802         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
803                                       cluster_nb_sectors, 0);
804     } else {
805         /* This does not change the data on the disk, so it is not necessary
806          * to flush even in cache=writethrough mode.
807          */
808         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
809                                   &bounce_qiov);
810     }
811 
812     if (ret < 0) {
813         /* It might be okay to ignore write errors for guest requests.  If this
814          * is a deliberate copy-on-read then we don't want to ignore the error.
815          * Simply report it in all cases.
816          */
817         goto err;
818     }
819 
820     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
821     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
822                         nb_sectors * BDRV_SECTOR_SIZE);
823 
824 err:
825     qemu_vfree(bounce_buffer);
826     return ret;
827 }
828 
829 /*
830  * Forwards an already correctly aligned request to the BlockDriver. This
831  * handles copy on read and zeroing after EOF; any other features must be
832  * implemented by the caller.
833  */
834 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
835     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
836     int64_t align, QEMUIOVector *qiov, int flags)
837 {
838     BlockDriver *drv = bs->drv;
839     int ret;
840 
841     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
842     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
843 
844     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
845     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
846     assert(!qiov || bytes == qiov->size);
847     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
848 
849     /* Handle Copy on Read and associated serialisation */
850     if (flags & BDRV_REQ_COPY_ON_READ) {
851         /* If we touch the same cluster it counts as an overlap.  This
852          * guarantees that allocating writes will be serialized and not race
853          * with each other for the same cluster.  For example, in copy-on-read
854          * it ensures that the CoR read and write operations are atomic and
855          * guest writes cannot interleave between them. */
856         mark_request_serialising(req, bdrv_get_cluster_size(bs));
857     }
858 
859     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
860         wait_serialising_requests(req);
861     }
862 
863     if (flags & BDRV_REQ_COPY_ON_READ) {
864         int pnum;
865 
866         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
867         if (ret < 0) {
868             goto out;
869         }
870 
871         if (!ret || pnum != nb_sectors) {
872             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
873             goto out;
874         }
875     }
876 
877     /* Forward the request to the BlockDriver */
878     if (!bs->zero_beyond_eof) {
879         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
880     } else {
881         /* Read zeroes after EOF */
882         int64_t total_sectors, max_nb_sectors;
883 
884         total_sectors = bdrv_nb_sectors(bs);
885         if (total_sectors < 0) {
886             ret = total_sectors;
887             goto out;
888         }
889 
890         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
891                                   align >> BDRV_SECTOR_BITS);
892         if (nb_sectors < max_nb_sectors) {
893             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
894         } else if (max_nb_sectors > 0) {
895             QEMUIOVector local_qiov;
896 
897             qemu_iovec_init(&local_qiov, qiov->niov);
898             qemu_iovec_concat(&local_qiov, qiov, 0,
899                               max_nb_sectors * BDRV_SECTOR_SIZE);
900 
901             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
902                                      &local_qiov);
903 
904             qemu_iovec_destroy(&local_qiov);
905         } else {
906             ret = 0;
907         }
908 
909         /* Reading beyond end of file is supposed to produce zeroes */
910         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
911             uint64_t offset = MAX(0, total_sectors - sector_num);
912             uint64_t bytes = (sector_num + nb_sectors - offset) *
913                               BDRV_SECTOR_SIZE;
914             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
915         }
916     }
917 
918 out:
919     return ret;
920 }
921 
922 /*
923  * Handle a read request in coroutine context
924  */
925 int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
926     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
927     BdrvRequestFlags flags)
928 {
929     BlockDriver *drv = bs->drv;
930     BdrvTrackedRequest req;
931 
932     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
933     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
934     uint8_t *head_buf = NULL;
935     uint8_t *tail_buf = NULL;
936     QEMUIOVector local_qiov;
937     bool use_local_qiov = false;
938     int ret;
939 
940     if (!drv) {
941         return -ENOMEDIUM;
942     }
943 
944     ret = bdrv_check_byte_request(bs, offset, bytes);
945     if (ret < 0) {
946         return ret;
947     }
948 
949     /* Don't do copy-on-read when reading data ahead of a write operation */
950     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
951         flags |= BDRV_REQ_COPY_ON_READ;
952     }
953 
954     /* throttling disk I/O */
955     if (bs->io_limits_enabled) {
956         throttle_group_co_io_limits_intercept(bs, bytes, false);
957     }
958 
959     /* Align read if necessary by padding qiov */
960     if (offset & (align - 1)) {
961         head_buf = qemu_blockalign(bs, align);
962         qemu_iovec_init(&local_qiov, qiov->niov + 2);
963         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
964         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
965         use_local_qiov = true;
966 
967         bytes += offset & (align - 1);
968         offset = offset & ~(align - 1);
969     }
970 
971     if ((offset + bytes) & (align - 1)) {
972         if (!use_local_qiov) {
973             qemu_iovec_init(&local_qiov, qiov->niov + 1);
974             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
975             use_local_qiov = true;
976         }
977         tail_buf = qemu_blockalign(bs, align);
978         qemu_iovec_add(&local_qiov, tail_buf,
979                        align - ((offset + bytes) & (align - 1)));
980 
981         bytes = ROUND_UP(bytes, align);
982     }
983 
984     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
985     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
986                               use_local_qiov ? &local_qiov : qiov,
987                               flags);
988     tracked_request_end(&req);
989 
990     if (use_local_qiov) {
991         qemu_iovec_destroy(&local_qiov);
992         qemu_vfree(head_buf);
993         qemu_vfree(tail_buf);
994     }
995 
996     return ret;
997 }
998 
999 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1000     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1001     BdrvRequestFlags flags)
1002 {
1003     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1004         return -EINVAL;
1005     }
1006 
1007     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1008                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1009 }
1010 
1011 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1012     int nb_sectors, QEMUIOVector *qiov)
1013 {
1014     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1015 
1016     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1017 }
1018 
1019 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
1020     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1021 {
1022     trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
1023 
1024     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1025                             BDRV_REQ_NO_SERIALISING);
1026 }
1027 
1028 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1029     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1030 {
1031     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1032 
1033     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1034                             BDRV_REQ_COPY_ON_READ);
1035 }
1036 
1037 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1038 
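/*
 * Write zeroes to a sector range, preferring the driver's efficient
 * .bdrv_co_write_zeroes callback and falling back to writing a zeroed bounce
 * buffer through .bdrv_co_writev.
 */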
1039 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1040     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
1041 {
1042     BlockDriver *drv = bs->drv;
1043     QEMUIOVector qiov;
1044     struct iovec iov = {0};
1045     int ret = 0;
1046 
1047     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
1048                                         BDRV_REQUEST_MAX_SECTORS);
1049 
1050     while (nb_sectors > 0 && !ret) {
1051         int num = nb_sectors;
1052 
1053         /* Align request.  Block drivers can expect the "bulk" of the request
1054          * to be aligned.
1055          */
1056         if (bs->bl.write_zeroes_alignment
1057             && num > bs->bl.write_zeroes_alignment) {
1058             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
1059                 /* Make a small request up to the first aligned sector.  */
1060                 num = bs->bl.write_zeroes_alignment;
1061                 num -= sector_num % bs->bl.write_zeroes_alignment;
1062             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
1063                 /* Shorten the request to the last aligned sector.  num cannot
1064                  * underflow because num > bs->bl.write_zeroes_alignment.
1065                  */
1066                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
1067             }
1068         }
1069 
1070         /* limit request size */
1071         if (num > max_write_zeroes) {
1072             num = max_write_zeroes;
1073         }
1074 
1075         ret = -ENOTSUP;
1076         /* First try the efficient write zeroes operation */
1077         if (drv->bdrv_co_write_zeroes) {
1078             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
1079         }
1080 
1081         if (ret == -ENOTSUP) {
1082             /* Fall back to bounce buffer if write zeroes is unsupported */
1083             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1084                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1085             num = MIN(num, max_xfer_len);
1086             iov.iov_len = num * BDRV_SECTOR_SIZE;
1087             if (iov.iov_base == NULL) {
1088                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
1089                 if (iov.iov_base == NULL) {
1090                     ret = -ENOMEM;
1091                     goto fail;
1092                 }
1093                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
1094             }
1095             qemu_iovec_init_external(&qiov, &iov, 1);
1096 
1097             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
1098 
1099             /* Keep the bounce buffer around if it is big enough for
1100              * all future requests.
1101              */
1102             if (num < max_xfer_len) {
1103                 qemu_vfree(iov.iov_base);
1104                 iov.iov_base = NULL;
1105             }
1106         }
1107 
1108         sector_num += num;
1109         nb_sectors -= num;
1110     }
1111 
1112 fail:
1113     qemu_vfree(iov.iov_base);
1114     return ret;
1115 }
1116 
1117 /*
1118  * Forwards an already correctly aligned write request to the BlockDriver.
1119  */
1120 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1121     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1122     QEMUIOVector *qiov, int flags)
1123 {
1124     BlockDriver *drv = bs->drv;
1125     bool waited;
1126     int ret;
1127 
1128     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1129     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1130 
1131     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1132     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1133     assert(!qiov || bytes == qiov->size);
1134     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1135 
1136     waited = wait_serialising_requests(req);
1137     assert(!waited || !req->serialising);
1138     assert(req->overlap_offset <= offset);
1139     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1140 
1141     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1142 
1143     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1144         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
1145         qemu_iovec_is_zero(qiov)) {
1146         flags |= BDRV_REQ_ZERO_WRITE;
1147         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1148             flags |= BDRV_REQ_MAY_UNMAP;
1149         }
1150     }
1151 
1152     if (ret < 0) {
1153         /* Do nothing, write notifier decided to fail this request */
1154     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1155         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1156         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
1157     } else if (drv->bdrv_co_writev_flags) {
1158         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1159         ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
1160                                         flags);
1161     } else {
1162         assert(drv->supported_write_flags == 0);
1163         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1164         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1165     }
1166     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1167 
1168     if (ret == 0 && (flags & BDRV_REQ_FUA) &&
1169         !(drv->supported_write_flags & BDRV_REQ_FUA))
1170     {
1171         ret = bdrv_co_flush(bs);
1172     }
1173 
1174     bdrv_set_dirty(bs, sector_num, nb_sectors);
1175 
1176     if (bs->wr_highest_offset < offset + bytes) {
1177         bs->wr_highest_offset = offset + bytes;
1178     }
1179 
1180     if (ret >= 0) {
1181         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1182     }
1183 
1184     return ret;
1185 }
1186 
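/*
 * Zero a possibly unaligned byte range: read-modify-write the unaligned head
 * and tail, and write zeroes for the aligned middle part.
 */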
1187 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1188                                                 int64_t offset,
1189                                                 unsigned int bytes,
1190                                                 BdrvRequestFlags flags,
1191                                                 BdrvTrackedRequest *req)
1192 {
1193     uint8_t *buf = NULL;
1194     QEMUIOVector local_qiov;
1195     struct iovec iov;
1196     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1197     unsigned int head_padding_bytes, tail_padding_bytes;
1198     int ret = 0;
1199 
1200     head_padding_bytes = offset & (align - 1);
1201     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1202 
1203 
1204     assert(flags & BDRV_REQ_ZERO_WRITE);
1205     if (head_padding_bytes || tail_padding_bytes) {
1206         buf = qemu_blockalign(bs, align);
1207         iov = (struct iovec) {
1208             .iov_base   = buf,
1209             .iov_len    = align,
1210         };
1211         qemu_iovec_init_external(&local_qiov, &iov, 1);
1212     }
1213     if (head_padding_bytes) {
1214         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1215 
1216         /* RMW the unaligned part before head. */
1217         mark_request_serialising(req, align);
1218         wait_serialising_requests(req);
1219         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1220         ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1221                                   align, &local_qiov, 0);
1222         if (ret < 0) {
1223             goto fail;
1224         }
1225         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1226 
1227         memset(buf + head_padding_bytes, 0, zero_bytes);
1228         ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1229                                    &local_qiov,
1230                                    flags & ~BDRV_REQ_ZERO_WRITE);
1231         if (ret < 0) {
1232             goto fail;
1233         }
1234         offset += zero_bytes;
1235         bytes -= zero_bytes;
1236     }
1237 
1238     assert(!bytes || (offset & (align - 1)) == 0);
1239     if (bytes >= align) {
1240         /* Write the aligned part in the middle. */
1241         uint64_t aligned_bytes = bytes & ~(align - 1);
1242         ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1243                                    NULL, flags);
1244         if (ret < 0) {
1245             goto fail;
1246         }
1247         bytes -= aligned_bytes;
1248         offset += aligned_bytes;
1249     }
1250 
1251     assert(!bytes || (offset & (align - 1)) == 0);
1252     if (bytes) {
1253         assert(align == tail_padding_bytes + bytes);
1254         /* RMW the unaligned part after tail. */
1255         mark_request_serialising(req, align);
1256         wait_serialising_requests(req);
1257         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1258         ret = bdrv_aligned_preadv(bs, req, offset, align,
1259                                   align, &local_qiov, 0);
1260         if (ret < 0) {
1261             goto fail;
1262         }
1263         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1264 
1265         memset(buf, 0, bytes);
1266         ret = bdrv_aligned_pwritev(bs, req, offset, align,
1267                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1268     }
1269 fail:
1270     qemu_vfree(buf);
1271     return ret;
1272 
1273 }
1274 
1275 /*
1276  * Handle a write request in coroutine context
1277  */
1278 int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
1279     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1280     BdrvRequestFlags flags)
1281 {
1282     BdrvTrackedRequest req;
1283     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1284     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1285     uint8_t *head_buf = NULL;
1286     uint8_t *tail_buf = NULL;
1287     QEMUIOVector local_qiov;
1288     bool use_local_qiov = false;
1289     int ret;
1290 
1291     if (!bs->drv) {
1292         return -ENOMEDIUM;
1293     }
1294     if (bs->read_only) {
1295         return -EPERM;
1296     }
1297     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1298 
1299     ret = bdrv_check_byte_request(bs, offset, bytes);
1300     if (ret < 0) {
1301         return ret;
1302     }
1303 
1304     /* throttling disk I/O */
1305     if (bs->io_limits_enabled) {
1306         throttle_group_co_io_limits_intercept(bs, bytes, true);
1307     }
1308 
1309     /*
1310      * Align write if necessary by performing a read-modify-write cycle.
1311      * Pad qiov with the read parts and be sure to have a tracked request not
1312      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1313      */
1314     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1315 
1316     if (!qiov) {
1317         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1318         goto out;
1319     }
1320 
1321     if (offset & (align - 1)) {
1322         QEMUIOVector head_qiov;
1323         struct iovec head_iov;
1324 
1325         mark_request_serialising(&req, align);
1326         wait_serialising_requests(&req);
1327 
1328         head_buf = qemu_blockalign(bs, align);
1329         head_iov = (struct iovec) {
1330             .iov_base   = head_buf,
1331             .iov_len    = align,
1332         };
1333         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1334 
1335         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1336         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1337                                   align, &head_qiov, 0);
1338         if (ret < 0) {
1339             goto fail;
1340         }
1341         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1342 
1343         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1344         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1345         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1346         use_local_qiov = true;
1347 
1348         bytes += offset & (align - 1);
1349         offset = offset & ~(align - 1);
1350     }
1351 
1352     if ((offset + bytes) & (align - 1)) {
1353         QEMUIOVector tail_qiov;
1354         struct iovec tail_iov;
1355         size_t tail_bytes;
1356         bool waited;
1357 
1358         mark_request_serialising(&req, align);
1359         waited = wait_serialising_requests(&req);
1360         assert(!waited || !use_local_qiov);
1361 
1362         tail_buf = qemu_blockalign(bs, align);
1363         tail_iov = (struct iovec) {
1364             .iov_base   = tail_buf,
1365             .iov_len    = align,
1366         };
1367         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1368 
1369         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1370         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1371                                   align, &tail_qiov, 0);
1372         if (ret < 0) {
1373             goto fail;
1374         }
1375         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1376 
1377         if (!use_local_qiov) {
1378             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1379             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1380             use_local_qiov = true;
1381         }
1382 
1383         tail_bytes = (offset + bytes) & (align - 1);
1384         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1385 
1386         bytes = ROUND_UP(bytes, align);
1387     }
1388 
1389     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1390                                use_local_qiov ? &local_qiov : qiov,
1391                                flags);
1392 
1393 fail:
1394 
1395     if (use_local_qiov) {
1396         qemu_iovec_destroy(&local_qiov);
1397     }
1398     qemu_vfree(head_buf);
1399     qemu_vfree(tail_buf);
1400 out:
1401     tracked_request_end(&req);
1402     return ret;
1403 }
1404 
1405 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1406     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1407     BdrvRequestFlags flags)
1408 {
1409     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1410         return -EINVAL;
1411     }
1412 
1413     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1414                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1415 }
1416 
1417 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1418     int nb_sectors, QEMUIOVector *qiov)
1419 {
1420     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1421 
1422     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1423 }
1424 
1425 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1426                                       int64_t sector_num, int nb_sectors,
1427                                       BdrvRequestFlags flags)
1428 {
1429     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
1430 
1431     if (!(bs->open_flags & BDRV_O_UNMAP)) {
1432         flags &= ~BDRV_REQ_MAY_UNMAP;
1433     }
1434 
1435     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1436                              BDRV_REQ_ZERO_WRITE | flags);
1437 }
1438 
1439 typedef struct BdrvCoGetBlockStatusData {
1440     BlockDriverState *bs;
1441     BlockDriverState *base;
1442     BlockDriverState **file;
1443     int64_t sector_num;
1444     int nb_sectors;
1445     int *pnum;
1446     int64_t ret;
1447     bool done;
1448 } BdrvCoGetBlockStatusData;
1449 
1450 /*
1451  * Returns the allocation status of the specified sectors.
1452  * Drivers not implementing the functionality are assumed to not support
1453  * backing files, hence all their sectors are reported as allocated.
1454  *
1455  * If 'sector_num' is beyond the end of the disk image the return value is 0
1456  * and 'pnum' is set to 0.
1457  *
1458  * 'pnum' is set to the number of sectors (including and immediately following
1459  * the specified sector) that are known to be in the same
1460  * allocated/unallocated state.
1461  *
1462  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1463  * beyond the end of the disk image it will be clamped.
1464  *
1465  * If the returned value is positive and the BDRV_BLOCK_OFFSET_VALID bit is
1466  * set, 'file' points to the BDS in which the sector range is allocated.
1467  */
1468 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1469                                                      int64_t sector_num,
1470                                                      int nb_sectors, int *pnum,
1471                                                      BlockDriverState **file)
1472 {
1473     int64_t total_sectors;
1474     int64_t n;
1475     int64_t ret, ret2;
1476 
1477     total_sectors = bdrv_nb_sectors(bs);
1478     if (total_sectors < 0) {
1479         return total_sectors;
1480     }
1481 
1482     if (sector_num >= total_sectors) {
1483         *pnum = 0;
1484         return 0;
1485     }
1486 
1487     n = total_sectors - sector_num;
1488     if (n < nb_sectors) {
1489         nb_sectors = n;
1490     }
1491 
1492     if (!bs->drv->bdrv_co_get_block_status) {
1493         *pnum = nb_sectors;
1494         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1495         if (bs->drv->protocol_name) {
1496             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1497         }
1498         return ret;
1499     }
1500 
1501     *file = NULL;
1502     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1503                                             file);
1504     if (ret < 0) {
1505         *pnum = 0;
1506         return ret;
1507     }
1508 
1509     if (ret & BDRV_BLOCK_RAW) {
1510         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1511         return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1512                                      *pnum, pnum, file);
1513     }
1514 
1515     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1516         ret |= BDRV_BLOCK_ALLOCATED;
1517     } else {
1518         if (bdrv_unallocated_blocks_are_zero(bs)) {
1519             ret |= BDRV_BLOCK_ZERO;
1520         } else if (bs->backing) {
1521             BlockDriverState *bs2 = bs->backing->bs;
1522             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1523             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1524                 ret |= BDRV_BLOCK_ZERO;
1525             }
1526         }
1527     }
1528 
1529     if (*file && *file != bs &&
1530         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1531         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1532         BlockDriverState *file2;
1533         int file_pnum;
1534 
1535         ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1536                                         *pnum, &file_pnum, &file2);
1537         if (ret2 >= 0) {
1538             /* Ignore errors.  This is just providing extra information; it
1539              * is useful but not necessary.
1540              */
1541             if (!file_pnum) {
1542                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1543                  * perfectly valid for the format block driver to point to such
1544                  * offsets, so catch it and mark everything as zero */
1545                 ret |= BDRV_BLOCK_ZERO;
1546             } else {
1547                 /* Limit request to the range reported by the protocol driver */
1548                 *pnum = file_pnum;
1549                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1550             }
1551         }
1552     }
1553 
1554     return ret;
1555 }
1556 
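/*
 * Like bdrv_co_get_block_status(), but walk the backing chain from @bs down
 * to (but not including) @base, stopping at the first layer where the range
 * is allocated or an error occurs.
 */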
1557 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1558         BlockDriverState *base,
1559         int64_t sector_num,
1560         int nb_sectors,
1561         int *pnum,
1562         BlockDriverState **file)
1563 {
1564     BlockDriverState *p;
1565     int64_t ret = 0;
1566 
1567     assert(bs != base);
1568     for (p = bs; p != base; p = backing_bs(p)) {
1569         ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1570         if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1571             break;
1572         }
1573         /* [sector_num, pnum] unallocated on this layer, which could be only
1574          * the first part of [sector_num, nb_sectors].  */
1575         nb_sectors = MIN(nb_sectors, *pnum);
1576     }
1577     return ret;
1578 }
1579 
1580 /* Coroutine wrapper for bdrv_get_block_status_above() */
1581 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1582 {
1583     BdrvCoGetBlockStatusData *data = opaque;
1584 
1585     data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1586                                                data->sector_num,
1587                                                data->nb_sectors,
1588                                                data->pnum,
1589                                                data->file);
1590     data->done = true;
1591 }
1592 
1593 /*
1594  * Synchronous wrapper around bdrv_co_get_block_status_above().
1595  *
1596  * See bdrv_co_get_block_status_above() for details.
1597  */
1598 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1599                                     BlockDriverState *base,
1600                                     int64_t sector_num,
1601                                     int nb_sectors, int *pnum,
1602                                     BlockDriverState **file)
1603 {
1604     Coroutine *co;
1605     BdrvCoGetBlockStatusData data = {
1606         .bs = bs,
1607         .base = base,
1608         .file = file,
1609         .sector_num = sector_num,
1610         .nb_sectors = nb_sectors,
1611         .pnum = pnum,
1612         .done = false,
1613     };
1614 
1615     if (qemu_in_coroutine()) {
1616         /* Fast-path if already in coroutine context */
1617         bdrv_get_block_status_above_co_entry(&data);
1618     } else {
1619         AioContext *aio_context = bdrv_get_aio_context(bs);
1620 
1621         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
1622         qemu_coroutine_enter(co, &data);
1623         while (!data.done) {
1624             aio_poll(aio_context, true);
1625         }
1626     }
1627     return data.ret;
1628 }
1629 
1630 int64_t bdrv_get_block_status(BlockDriverState *bs,
1631                               int64_t sector_num,
1632                               int nb_sectors, int *pnum,
1633                               BlockDriverState **file)
1634 {
1635     return bdrv_get_block_status_above(bs, backing_bs(bs),
1636                                        sector_num, nb_sectors, pnum, file);
1637 }
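
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * one way a caller can walk an image's allocation map with
 * bdrv_get_block_status().  The helper name example_dump_block_status() is
 * hypothetical.
 */
static void example_dump_block_status(BlockDriverState *bs)
{
    int64_t total_sectors = bdrv_nb_sectors(bs);
    int64_t sector_num = 0;

    while (sector_num < total_sectors) {
        BlockDriverState *file;
        int pnum;
        int nb_sectors = MIN(total_sectors - sector_num,
                             BDRV_REQUEST_MAX_SECTORS);
        int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors,
                                            &pnum, &file);

        if (ret < 0 || pnum == 0) {
            break;
        }
        printf("sectors %" PRId64 "+%d:%s%s%s%s\n", sector_num, pnum,
               (ret & BDRV_BLOCK_ALLOCATED) ? " allocated" : "",
               (ret & BDRV_BLOCK_DATA) ? " data" : "",
               (ret & BDRV_BLOCK_ZERO) ? " zero" : "",
               (ret & BDRV_BLOCK_OFFSET_VALID) ? " offset-valid" : "");
        sector_num += pnum;
    }
}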
1638 
1639 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1640                                    int nb_sectors, int *pnum)
1641 {
1642     BlockDriverState *file;
1643     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1644                                         &file);
1645     if (ret < 0) {
1646         return ret;
1647     }
1648     return !!(ret & BDRV_BLOCK_ALLOCATED);
1649 }
1650 
1651 /*
1652  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1653  *
1654  * Return true if the given sector is allocated in any image between
1655  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
1656  * sector is allocated in any image of the chain.  Return false otherwise.
1657  *
1658  * 'pnum' is set to the number of sectors (including and immediately following
1659  * the specified sector) that are known to be in the same
1660  * allocated/unallocated state.
1662  */
1663 int bdrv_is_allocated_above(BlockDriverState *top,
1664                             BlockDriverState *base,
1665                             int64_t sector_num,
1666                             int nb_sectors, int *pnum)
1667 {
1668     BlockDriverState *intermediate;
1669     int ret, n = nb_sectors;
1670 
1671     intermediate = top;
1672     while (intermediate && intermediate != base) {
1673         int pnum_inter;
1674         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1675                                 &pnum_inter);
1676         if (ret < 0) {
1677             return ret;
1678         } else if (ret) {
1679             *pnum = pnum_inter;
1680             return 1;
1681         }
1682 
1683         /*
1684          * [sector_num, nb_sectors] is unallocated on top but intermediate
1685          * might have [sector_num+x, nb_sectors] allocated.
1686          */
1689         if (n > pnum_inter &&
1690             (intermediate == top ||
1691              sector_num + pnum_inter < intermediate->total_sectors)) {
1692             n = pnum_inter;
1693         }
1694 
1695         intermediate = backing_bs(intermediate);
1696     }
1697 
1698     *pnum = n;
1699     return 0;
1700 }
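
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * counting how many sectors would have to be copied out of the chain between
 * TOP and BASE, the kind of scan a commit-style loop performs.  The helper
 * name is hypothetical.
 */
static int64_t example_count_allocated_above(BlockDriverState *top,
                                             BlockDriverState *base)
{
    int64_t total_sectors = bdrv_nb_sectors(top);
    int64_t sector_num = 0;
    int64_t allocated = 0;

    while (sector_num < total_sectors) {
        int pnum;
        int nb_sectors = MIN(total_sectors - sector_num,
                             BDRV_REQUEST_MAX_SECTORS);
        int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
                                          &pnum);

        if (ret < 0) {
            return ret;
        }
        if (ret) {
            allocated += pnum;
        }
        sector_num += pnum ? pnum : nb_sectors;
    }
    return allocated;
}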
1701 
1702 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1703                           const uint8_t *buf, int nb_sectors)
1704 {
1705     BlockDriver *drv = bs->drv;
1706     int ret;
1707 
1708     if (!drv) {
1709         return -ENOMEDIUM;
1710     }
1711     if (!drv->bdrv_write_compressed) {
1712         return -ENOTSUP;
1713     }
1714     ret = bdrv_check_request(bs, sector_num, nb_sectors);
1715     if (ret < 0) {
1716         return ret;
1717     }
1718 
1719     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1720 
1721     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1722 }
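
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * trying the compressed write path first and falling back to a plain write
 * when the format driver cannot compress.  The helper name is hypothetical.
 */
static int example_try_write_compressed(BlockDriverState *bs,
                                        int64_t sector_num,
                                        const uint8_t *buf, int nb_sectors)
{
    int ret = bdrv_write_compressed(bs, sector_num, buf, nb_sectors);

    if (ret == -ENOTSUP) {
        /* No driver support for compression; write the data uncompressed. */
        ret = bdrv_write(bs, sector_num, buf, nb_sectors);
    }
    return ret;
}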
1723 
1724 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1725                       int64_t pos, int size)
1726 {
1727     QEMUIOVector qiov;
1728     struct iovec iov = {
1729         .iov_base   = (void *) buf,
1730         .iov_len    = size,
1731     };
1732 
1733     qemu_iovec_init_external(&qiov, &iov, 1);
1734     return bdrv_writev_vmstate(bs, &qiov, pos);
1735 }
1736 
1737 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1738 {
1739     BlockDriver *drv = bs->drv;
1740 
1741     if (!drv) {
1742         return -ENOMEDIUM;
1743     } else if (drv->bdrv_save_vmstate) {
1744         return drv->bdrv_save_vmstate(bs, qiov, pos);
1745     } else if (bs->file) {
1746         return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
1747     }
1748 
1749     return -ENOTSUP;
1750 }
1751 
1752 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1753                       int64_t pos, int size)
1754 {
1755     BlockDriver *drv = bs->drv;
1756     if (!drv)
1757         return -ENOMEDIUM;
1758     if (drv->bdrv_load_vmstate)
1759         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1760     if (bs->file)
1761         return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
1762     return -ENOTSUP;
1763 }
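
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * a small round trip through the VM state area of an image, which is what the
 * savevm/loadvm code does with much larger buffers.  The helper name is
 * hypothetical.
 */
static int example_vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[512] = { 0xaa };
    uint8_t in[512];
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
    if (ret < 0) {
        return ret;
    }
    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    if (ret < 0) {
        return ret;
    }
    return memcmp(in, out, sizeof(out)) == 0 ? 0 : -EIO;
}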
1764 
1765 /**************************************************************/
1766 /* async I/Os */
1767 
1768 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1769                            QEMUIOVector *qiov, int nb_sectors,
1770                            BlockCompletionFunc *cb, void *opaque)
1771 {
1772     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1773 
1774     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1775                                  cb, opaque, false);
1776 }
1777 
1778 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1779                             QEMUIOVector *qiov, int nb_sectors,
1780                             BlockCompletionFunc *cb, void *opaque)
1781 {
1782     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1783 
1784     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1785                                  cb, opaque, true);
1786 }
1787 
1788 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
1789         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
1790         BlockCompletionFunc *cb, void *opaque)
1791 {
1792     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
1793 
1794     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
1795                                  BDRV_REQ_ZERO_WRITE | flags,
1796                                  cb, opaque, true);
1797 }
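
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * issuing an asynchronous read and waiting for its completion callback by
 * polling the BDS's AioContext.  The example_* names are hypothetical.
 */
typedef struct ExampleAioDone {
    bool done;
    int ret;
} ExampleAioDone;

static void example_aio_cb(void *opaque, int ret)
{
    ExampleAioDone *d = opaque;

    d->ret = ret;
    d->done = true;
}

static int example_read_one_sector_async(BlockDriverState *bs,
                                         int64_t sector_num, uint8_t *buf)
{
    ExampleAioDone d = { .done = false };
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    if (!bdrv_aio_readv(bs, sector_num, &qiov, 1, example_aio_cb, &d)) {
        return -EIO;
    }
    while (!d.done) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }
    return d.ret;
}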
1798 
1799 
1800 typedef struct MultiwriteCB {
1801     int error;
1802     int num_requests;
1803     int num_callbacks;
1804     struct {
1805         BlockCompletionFunc *cb;
1806         void *opaque;
1807         QEMUIOVector *free_qiov;
1808     } callbacks[];
1809 } MultiwriteCB;
1810 
1811 static void multiwrite_user_cb(MultiwriteCB *mcb)
1812 {
1813     int i;
1814 
1815     for (i = 0; i < mcb->num_callbacks; i++) {
1816         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1817         if (mcb->callbacks[i].free_qiov) {
1818             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
1819         }
1820         g_free(mcb->callbacks[i].free_qiov);
1821     }
1822 }
1823 
1824 static void multiwrite_cb(void *opaque, int ret)
1825 {
1826     MultiwriteCB *mcb = opaque;
1827 
1828     trace_multiwrite_cb(mcb, ret);
1829 
1830     if (ret < 0 && !mcb->error) {
1831         mcb->error = ret;
1832     }
1833 
1834     mcb->num_requests--;
1835     if (mcb->num_requests == 0) {
1836         multiwrite_user_cb(mcb);
1837         g_free(mcb);
1838     }
1839 }
1840 
1841 static int multiwrite_req_compare(const void *a, const void *b)
1842 {
1843     const BlockRequest *req1 = a, *req2 = b;
1844 
1845     /*
1846      * Note that we can't simply subtract req2->sector from req1->sector
1847      * here as that could overflow the return value.
1848      */
1849     if (req1->sector > req2->sector) {
1850         return 1;
1851     } else if (req1->sector < req2->sector) {
1852         return -1;
1853     } else {
1854         return 0;
1855     }
1856 }
1857 
1858 /*
1859  * Takes a bunch of requests and tries to merge them. Returns the number of
1860  * requests that remain after merging.
1861  */
1862 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
1863     int num_reqs, MultiwriteCB *mcb)
1864 {
1865     int i, outidx;
1866 
1867     // Sort requests by start sector
1868     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
1869 
1870     // Combine adjacent requests if they are exactly sequential or overlapping;
1871     // the merge condition below ensures there is never a gap to fill.
1872     outidx = 0;
1873     for (i = 1; i < num_reqs; i++) {
1874         int merge = 0;
1875         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
1876 
1877         // Handle exactly sequential writes and overlapping writes.
1878         if (reqs[i].sector <= oldreq_last) {
1879             merge = 1;
1880         }
1881 
1882         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
1883             bs->bl.max_iov) {
1884             merge = 0;
1885         }
1886 
1887         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
1888             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
1889             merge = 0;
1890         }
1891 
1892         if (merge) {
1893             size_t size;
1894             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
1895             qemu_iovec_init(qiov,
1896                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
1897 
1898             // Add the first request to the merged one. If the requests are
1899             // overlapping, drop the last sectors of the first request.
1900             size = (reqs[i].sector - reqs[outidx].sector) << 9;
1901             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
1902 
1903             // We shouldn't need to add any zeros between the two requests
1904             assert (reqs[i].sector <= oldreq_last);
1905 
1906             // Add the second request
1907             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
1908 
1909             // Add tail of first request, if necessary
1910             if (qiov->size < reqs[outidx].qiov->size) {
1911                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
1912                                   reqs[outidx].qiov->size - qiov->size);
1913             }
1914 
1915             reqs[outidx].nb_sectors = qiov->size >> 9;
1916             reqs[outidx].qiov = qiov;
1917 
1918             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
1919         } else {
1920             outidx++;
1921             reqs[outidx].sector     = reqs[i].sector;
1922             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
1923             reqs[outidx].qiov       = reqs[i].qiov;
1924         }
1925     }
1926 
1927     if (bs->blk) {
1928         block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
1929                               num_reqs - outidx - 1);
1930     }
1931 
1932     return outidx + 1;
1933 }
1934 
1935 /*
1936  * Submit multiple AIO write requests at once.
1937  *
1938  * On success, the function returns 0 and all requests in the reqs array have
1939  * been submitted. On error it returns -1, and any of the requests may or may
1940  * not have been submitted yet. In particular, this means that the callback
1941  * will be called for some of the requests but not for others. The caller must
1942  * check the error field of each BlockRequest to know which callbacks to wait
1943  * for (if error != 0, no callback will be called for that request).
1944  *
1945  * The implementation may modify the contents of the reqs array, e.g. to merge
1946  * requests. However, the fields opaque and error are left unmodified as they
1947  * are used to signal failure for a single request to the caller.
1948  */
1949 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
1950 {
1951     MultiwriteCB *mcb;
1952     int i;
1953 
1954     /* don't submit writes if we don't have a medium */
1955     if (bs->drv == NULL) {
1956         for (i = 0; i < num_reqs; i++) {
1957             reqs[i].error = -ENOMEDIUM;
1958         }
1959         return -1;
1960     }
1961 
1962     if (num_reqs == 0) {
1963         return 0;
1964     }
1965 
1966     // Create MultiwriteCB structure
1967     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
1968     mcb->num_requests = 0;
1969     mcb->num_callbacks = num_reqs;
1970 
1971     for (i = 0; i < num_reqs; i++) {
1972         mcb->callbacks[i].cb = reqs[i].cb;
1973         mcb->callbacks[i].opaque = reqs[i].opaque;
1974     }
1975 
1976     // Check for mergeable requests
1977     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
1978 
1979     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
1980 
1981     /* Run the aio requests. */
1982     mcb->num_requests = num_reqs;
1983     for (i = 0; i < num_reqs; i++) {
1984         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
1985                               reqs[i].nb_sectors, reqs[i].flags,
1986                               multiwrite_cb, mcb,
1987                               true);
1988     }
1989 
1990     return 0;
1991 }
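
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * submitting two adjacent writes as one multiwrite batch.  Error handling
 * follows the contract documented above; the helper name is hypothetical and
 * the caller is assumed to provide the qiovs and completion callback.
 */
static int example_submit_two_writes(BlockDriverState *bs, int64_t sector_num,
                                     QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                                     BlockCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2];

    memset(reqs, 0, sizeof(reqs));
    reqs[0].sector     = sector_num;
    reqs[0].nb_sectors = qiov0->size >> BDRV_SECTOR_BITS;
    reqs[0].qiov       = qiov0;
    reqs[0].cb         = cb;
    reqs[0].opaque     = opaque;
    /* Exactly sequential, so multiwrite_merge() may combine the two */
    reqs[1].sector     = sector_num + reqs[0].nb_sectors;
    reqs[1].nb_sectors = qiov1->size >> BDRV_SECTOR_BITS;
    reqs[1].qiov       = qiov1;
    reqs[1].cb         = cb;
    reqs[1].opaque     = opaque;

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* A callback only runs for requests whose error field is still 0 */
        return reqs[0].error ? reqs[0].error : reqs[1].error;
    }
    return 0;
}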
1992 
1993 void bdrv_aio_cancel(BlockAIOCB *acb)
1994 {
1995     qemu_aio_ref(acb);
1996     bdrv_aio_cancel_async(acb);
1997     while (acb->refcnt > 1) {
1998         if (acb->aiocb_info->get_aio_context) {
1999             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2000         } else if (acb->bs) {
2001             aio_poll(bdrv_get_aio_context(acb->bs), true);
2002         } else {
2003             abort();
2004         }
2005     }
2006     qemu_aio_unref(acb);
2007 }
2008 
2009 /* Async version of aio cancel. The caller is not blocked if the acb implements
2010  * cancel_async; otherwise we do nothing and let the request complete normally.
2011  * In either case the completion callback must be called. */
2012 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2013 {
2014     if (acb->aiocb_info->cancel_async) {
2015         acb->aiocb_info->cancel_async(acb);
2016     }
2017 }
2018 
2019 /**************************************************************/
2020 /* async block device emulation */
2021 
2022 typedef struct BlockAIOCBSync {
2023     BlockAIOCB common;
2024     QEMUBH *bh;
2025     int ret;
2026     /* vector translation state */
2027     QEMUIOVector *qiov;
2028     uint8_t *bounce;
2029     int is_write;
2030 } BlockAIOCBSync;
2031 
2032 static const AIOCBInfo bdrv_em_aiocb_info = {
2033     .aiocb_size         = sizeof(BlockAIOCBSync),
2034 };
2035 
2036 static void bdrv_aio_bh_cb(void *opaque)
2037 {
2038     BlockAIOCBSync *acb = opaque;
2039 
2040     if (!acb->is_write && acb->ret >= 0) {
2041         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
2042     }
2043     qemu_vfree(acb->bounce);
2044     acb->common.cb(acb->common.opaque, acb->ret);
2045     qemu_bh_delete(acb->bh);
2046     acb->bh = NULL;
2047     qemu_aio_unref(acb);
2048 }
2049 
2050 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2051                                       int64_t sector_num,
2052                                       QEMUIOVector *qiov,
2053                                       int nb_sectors,
2054                                       BlockCompletionFunc *cb,
2055                                       void *opaque,
2056                                       int is_write)
2057 
2058 {
2059     BlockAIOCBSync *acb;
2060 
2061     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
2062     acb->is_write = is_write;
2063     acb->qiov = qiov;
2064     acb->bounce = qemu_try_blockalign(bs, qiov->size);
2065     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
2066 
2067     if (acb->bounce == NULL) {
2068         acb->ret = -ENOMEM;
2069     } else if (is_write) {
2070         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
2071         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2072     } else {
2073         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2074     }
2075 
2076     qemu_bh_schedule(acb->bh);
2077 
2078     return &acb->common;
2079 }
2080 
2081 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2082         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2083         BlockCompletionFunc *cb, void *opaque)
2084 {
2085     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2086 }
2087 
2088 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2089         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2090         BlockCompletionFunc *cb, void *opaque)
2091 {
2092     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2093 }
2094 
2095 
2096 typedef struct BlockAIOCBCoroutine {
2097     BlockAIOCB common;
2098     BlockRequest req;
2099     bool is_write;
2100     bool need_bh;
2101     bool *done;
2102     QEMUBH* bh;
2103 } BlockAIOCBCoroutine;
2104 
2105 static const AIOCBInfo bdrv_em_co_aiocb_info = {
2106     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2107 };
2108 
2109 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2110 {
2111     if (!acb->need_bh) {
2112         acb->common.cb(acb->common.opaque, acb->req.error);
2113         qemu_aio_unref(acb);
2114     }
2115 }
2116 
2117 static void bdrv_co_em_bh(void *opaque)
2118 {
2119     BlockAIOCBCoroutine *acb = opaque;
2120 
2121     assert(!acb->need_bh);
2122     qemu_bh_delete(acb->bh);
2123     bdrv_co_complete(acb);
2124 }
2125 
2126 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2127 {
2128     acb->need_bh = false;
2129     if (acb->req.error != -EINPROGRESS) {
2130         BlockDriverState *bs = acb->common.bs;
2131 
2132         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2133         qemu_bh_schedule(acb->bh);
2134     }
2135 }
2136 
2137 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2138 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2139 {
2140     BlockAIOCBCoroutine *acb = opaque;
2141     BlockDriverState *bs = acb->common.bs;
2142 
2143     if (!acb->is_write) {
2144         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2145             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2146     } else {
2147         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2148             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2149     }
2150 
2151     bdrv_co_complete(acb);
2152 }
2153 
2154 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2155                                          int64_t sector_num,
2156                                          QEMUIOVector *qiov,
2157                                          int nb_sectors,
2158                                          BdrvRequestFlags flags,
2159                                          BlockCompletionFunc *cb,
2160                                          void *opaque,
2161                                          bool is_write)
2162 {
2163     Coroutine *co;
2164     BlockAIOCBCoroutine *acb;
2165 
2166     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2167     acb->need_bh = true;
2168     acb->req.error = -EINPROGRESS;
2169     acb->req.sector = sector_num;
2170     acb->req.nb_sectors = nb_sectors;
2171     acb->req.qiov = qiov;
2172     acb->req.flags = flags;
2173     acb->is_write = is_write;
2174 
2175     co = qemu_coroutine_create(bdrv_co_do_rw);
2176     qemu_coroutine_enter(co, acb);
2177 
2178     bdrv_co_maybe_schedule_bh(acb);
2179     return &acb->common;
2180 }
2181 
2182 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2183 {
2184     BlockAIOCBCoroutine *acb = opaque;
2185     BlockDriverState *bs = acb->common.bs;
2186 
2187     acb->req.error = bdrv_co_flush(bs);
2188     bdrv_co_complete(acb);
2189 }
2190 
2191 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2192         BlockCompletionFunc *cb, void *opaque)
2193 {
2194     trace_bdrv_aio_flush(bs, opaque);
2195 
2196     Coroutine *co;
2197     BlockAIOCBCoroutine *acb;
2198 
2199     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2200     acb->need_bh = true;
2201     acb->req.error = -EINPROGRESS;
2202 
2203     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2204     qemu_coroutine_enter(co, acb);
2205 
2206     bdrv_co_maybe_schedule_bh(acb);
2207     return &acb->common;
2208 }
2209 
2210 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2211 {
2212     BlockAIOCBCoroutine *acb = opaque;
2213     BlockDriverState *bs = acb->common.bs;
2214 
2215     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2216     bdrv_co_complete(acb);
2217 }
2218 
2219 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2220         int64_t sector_num, int nb_sectors,
2221         BlockCompletionFunc *cb, void *opaque)
2222 {
2223     Coroutine *co;
2224     BlockAIOCBCoroutine *acb;
2225 
2226     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2227 
2228     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2229     acb->need_bh = true;
2230     acb->req.error = -EINPROGRESS;
2231     acb->req.sector = sector_num;
2232     acb->req.nb_sectors = nb_sectors;
2233     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2234     qemu_coroutine_enter(co, acb);
2235 
2236     bdrv_co_maybe_schedule_bh(acb);
2237     return &acb->common;
2238 }
2239 
2240 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2241                    BlockCompletionFunc *cb, void *opaque)
2242 {
2243     BlockAIOCB *acb;
2244 
2245     acb = g_malloc(aiocb_info->aiocb_size);
2246     acb->aiocb_info = aiocb_info;
2247     acb->bs = bs;
2248     acb->cb = cb;
2249     acb->opaque = opaque;
2250     acb->refcnt = 1;
2251     return acb;
2252 }
2253 
2254 void qemu_aio_ref(void *p)
2255 {
2256     BlockAIOCB *acb = p;
2257     acb->refcnt++;
2258 }
2259 
2260 void qemu_aio_unref(void *p)
2261 {
2262     BlockAIOCB *acb = p;
2263     assert(acb->refcnt > 0);
2264     if (--acb->refcnt == 0) {
2265         g_free(acb);
2266     }
2267 }
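
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * the minimal pattern a driver follows to embed its own state in a
 * BlockAIOCB via qemu_aio_get().  The Example* names are hypothetical.
 */
typedef struct ExampleAIOCB {
    BlockAIOCB common;
    int my_state;
} ExampleAIOCB;

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(ExampleAIOCB),
};

static BlockAIOCB *example_aio_start(BlockDriverState *bs,
                                     BlockCompletionFunc *cb, void *opaque)
{
    ExampleAIOCB *acb = qemu_aio_get(&example_aiocb_info, bs, cb, opaque);

    acb->my_state = 0;
    /* Kick off the asynchronous work here; once it finishes, invoke
     * acb->common.cb(acb->common.opaque, ret) and drop the reference with
     * qemu_aio_unref(acb). */
    return &acb->common;
}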
2268 
2269 /**************************************************************/
2270 /* Coroutine block device emulation */
2271 
2272 typedef struct CoroutineIOCompletion {
2273     Coroutine *coroutine;
2274     int ret;
2275 } CoroutineIOCompletion;
2276 
2277 static void bdrv_co_io_em_complete(void *opaque, int ret)
2278 {
2279     CoroutineIOCompletion *co = opaque;
2280 
2281     co->ret = ret;
2282     qemu_coroutine_enter(co->coroutine, NULL);
2283 }
2284 
2285 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2286                                       int nb_sectors, QEMUIOVector *iov,
2287                                       bool is_write)
2288 {
2289     CoroutineIOCompletion co = {
2290         .coroutine = qemu_coroutine_self(),
2291     };
2292     BlockAIOCB *acb;
2293 
2294     if (is_write) {
2295         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2296                                        bdrv_co_io_em_complete, &co);
2297     } else {
2298         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2299                                       bdrv_co_io_em_complete, &co);
2300     }
2301 
2302     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2303     if (!acb) {
2304         return -EIO;
2305     }
2306     qemu_coroutine_yield();
2307 
2308     return co.ret;
2309 }
2310 
2311 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2312                                          int64_t sector_num, int nb_sectors,
2313                                          QEMUIOVector *iov)
2314 {
2315     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2316 }
2317 
2318 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2319                                          int64_t sector_num, int nb_sectors,
2320                                          QEMUIOVector *iov)
2321 {
2322     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2323 }
2324 
2325 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2326 {
2327     RwCo *rwco = opaque;
2328 
2329     rwco->ret = bdrv_co_flush(rwco->bs);
2330 }
2331 
2332 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2333 {
2334     int ret;
2335     BdrvTrackedRequest req;
2336 
2337     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2338         bdrv_is_sg(bs)) {
2339         return 0;
2340     }
2341 
2342     tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2343 
2344     /* Write back all layers by calling one driver function */
2345     if (bs->drv->bdrv_co_flush) {
2346         ret = bs->drv->bdrv_co_flush(bs);
2347         goto out;
2348     }
2349 
2350     /* Write back cached data to the OS even with cache=unsafe */
2351     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2352     if (bs->drv->bdrv_co_flush_to_os) {
2353         ret = bs->drv->bdrv_co_flush_to_os(bs);
2354         if (ret < 0) {
2355             goto out;
2356         }
2357     }
2358 
2359     /* But don't actually force it to the disk with cache=unsafe */
2360     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2361         goto flush_parent;
2362     }
2363 
2364     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2365     if (bs->drv->bdrv_co_flush_to_disk) {
2366         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2367     } else if (bs->drv->bdrv_aio_flush) {
2368         BlockAIOCB *acb;
2369         CoroutineIOCompletion co = {
2370             .coroutine = qemu_coroutine_self(),
2371         };
2372 
2373         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2374         if (acb == NULL) {
2375             ret = -EIO;
2376         } else {
2377             qemu_coroutine_yield();
2378             ret = co.ret;
2379         }
2380     } else {
2381         /*
2382          * Some block drivers always operate in either writethrough or unsafe
2383          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2384          * know how the server works (because the behaviour is hardcoded or
2385          * depends on server-side configuration), so we can't ensure that
2386          * everything is safe on disk. Returning an error doesn't work because
2387          * that would break guests even if the server operates in writethrough
2388          * mode.
2389          *
2390          * Let's hope the user knows what he's doing.
2391          */
2392         ret = 0;
2393     }
2394     if (ret < 0) {
2395         goto out;
2396     }
2397 
2398     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2399      * in the case of cache=unsafe, so there are no useless flushes.
2400      */
2401 flush_parent:
2402     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2403 out:
2404     tracked_request_end(&req);
2405     return ret;
2406 }
2407 
2408 int bdrv_flush(BlockDriverState *bs)
2409 {
2410     Coroutine *co;
2411     RwCo rwco = {
2412         .bs = bs,
2413         .ret = NOT_DONE,
2414     };
2415 
2416     if (qemu_in_coroutine()) {
2417         /* Fast-path if already in coroutine context */
2418         bdrv_flush_co_entry(&rwco);
2419     } else {
2420         AioContext *aio_context = bdrv_get_aio_context(bs);
2421 
2422         co = qemu_coroutine_create(bdrv_flush_co_entry);
2423         qemu_coroutine_enter(co, &rwco);
2424         while (rwco.ret == NOT_DONE) {
2425             aio_poll(aio_context, true);
2426         }
2427     }
2428 
2429     return rwco.ret;
2430 }
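
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * making a just-written range durable.  With cache=unsafe (BDRV_O_NO_FLUSH)
 * the flush only writes back internal caches, as described in bdrv_co_flush()
 * above.  The helper name is hypothetical.
 */
static int example_write_durably(BlockDriverState *bs, int64_t sector_num,
                                 const uint8_t *buf, int nb_sectors)
{
    int ret = bdrv_write(bs, sector_num, buf, nb_sectors);

    if (ret < 0) {
        return ret;
    }
    return bdrv_flush(bs);
}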
2431 
2432 typedef struct DiscardCo {
2433     BlockDriverState *bs;
2434     int64_t sector_num;
2435     int nb_sectors;
2436     int ret;
2437 } DiscardCo;
2438 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2439 {
2440     DiscardCo *rwco = opaque;
2441 
2442     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2443 }
2444 
2445 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2446                                  int nb_sectors)
2447 {
2448     BdrvTrackedRequest req;
2449     int max_discard, ret;
2450 
2451     if (!bs->drv) {
2452         return -ENOMEDIUM;
2453     }
2454 
2455     ret = bdrv_check_request(bs, sector_num, nb_sectors);
2456     if (ret < 0) {
2457         return ret;
2458     } else if (bs->read_only) {
2459         return -EPERM;
2460     }
2461     assert(!(bs->open_flags & BDRV_O_INACTIVE));
2462 
2463     /* Do nothing if disabled.  */
2464     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2465         return 0;
2466     }
2467 
2468     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2469         return 0;
2470     }
2471 
2472     tracked_request_begin(&req, bs, sector_num, nb_sectors,
2473                           BDRV_TRACKED_DISCARD);
2474     bdrv_set_dirty(bs, sector_num, nb_sectors);
2475 
2476     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2477     while (nb_sectors > 0) {
2478         int ret;
2479         int num = nb_sectors;
2480 
2481         /* align request */
2482         if (bs->bl.discard_alignment &&
2483             num >= bs->bl.discard_alignment &&
2484             sector_num % bs->bl.discard_alignment) {
2485             if (num > bs->bl.discard_alignment) {
2486                 num = bs->bl.discard_alignment;
2487             }
2488             num -= sector_num % bs->bl.discard_alignment;
2489         }
2490 
2491         /* limit request size */
2492         if (num > max_discard) {
2493             num = max_discard;
2494         }
2495 
2496         if (bs->drv->bdrv_co_discard) {
2497             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2498         } else {
2499             BlockAIOCB *acb;
2500             CoroutineIOCompletion co = {
2501                 .coroutine = qemu_coroutine_self(),
2502             };
2503 
2504             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
2505                                             bdrv_co_io_em_complete, &co);
2506             if (acb == NULL) {
2507                 ret = -EIO;
2508                 goto out;
2509             } else {
2510                 qemu_coroutine_yield();
2511                 ret = co.ret;
2512             }
2513         }
2514         if (ret && ret != -ENOTSUP) {
2515             goto out;
2516         }
2517 
2518         sector_num += num;
2519         nb_sectors -= num;
2520     }
2521     ret = 0;
2522 out:
2523     tracked_request_end(&req);
2524     return ret;
2525 }
2526 
2527 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2528 {
2529     Coroutine *co;
2530     DiscardCo rwco = {
2531         .bs = bs,
2532         .sector_num = sector_num,
2533         .nb_sectors = nb_sectors,
2534         .ret = NOT_DONE,
2535     };
2536 
2537     if (qemu_in_coroutine()) {
2538         /* Fast-path if already in coroutine context */
2539         bdrv_discard_co_entry(&rwco);
2540     } else {
2541         AioContext *aio_context = bdrv_get_aio_context(bs);
2542 
2543         co = qemu_coroutine_create(bdrv_discard_co_entry);
2544         qemu_coroutine_enter(co, &rwco);
2545         while (rwco.ret == NOT_DONE) {
2546             aio_poll(aio_context, true);
2547         }
2548     }
2549 
2550     return rwco.ret;
2551 }
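
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * discarding only the parts of a range that are actually allocated, by
 * combining bdrv_get_block_status() with bdrv_discard().  The helper name is
 * hypothetical.
 */
static int example_discard_allocated(BlockDriverState *bs, int64_t sector_num,
                                     int nb_sectors)
{
    while (nb_sectors > 0) {
        BlockDriverState *file;
        int pnum;
        int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors,
                                            &pnum, &file);

        if (ret < 0) {
            return ret;
        }
        if (pnum == 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ALLOCATED) {
            int r = bdrv_discard(bs, sector_num, pnum);
            if (r < 0) {
                return r;
            }
        }
        sector_num += pnum;
        nb_sectors -= pnum;
    }
    return 0;
}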
2552 
2553 typedef struct {
2554     CoroutineIOCompletion *co;
2555     QEMUBH *bh;
2556 } BdrvIoctlCompletionData;
2557 
2558 static void bdrv_ioctl_bh_cb(void *opaque)
2559 {
2560     BdrvIoctlCompletionData *data = opaque;
2561 
2562     bdrv_co_io_em_complete(data->co, -ENOTSUP);
2563     qemu_bh_delete(data->bh);
2564 }
2565 
2566 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2567 {
2568     BlockDriver *drv = bs->drv;
2569     BdrvTrackedRequest tracked_req;
2570     CoroutineIOCompletion co = {
2571         .coroutine = qemu_coroutine_self(),
2572     };
2573     BlockAIOCB *acb;
2574 
2575     tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2576     if (!drv || !drv->bdrv_aio_ioctl) {
2577         co.ret = -ENOTSUP;
2578         goto out;
2579     }
2580 
2581     acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2582     if (!acb) {
2583         BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
2584         data->bh = aio_bh_new(bdrv_get_aio_context(bs),
2585                                 bdrv_ioctl_bh_cb, data);
2586         data->co = &co;
2587         qemu_bh_schedule(data->bh);
2588     }
2589     qemu_coroutine_yield();
2590 out:
2591     tracked_request_end(&tracked_req);
2592     return co.ret;
2593 }
2594 
2595 typedef struct {
2596     BlockDriverState *bs;
2597     int req;
2598     void *buf;
2599     int ret;
2600 } BdrvIoctlCoData;
2601 
2602 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2603 {
2604     BdrvIoctlCoData *data = opaque;
2605     data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2606 }
2607 
2608 /* needed for the generic SCSI interface */
2609 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2610 {
2611     BdrvIoctlCoData data = {
2612         .bs = bs,
2613         .req = req,
2614         .buf = buf,
2615         .ret = -EINPROGRESS,
2616     };
2617 
2618     if (qemu_in_coroutine()) {
2619         /* Fast-path if already in coroutine context */
2620         bdrv_co_ioctl_entry(&data);
2621     } else {
2622         Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
2623 
2624         qemu_coroutine_enter(co, &data);
2625         while (data.ret == -EINPROGRESS) {
2626             aio_poll(bdrv_get_aio_context(bs), true);
2627         }
2628     }
2629     return data.ret;
2630 }
2631 
2632 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2633 {
2634     BlockAIOCBCoroutine *acb = opaque;
2635     acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2636                                       acb->req.req, acb->req.buf);
2637     bdrv_co_complete(acb);
2638 }
2639 
2640 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2641         unsigned long int req, void *buf,
2642         BlockCompletionFunc *cb, void *opaque)
2643 {
2644     BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2645                                             bs, cb, opaque);
2646     Coroutine *co;
2647 
2648     acb->need_bh = true;
2649     acb->req.error = -EINPROGRESS;
2650     acb->req.req = req;
2651     acb->req.buf = buf;
2652     co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
2653     qemu_coroutine_enter(co, acb);
2654 
2655     bdrv_co_maybe_schedule_bh(acb);
2656     return &acb->common;
2657 }
2658 
2659 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2660 {
2661     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2662 }
2663 
2664 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2665 {
2666     return memset(qemu_blockalign(bs, size), 0, size);
2667 }
2668 
2669 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2670 {
2671     size_t align = bdrv_opt_mem_align(bs);
2672 
2673     /* Ensure that NULL is never returned on success */
2674     assert(align > 0);
2675     if (size == 0) {
2676         size = align;
2677     }
2678 
2679     return qemu_try_memalign(align, size);
2680 }
2681 
2682 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2683 {
2684     void *mem = qemu_try_blockalign(bs, size);
2685 
2686     if (mem) {
2687         memset(mem, 0, size);
2688     }
2689 
2690     return mem;
2691 }
2692 
2693 /*
2694  * Check if all memory in this vector meets the BDS's minimum memory alignment.
2695  */
2696 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2697 {
2698     int i;
2699     size_t alignment = bdrv_min_mem_align(bs);
2700 
2701     for (i = 0; i < qiov->niov; i++) {
2702         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2703             return false;
2704         }
2705         if (qiov->iov[i].iov_len % alignment) {
2706             return false;
2707         }
2708     }
2709 
2710     return true;
2711 }
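
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * the bounce-buffer pattern a caller can use when its vector does not satisfy
 * the memory alignment checked above.  The helper name is hypothetical.
 */
static void *example_bounce_if_unaligned(BlockDriverState *bs,
                                         QEMUIOVector *qiov)
{
    void *bounce;

    if (bdrv_qiov_is_aligned(bs, qiov)) {
        return NULL;    /* the vector can be used directly */
    }
    bounce = qemu_try_blockalign(bs, qiov->size);
    if (bounce) {
        qemu_iovec_to_buf(qiov, 0, bounce, qiov->size);
    }
    return bounce;      /* free with qemu_vfree() when the I/O completes */
}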
2712 
2713 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2714                                     NotifierWithReturn *notifier)
2715 {
2716     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2717 }
2718 
2719 void bdrv_io_plug(BlockDriverState *bs)
2720 {
2721     BlockDriver *drv = bs->drv;
2722     if (drv && drv->bdrv_io_plug) {
2723         drv->bdrv_io_plug(bs);
2724     } else if (bs->file) {
2725         bdrv_io_plug(bs->file->bs);
2726     }
2727 }
2728 
2729 void bdrv_io_unplug(BlockDriverState *bs)
2730 {
2731     BlockDriver *drv = bs->drv;
2732     if (drv && drv->bdrv_io_unplug) {
2733         drv->bdrv_io_unplug(bs);
2734     } else if (bs->file) {
2735         bdrv_io_unplug(bs->file->bs);
2736     }
2737 }
2738 
2739 void bdrv_flush_io_queue(BlockDriverState *bs)
2740 {
2741     BlockDriver *drv = bs->drv;
2742     if (drv && drv->bdrv_flush_io_queue) {
2743         drv->bdrv_flush_io_queue(bs);
2744     } else if (bs->file) {
2745         bdrv_flush_io_queue(bs->file->bs);
2746     }
2747     bdrv_start_throttled_reqs(bs);
2748 }
2749 
2750 void bdrv_drained_begin(BlockDriverState *bs)
2751 {
2752     if (!bs->quiesce_counter++) {
2753         aio_disable_external(bdrv_get_aio_context(bs));
2754     }
2755     bdrv_drain(bs);
2756 }
2757 
2758 void bdrv_drained_end(BlockDriverState *bs)
2759 {
2760     assert(bs->quiesce_counter > 0);
2761     if (--bs->quiesce_counter > 0) {
2762         return;
2763     }
2764     aio_enable_external(bdrv_get_aio_context(bs));
2765 }
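
/*
 * Illustrative sketch (added for exposition; not part of the original file):
 * the begin/end pairing callers use when they need the device quiesced, i.e.
 * no new external requests and no requests in flight.  The helper name and
 * callback type are hypothetical.
 */
static void example_with_bs_quiesced(BlockDriverState *bs,
                                     void (*fn)(BlockDriverState *bs))
{
    bdrv_drained_begin(bs);
    /* All in-flight requests have completed and external events are disabled;
     * it is now safe to, e.g., reconfigure or detach the device. */
    fn(bs);
    bdrv_drained_end(bs);
}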
2766