xref: /qemu/block/io.c (revision 52ea63de)
1 /*
2  * Block layer I/O functions
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "trace.h"
27 #include "sysemu/block-backend.h"
28 #include "block/blockjob.h"
29 #include "block/block_int.h"
30 #include "qemu/cutils.h"
31 #include "qapi/error.h"
32 #include "qemu/error-report.h"
33 
34 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
35 
36 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
37                                          int64_t sector_num,
38                                          QEMUIOVector *qiov,
39                                          int nb_sectors,
40                                          BdrvRequestFlags flags,
41                                          BlockCompletionFunc *cb,
42                                          void *opaque,
43                                          bool is_write);
44 static void coroutine_fn bdrv_co_do_rw(void *opaque);
45 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
46     int64_t offset, int count, BdrvRequestFlags flags);
47 
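/* Notify all of bs's parents that the node is about to be drained by calling
 * each parent BdrvChild's role->drained_begin() callback, if provided.
 * bdrv_parent_drained_end() below is the matching notification for when
 * draining finishes.
 */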
48 static void bdrv_parent_drained_begin(BlockDriverState *bs)
49 {
50     BdrvChild *c;
51 
52     QLIST_FOREACH(c, &bs->parents, next_parent) {
53         if (c->role->drained_begin) {
54             c->role->drained_begin(c);
55         }
56     }
57 }
58 
59 static void bdrv_parent_drained_end(BlockDriverState *bs)
60 {
61     BdrvChild *c;
62 
63     QLIST_FOREACH(c, &bs->parents, next_parent) {
64         if (c->role->drained_end) {
65             c->role->drained_end(c);
66         }
67     }
68 }
69 
70 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
71 {
72     BlockDriver *drv = bs->drv;
73     Error *local_err = NULL;
74 
75     memset(&bs->bl, 0, sizeof(bs->bl));
76 
77     if (!drv) {
78         return;
79     }
80 
81     /* Take some limits from the children as a default */
82     if (bs->file) {
83         bdrv_refresh_limits(bs->file->bs, &local_err);
84         if (local_err) {
85             error_propagate(errp, local_err);
86             return;
87         }
88         bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
89         bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
90         bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
91         bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
92         bs->bl.max_iov = bs->file->bs->bl.max_iov;
93     } else {
94         bs->bl.min_mem_alignment = 512;
95         bs->bl.opt_mem_alignment = getpagesize();
96 
97         /* Safe default since most protocols use readv()/writev()/etc */
98         bs->bl.max_iov = IOV_MAX;
99     }
100 
101     if (bs->backing) {
102         bdrv_refresh_limits(bs->backing->bs, &local_err);
103         if (local_err) {
104             error_propagate(errp, local_err);
105             return;
106         }
107         bs->bl.opt_transfer_length =
108             MAX(bs->bl.opt_transfer_length,
109                 bs->backing->bs->bl.opt_transfer_length);
110         bs->bl.max_transfer_length =
111             MIN_NON_ZERO(bs->bl.max_transfer_length,
112                          bs->backing->bs->bl.max_transfer_length);
113         bs->bl.opt_mem_alignment =
114             MAX(bs->bl.opt_mem_alignment,
115                 bs->backing->bs->bl.opt_mem_alignment);
116         bs->bl.min_mem_alignment =
117             MAX(bs->bl.min_mem_alignment,
118                 bs->backing->bs->bl.min_mem_alignment);
119         bs->bl.max_iov =
120             MIN(bs->bl.max_iov,
121                 bs->backing->bs->bl.max_iov);
122     }
123 
124     /* Then let the driver override it */
125     if (drv->bdrv_refresh_limits) {
126         drv->bdrv_refresh_limits(bs, errp);
127     }
128 }
129 
130 /**
131  * The copy-on-read flag is actually a reference count so multiple users may
132  * use the feature without worrying about clobbering its previous state.
133  * Copy-on-read stays enabled until all users have called to disable it.
134  */
135 void bdrv_enable_copy_on_read(BlockDriverState *bs)
136 {
137     bs->copy_on_read++;
138 }
139 
140 void bdrv_disable_copy_on_read(BlockDriverState *bs)
141 {
142     assert(bs->copy_on_read > 0);
143     bs->copy_on_read--;
144 }
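/* Usage sketch (hypothetical caller, not part of this file): enable/disable
 * calls must be balanced, e.g.
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... reads submitted now may trigger copy-on-read ...
 *     bdrv_disable_copy_on_read(bs);
 */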
145 
146 /* Check if any requests are in-flight (including throttled requests) */
147 bool bdrv_requests_pending(BlockDriverState *bs)
148 {
149     BdrvChild *child;
150 
151     if (!QLIST_EMPTY(&bs->tracked_requests)) {
152         return true;
153     }
154 
155     QLIST_FOREACH(child, &bs->children, next) {
156         if (bdrv_requests_pending(child->bs)) {
157             return true;
158         }
159     }
160 
161     return false;
162 }
163 
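/* Recursively ask the driver of bs and of all of its children to drain any
 * internally queued requests via the optional .bdrv_drain callback.
 */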
164 static void bdrv_drain_recurse(BlockDriverState *bs)
165 {
166     BdrvChild *child;
167 
168     if (bs->drv && bs->drv->bdrv_drain) {
169         bs->drv->bdrv_drain(bs);
170     }
171     QLIST_FOREACH(child, &bs->children, next) {
172         bdrv_drain_recurse(child->bs);
173     }
174 }
175 
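/* State handed from bdrv_co_yield_to_drain() to the bottom half that performs
 * the drain on the coroutine's behalf.
 */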
176 typedef struct {
177     Coroutine *co;
178     BlockDriverState *bs;
179     QEMUBH *bh;
180     bool done;
181 } BdrvCoDrainData;
182 
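/* Poll the node's AioContext until no request is pending on bs or any of its
 * children (including throttled requests).
 */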
183 static void bdrv_drain_poll(BlockDriverState *bs)
184 {
185     bool busy = true;
186 
187     while (busy) {
188         /* Keep iterating */
189         busy = bdrv_requests_pending(bs);
190         busy |= aio_poll(bdrv_get_aio_context(bs), busy);
191     }
192 }
193 
194 static void bdrv_co_drain_bh_cb(void *opaque)
195 {
196     BdrvCoDrainData *data = opaque;
197     Coroutine *co = data->co;
198 
199     qemu_bh_delete(data->bh);
200     bdrv_drain_poll(data->bs);
201     data->done = true;
202     qemu_coroutine_enter(co, NULL);
203 }
204 
205 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
206 {
207     BdrvCoDrainData data;
208 
209     /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
210      * other coroutines run if they were queued from
211      * qemu_co_queue_run_restart(). */
212 
213     assert(qemu_in_coroutine());
214     data = (BdrvCoDrainData) {
215         .co = qemu_coroutine_self(),
216         .bs = bs,
217         .done = false,
218         .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data),
219     };
220     qemu_bh_schedule(data.bh);
221 
222     qemu_coroutine_yield();
223     /* If we are resumed from some other event (such as an aio completion or a
224      * timer callback), it is a bug in the caller that should be fixed. */
225     assert(data.done);
226 }
227 
228 void bdrv_drained_begin(BlockDriverState *bs)
229 {
230     if (!bs->quiesce_counter++) {
231         aio_disable_external(bdrv_get_aio_context(bs));
232         bdrv_parent_drained_begin(bs);
233     }
234 
235     bdrv_io_unplugged_begin(bs);
236     bdrv_drain_recurse(bs);
237     if (qemu_in_coroutine()) {
238         bdrv_co_yield_to_drain(bs);
239     } else {
240         bdrv_drain_poll(bs);
241     }
242     bdrv_io_unplugged_end(bs);
243 }
244 
245 void bdrv_drained_end(BlockDriverState *bs)
246 {
247     assert(bs->quiesce_counter > 0);
248     if (--bs->quiesce_counter > 0) {
249         return;
250     }
251 
252     bdrv_parent_drained_end(bs);
253     aio_enable_external(bdrv_get_aio_context(bs));
254 }
255 
256 /*
257  * Wait for pending requests to complete on a single BlockDriverState subtree,
258  * and suspend the block driver's internal I/O until the next request arrives.
259  *
260  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
261  * AioContext.
262  *
263  * Only this BlockDriverState's AioContext is run, so in-flight requests must
264  * not depend on events in other AioContexts.  In that case, use
265  * bdrv_drain_all() instead.
266  */
267 void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
268 {
269     assert(qemu_in_coroutine());
270     bdrv_drained_begin(bs);
271     bdrv_drained_end(bs);
272 }
273 
274 void bdrv_drain(BlockDriverState *bs)
275 {
276     bdrv_drained_begin(bs);
277     bdrv_drained_end(bs);
278 }
279 
280 /*
281  * Wait for pending requests to complete across all BlockDriverStates
282  *
283  * This function does not flush data to disk; use bdrv_flush_all() for that
284  * after calling this function.
285  */
286 void bdrv_drain_all(void)
287 {
288     /* Always run first iteration so any pending completion BHs run */
289     bool busy = true;
290     BlockDriverState *bs;
291     BdrvNextIterator it;
292     GSList *aio_ctxs = NULL, *ctx;
293 
294     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
295         AioContext *aio_context = bdrv_get_aio_context(bs);
296 
297         aio_context_acquire(aio_context);
298         if (bs->job) {
299             block_job_pause(bs->job);
300         }
301         bdrv_parent_drained_begin(bs);
302         bdrv_io_unplugged_begin(bs);
303         bdrv_drain_recurse(bs);
304         aio_context_release(aio_context);
305 
306         if (!g_slist_find(aio_ctxs, aio_context)) {
307             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
308         }
309     }
310 
311     /* Note that completion of an asynchronous I/O operation can trigger any
312      * number of other I/O operations on other devices---for example a
313      * coroutine can submit an I/O request to another device in response to
314      * request completion.  Therefore we must keep looping until there was no
315      * more activity rather than simply draining each device independently.
316      */
317     while (busy) {
318         busy = false;
319 
320         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
321             AioContext *aio_context = ctx->data;
322 
323             aio_context_acquire(aio_context);
324             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
325                 if (aio_context == bdrv_get_aio_context(bs)) {
326                     if (bdrv_requests_pending(bs)) {
327                         busy = true;
328                         aio_poll(aio_context, busy);
329                     }
330                 }
331             }
332             busy |= aio_poll(aio_context, false);
333             aio_context_release(aio_context);
334         }
335     }
336 
337     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
338         AioContext *aio_context = bdrv_get_aio_context(bs);
339 
340         aio_context_acquire(aio_context);
341         bdrv_io_unplugged_end(bs);
342         bdrv_parent_drained_end(bs);
343         if (bs->job) {
344             block_job_resume(bs->job);
345         }
346         aio_context_release(aio_context);
347     }
348     g_slist_free(aio_ctxs);
349 }
350 
351 /**
352  * Remove an active request from the tracked requests list
353  *
354  * This function should be called when a tracked request is completing.
355  */
356 static void tracked_request_end(BdrvTrackedRequest *req)
357 {
358     if (req->serialising) {
359         req->bs->serialising_in_flight--;
360     }
361 
362     QLIST_REMOVE(req, list);
363     qemu_co_queue_restart_all(&req->wait_queue);
364 }
365 
366 /**
367  * Add an active request to the tracked requests list
368  */
369 static void tracked_request_begin(BdrvTrackedRequest *req,
370                                   BlockDriverState *bs,
371                                   int64_t offset,
372                                   unsigned int bytes,
373                                   enum BdrvTrackedRequestType type)
374 {
375     *req = (BdrvTrackedRequest){
376         .bs = bs,
377         .offset         = offset,
378         .bytes          = bytes,
379         .type           = type,
380         .co             = qemu_coroutine_self(),
381         .serialising    = false,
382         .overlap_offset = offset,
383         .overlap_bytes  = bytes,
384     };
385 
386     qemu_co_queue_init(&req->wait_queue);
387 
388     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
389 }
390 
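/* Mark req as serialising and widen its overlap range to the given alignment
 * so that overlap checks cover whole aligned blocks.
 */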
391 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
392 {
393     int64_t overlap_offset = req->offset & ~(align - 1);
394     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
395                                - overlap_offset;
396 
397     if (!req->serialising) {
398         req->bs->serialising_in_flight++;
399         req->serialising = true;
400     }
401 
402     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
403     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
404 }
405 
406 /**
407  * Round a region to cluster boundaries
408  */
409 void bdrv_round_to_clusters(BlockDriverState *bs,
410                             int64_t sector_num, int nb_sectors,
411                             int64_t *cluster_sector_num,
412                             int *cluster_nb_sectors)
413 {
414     BlockDriverInfo bdi;
415 
416     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
417         *cluster_sector_num = sector_num;
418         *cluster_nb_sectors = nb_sectors;
419     } else {
420         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
421         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
422         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
423                                             nb_sectors, c);
424     }
425 }
426 
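/* Return the cluster size reported by the driver, falling back to the request
 * alignment when the driver does not report one.
 */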
427 static int bdrv_get_cluster_size(BlockDriverState *bs)
428 {
429     BlockDriverInfo bdi;
430     int ret;
431 
432     ret = bdrv_get_info(bs, &bdi);
433     if (ret < 0 || bdi.cluster_size == 0) {
434         return bs->request_alignment;
435     } else {
436         return bdi.cluster_size;
437     }
438 }
439 
440 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
441                                      int64_t offset, unsigned int bytes)
442 {
443     /*        aaaa   bbbb */
444     if (offset >= req->overlap_offset + req->overlap_bytes) {
445         return false;
446     }
447     /* bbbb   aaaa        */
448     if (req->overlap_offset >= offset + bytes) {
449         return false;
450     }
451     return true;
452 }
453 
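/* Wait until no serialising request overlapping self remains in flight.
 * Returns true if this coroutine had to wait at least once.
 */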
454 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
455 {
456     BlockDriverState *bs = self->bs;
457     BdrvTrackedRequest *req;
458     bool retry;
459     bool waited = false;
460 
461     if (!bs->serialising_in_flight) {
462         return false;
463     }
464 
465     do {
466         retry = false;
467         QLIST_FOREACH(req, &bs->tracked_requests, list) {
468             if (req == self || (!req->serialising && !self->serialising)) {
469                 continue;
470             }
471             if (tracked_request_overlaps(req, self->overlap_offset,
472                                          self->overlap_bytes))
473             {
474                 /* Hitting this means there was a reentrant request, for
475                  * example, a block driver issuing nested requests.  This must
476                  * never happen since it means deadlock.
477                  */
478                 assert(qemu_coroutine_self() != req->co);
479 
480                 /* If the request is already (indirectly) waiting for us, or
481                  * will wait for us as soon as it wakes up, then just go on
482                  * (instead of producing a deadlock in the former case). */
483                 if (!req->waiting_for) {
484                     self->waiting_for = req;
485                     qemu_co_queue_wait(&req->wait_queue);
486                     self->waiting_for = NULL;
487                     retry = true;
488                     waited = true;
489                     break;
490                 }
491             }
492         }
493     } while (retry);
494 
495     return waited;
496 }
497 
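/* Basic sanity checks for a byte-granularity request: size limit, medium
 * present, non-negative offset.  Returns 0 or a negative errno.
 */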
498 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
499                                    size_t size)
500 {
501     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
502         return -EIO;
503     }
504 
505     if (!bdrv_is_inserted(bs)) {
506         return -ENOMEDIUM;
507     }
508 
509     if (offset < 0) {
510         return -EIO;
511     }
512 
513     return 0;
514 }
515 
516 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
517                               int nb_sectors)
518 {
519     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
520         return -EIO;
521     }
522 
523     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
524                                    nb_sectors * BDRV_SECTOR_SIZE);
525 }
526 
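/* Parameters for the synchronous read/write wrappers that run
 * bdrv_rw_co_entry() in a coroutine and poll for completion.
 */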
527 typedef struct RwCo {
528     BlockDriverState *bs;
529     int64_t offset;
530     QEMUIOVector *qiov;
531     bool is_write;
532     int ret;
533     BdrvRequestFlags flags;
534 } RwCo;
535 
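/* Coroutine entry point used by bdrv_prwv_co() below */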
536 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
537 {
538     RwCo *rwco = opaque;
539 
540     if (!rwco->is_write) {
541         rwco->ret = bdrv_co_preadv(rwco->bs, rwco->offset,
542                                    rwco->qiov->size, rwco->qiov,
543                                    rwco->flags);
544     } else {
545         rwco->ret = bdrv_co_pwritev(rwco->bs, rwco->offset,
546                                     rwco->qiov->size, rwco->qiov,
547                                     rwco->flags);
548     }
549 }
550 
551 /*
552  * Process a vectored synchronous request using coroutines
553  */
554 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
555                         QEMUIOVector *qiov, bool is_write,
556                         BdrvRequestFlags flags)
557 {
558     Coroutine *co;
559     RwCo rwco = {
560         .bs = bs,
561         .offset = offset,
562         .qiov = qiov,
563         .is_write = is_write,
564         .ret = NOT_DONE,
565         .flags = flags,
566     };
567 
568     if (qemu_in_coroutine()) {
569         /* Fast-path if already in coroutine context */
570         bdrv_rw_co_entry(&rwco);
571     } else {
572         AioContext *aio_context = bdrv_get_aio_context(bs);
573 
574         co = qemu_coroutine_create(bdrv_rw_co_entry);
575         qemu_coroutine_enter(co, &rwco);
576         while (rwco.ret == NOT_DONE) {
577             aio_poll(aio_context, true);
578         }
579     }
580     return rwco.ret;
581 }
582 
583 /*
584  * Process a synchronous request using coroutines
585  */
586 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
587                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
588 {
589     QEMUIOVector qiov;
590     struct iovec iov = {
591         .iov_base = (void *)buf,
592         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
593     };
594 
595     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
596         return -EINVAL;
597     }
598 
599     qemu_iovec_init_external(&qiov, &iov, 1);
600     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
601                         &qiov, is_write, flags);
602 }
603 
604 /* return < 0 if error. See bdrv_write() for the return codes */
605 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
606               uint8_t *buf, int nb_sectors)
607 {
608     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
609 }
610 
611 /* Return < 0 if error. Important errors are:
612   -EIO         generic I/O error (may happen for all errors)
613   -ENOMEDIUM   No media inserted.
614   -EINVAL      Invalid sector number or nb_sectors
615   -EACCES      Trying to write a read-only device
616 */
617 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
618                const uint8_t *buf, int nb_sectors)
619 {
620     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
621 }
622 
623 int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
624                        int count, BdrvRequestFlags flags)
625 {
626     QEMUIOVector qiov;
627     struct iovec iov = {
628         .iov_base = NULL,
629         .iov_len = count,
630     };
631 
632     qemu_iovec_init_external(&qiov, &iov, 1);
633     return bdrv_prwv_co(bs, offset, &qiov, true,
634                         BDRV_REQ_ZERO_WRITE | flags);
635 }
636 
637 /*
638  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
639  * The operation is sped up by checking the block status and only writing
640  * zeroes to the device if they currently do not return zeroes. Optional
641  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
642  * BDRV_REQ_FUA).
643  *
644  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
645  */
646 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
647 {
648     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
649     BlockDriverState *file;
650     int n;
651 
652     target_sectors = bdrv_nb_sectors(bs);
653     if (target_sectors < 0) {
654         return target_sectors;
655     }
656 
657     for (;;) {
658         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
659         if (nb_sectors <= 0) {
660             return 0;
661         }
662         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
663         if (ret < 0) {
664             error_report("error getting block status at sector %" PRId64 ": %s",
665                          sector_num, strerror(-ret));
666             return ret;
667         }
668         if (ret & BDRV_BLOCK_ZERO) {
669             sector_num += n;
670             continue;
671         }
672         ret = bdrv_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS,
673                                  n << BDRV_SECTOR_BITS, flags);
674         if (ret < 0) {
675             error_report("error writing zeroes at sector %" PRId64 ": %s",
676                          sector_num, strerror(-ret));
677             return ret;
678         }
679         sector_num += n;
680     }
681 }
682 
683 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
684 {
685     QEMUIOVector qiov;
686     struct iovec iov = {
687         .iov_base = (void *)buf,
688         .iov_len = bytes,
689     };
690     int ret;
691 
692     if (bytes < 0) {
693         return -EINVAL;
694     }
695 
696     qemu_iovec_init_external(&qiov, &iov, 1);
697     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
698     if (ret < 0) {
699         return ret;
700     }
701 
702     return bytes;
703 }
704 
705 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
706 {
707     int ret;
708 
709     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
710     if (ret < 0) {
711         return ret;
712     }
713 
714     return qiov->size;
715 }
716 
717 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
718                 const void *buf, int bytes)
719 {
720     QEMUIOVector qiov;
721     struct iovec iov = {
722         .iov_base   = (void *) buf,
723         .iov_len    = bytes,
724     };
725 
726     if (bytes < 0) {
727         return -EINVAL;
728     }
729 
730     qemu_iovec_init_external(&qiov, &iov, 1);
731     return bdrv_pwritev(bs, offset, &qiov);
732 }
733 
734 /*
735  * Writes to the file and ensures that no writes are reordered across this
736  * request (acts as a barrier)
737  *
738  * Returns 0 on success, -errno in error cases.
739  */
740 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
741     const void *buf, int count)
742 {
743     int ret;
744 
745     ret = bdrv_pwrite(bs, offset, buf, count);
746     if (ret < 0) {
747         return ret;
748     }
749 
750     ret = bdrv_flush(bs);
751     if (ret < 0) {
752         return ret;
753     }
754 
755     return 0;
756 }
757 
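/* Completion state used to emulate coroutine I/O on top of the AIO interface:
 * the callback stores the return value and re-enters the waiting coroutine.
 */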
758 typedef struct CoroutineIOCompletion {
759     Coroutine *coroutine;
760     int ret;
761 } CoroutineIOCompletion;
762 
763 static void bdrv_co_io_em_complete(void *opaque, int ret)
764 {
765     CoroutineIOCompletion *co = opaque;
766 
767     co->ret = ret;
768     qemu_coroutine_enter(co->coroutine, NULL);
769 }
770 
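/* Forward a read to the block driver, using the most modern interface it
 * provides: .bdrv_co_preadv, then .bdrv_co_readv, then emulation on top of
 * .bdrv_aio_readv.
 */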
771 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
772                                            uint64_t offset, uint64_t bytes,
773                                            QEMUIOVector *qiov, int flags)
774 {
775     BlockDriver *drv = bs->drv;
776     int64_t sector_num;
777     unsigned int nb_sectors;
778 
779     if (drv->bdrv_co_preadv) {
780         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
781     }
782 
783     sector_num = offset >> BDRV_SECTOR_BITS;
784     nb_sectors = bytes >> BDRV_SECTOR_BITS;
785 
786     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
787     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
788     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
789 
790     if (drv->bdrv_co_readv) {
791         return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
792     } else {
793         BlockAIOCB *acb;
794         CoroutineIOCompletion co = {
795             .coroutine = qemu_coroutine_self(),
796         };
797 
798         acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
799                                       bdrv_co_io_em_complete, &co);
800         if (acb == NULL) {
801             return -EIO;
802         } else {
803             qemu_coroutine_yield();
804             return co.ret;
805         }
806     }
807 }
808 
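/* Forward a write to the block driver, preferring .bdrv_co_pwritev, then
 * .bdrv_co_writev_flags/.bdrv_co_writev, then emulation on top of
 * .bdrv_aio_writev.  BDRV_REQ_FUA is emulated with a flush when the driver
 * does not support it natively.
 */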
809 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
810                                             uint64_t offset, uint64_t bytes,
811                                             QEMUIOVector *qiov, int flags)
812 {
813     BlockDriver *drv = bs->drv;
814     int64_t sector_num;
815     unsigned int nb_sectors;
816     int ret;
817 
818     if (drv->bdrv_co_pwritev) {
819         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
820                                    flags & bs->supported_write_flags);
821         flags &= ~bs->supported_write_flags;
822         goto emulate_flags;
823     }
824 
825     sector_num = offset >> BDRV_SECTOR_BITS;
826     nb_sectors = bytes >> BDRV_SECTOR_BITS;
827 
828     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
829     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
830     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
831 
832     if (drv->bdrv_co_writev_flags) {
833         ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
834                                         flags & bs->supported_write_flags);
835         flags &= ~bs->supported_write_flags;
836     } else if (drv->bdrv_co_writev) {
837         assert(!bs->supported_write_flags);
838         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
839     } else {
840         BlockAIOCB *acb;
841         CoroutineIOCompletion co = {
842             .coroutine = qemu_coroutine_self(),
843         };
844 
845         acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
846                                        bdrv_co_io_em_complete, &co);
847         if (acb == NULL) {
848             ret = -EIO;
849         } else {
850             qemu_coroutine_yield();
851             ret = co.ret;
852         }
853     }
854 
855 emulate_flags:
856     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
857         ret = bdrv_co_flush(bs);
858     }
859 
860     return ret;
861 }
862 
863 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
864         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
865 {
866     /* Perform I/O through a temporary buffer so that users who scribble over
867      * their read buffer while the operation is in progress do not end up
868      * modifying the image file.  This is critical for zero-copy guest I/O
869      * where anything might happen inside guest memory.
870      */
871     void *bounce_buffer;
872 
873     BlockDriver *drv = bs->drv;
874     struct iovec iov;
875     QEMUIOVector bounce_qiov;
876     int64_t cluster_sector_num;
877     int cluster_nb_sectors;
878     size_t skip_bytes;
879     int ret;
880 
881     /* Cover the entire cluster so that no additional backing file I/O is
882      * required when allocating the cluster in the image file.
883      */
884     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
885                            &cluster_sector_num, &cluster_nb_sectors);
886 
887     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
888                                    cluster_sector_num, cluster_nb_sectors);
889 
890     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
891     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
892     if (bounce_buffer == NULL) {
893         ret = -ENOMEM;
894         goto err;
895     }
896 
897     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
898 
899     ret = bdrv_driver_preadv(bs, cluster_sector_num * BDRV_SECTOR_SIZE,
900                              cluster_nb_sectors * BDRV_SECTOR_SIZE,
901                              &bounce_qiov, 0);
902     if (ret < 0) {
903         goto err;
904     }
905 
906     if (drv->bdrv_co_pwrite_zeroes &&
907         buffer_is_zero(bounce_buffer, iov.iov_len)) {
908         ret = bdrv_co_do_pwrite_zeroes(bs,
909                                        cluster_sector_num * BDRV_SECTOR_SIZE,
910                                        cluster_nb_sectors * BDRV_SECTOR_SIZE,
911                                        0);
912     } else {
913         /* This does not change the data on the disk, so it is not necessary
914          * to flush even in cache=writethrough mode.
915          */
916         ret = bdrv_driver_pwritev(bs, cluster_sector_num * BDRV_SECTOR_SIZE,
917                                   cluster_nb_sectors * BDRV_SECTOR_SIZE,
918                                   &bounce_qiov, 0);
919     }
920 
921     if (ret < 0) {
922         /* It might be okay to ignore write errors for guest requests.  If this
923          * is a deliberate copy-on-read then we don't want to ignore the error.
924          * Simply report it in all cases.
925          */
926         goto err;
927     }
928 
929     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
930     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
931                         nb_sectors * BDRV_SECTOR_SIZE);
932 
933 err:
934     qemu_vfree(bounce_buffer);
935     return ret;
936 }
937 
938 /*
939  * Forwards an already correctly aligned request to the BlockDriver. This
940  * handles copy on read and zeroing after EOF; any other features must be
941  * implemented by the caller.
942  */
943 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
944     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
945     int64_t align, QEMUIOVector *qiov, int flags)
946 {
947     int ret;
948 
949     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
950     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
951 
952     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
953     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
954     assert(!qiov || bytes == qiov->size);
955     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
956 
957     /* Handle Copy on Read and associated serialisation */
958     if (flags & BDRV_REQ_COPY_ON_READ) {
959         /* If we touch the same cluster it counts as an overlap.  This
960          * guarantees that allocating writes will be serialized and not race
961          * with each other for the same cluster.  For example, in copy-on-read
962          * it ensures that the CoR read and write operations are atomic and
963          * guest writes cannot interleave between them. */
964         mark_request_serialising(req, bdrv_get_cluster_size(bs));
965     }
966 
967     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
968         wait_serialising_requests(req);
969     }
970 
971     if (flags & BDRV_REQ_COPY_ON_READ) {
972         int pnum;
973 
974         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
975         if (ret < 0) {
976             goto out;
977         }
978 
979         if (!ret || pnum != nb_sectors) {
980             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
981             goto out;
982         }
983     }
984 
985     /* Forward the request to the BlockDriver */
986     if (!bs->zero_beyond_eof) {
987         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
988     } else {
989         /* Read zeros after EOF */
990         int64_t total_sectors, max_nb_sectors;
991 
992         total_sectors = bdrv_nb_sectors(bs);
993         if (total_sectors < 0) {
994             ret = total_sectors;
995             goto out;
996         }
997 
998         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
999                                   align >> BDRV_SECTOR_BITS);
1000         if (nb_sectors < max_nb_sectors) {
1001             ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1002         } else if (max_nb_sectors > 0) {
1003             QEMUIOVector local_qiov;
1004 
1005             qemu_iovec_init(&local_qiov, qiov->niov);
1006             qemu_iovec_concat(&local_qiov, qiov, 0,
1007                               max_nb_sectors * BDRV_SECTOR_SIZE);
1008 
1009             ret = bdrv_driver_preadv(bs, offset,
1010                                      max_nb_sectors * BDRV_SECTOR_SIZE,
1011                                      &local_qiov, 0);
1012 
1013             qemu_iovec_destroy(&local_qiov);
1014         } else {
1015             ret = 0;
1016         }
1017 
1018         /* Reading beyond end of file is supposed to produce zeroes */
1019         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
1020             uint64_t offset = MAX(0, total_sectors - sector_num);
1021             uint64_t bytes = (sector_num + nb_sectors - offset) *
1022                               BDRV_SECTOR_SIZE;
1023             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
1024         }
1025     }
1026 
1027 out:
1028     return ret;
1029 }
1030 
1031 /*
1032  * Handle a read request in coroutine context
1033  */
1034 int coroutine_fn bdrv_co_preadv(BlockDriverState *bs,
1035     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1036     BdrvRequestFlags flags)
1037 {
1038     BlockDriver *drv = bs->drv;
1039     BdrvTrackedRequest req;
1040 
1041     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1042     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1043     uint8_t *head_buf = NULL;
1044     uint8_t *tail_buf = NULL;
1045     QEMUIOVector local_qiov;
1046     bool use_local_qiov = false;
1047     int ret;
1048 
1049     if (!drv) {
1050         return -ENOMEDIUM;
1051     }
1052 
1053     ret = bdrv_check_byte_request(bs, offset, bytes);
1054     if (ret < 0) {
1055         return ret;
1056     }
1057 
1058     /* Don't do copy-on-read if we read data before a write operation */
1059     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
1060         flags |= BDRV_REQ_COPY_ON_READ;
1061     }
1062 
1063     /* Align read if necessary by padding qiov */
1064     if (offset & (align - 1)) {
1065         head_buf = qemu_blockalign(bs, align);
1066         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1067         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1068         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1069         use_local_qiov = true;
1070 
1071         bytes += offset & (align - 1);
1072         offset = offset & ~(align - 1);
1073     }
1074 
1075     if ((offset + bytes) & (align - 1)) {
1076         if (!use_local_qiov) {
1077             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1078             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1079             use_local_qiov = true;
1080         }
1081         tail_buf = qemu_blockalign(bs, align);
1082         qemu_iovec_add(&local_qiov, tail_buf,
1083                        align - ((offset + bytes) & (align - 1)));
1084 
1085         bytes = ROUND_UP(bytes, align);
1086     }
1087 
1088     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1089     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1090                               use_local_qiov ? &local_qiov : qiov,
1091                               flags);
1092     tracked_request_end(&req);
1093 
1094     if (use_local_qiov) {
1095         qemu_iovec_destroy(&local_qiov);
1096         qemu_vfree(head_buf);
1097         qemu_vfree(tail_buf);
1098     }
1099 
1100     return ret;
1101 }
1102 
1103 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1104     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1105     BdrvRequestFlags flags)
1106 {
1107     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1108         return -EINVAL;
1109     }
1110 
1111     return bdrv_co_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1112                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1113 }
1114 
1115 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1116     int nb_sectors, QEMUIOVector *qiov)
1117 {
1118     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1119 
1120     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1121 }
1122 
1123 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1124 
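/* Write zeroes to [offset, offset + count), splitting the request so that the
 * bulk is aligned to the driver's zero-write alignment.  Falls back to writing
 * an explicitly zeroed bounce buffer when the driver has no efficient
 * zero-write operation, and emulates BDRV_REQ_FUA with a final flush where
 * necessary.
 */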
1125 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1126     int64_t offset, int count, BdrvRequestFlags flags)
1127 {
1128     BlockDriver *drv = bs->drv;
1129     QEMUIOVector qiov;
1130     struct iovec iov = {0};
1131     int ret = 0;
1132     bool need_flush = false;
1133     int head = 0;
1134     int tail = 0;
1135 
1136     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1137     int alignment = MAX(bs->bl.pwrite_zeroes_alignment ?: 1,
1138                         bs->request_alignment);
1139 
1140     assert(is_power_of_2(alignment));
1141     head = offset & (alignment - 1);
1142     tail = (offset + count) & (alignment - 1);
1143     max_write_zeroes &= ~(alignment - 1);
1144 
1145     while (count > 0 && !ret) {
1146         int num = count;
1147 
1148         /* Align the request.  Block drivers can expect that the "bulk" of the
1149          * request is aligned and that unaligned requests do not cross cluster
1150          * boundaries.
1151          */
1152         if (head) {
1153             /* Make a small request up to the first aligned sector.  */
1154             num = MIN(count, alignment - head);
1155             head = 0;
1156         } else if (tail && num > alignment) {
1157             /* Shorten the request to the last aligned sector.  */
1158             num -= tail;
1159         }
1160 
1161         /* limit request size */
1162         if (num > max_write_zeroes) {
1163             num = max_write_zeroes;
1164         }
1165 
1166         ret = -ENOTSUP;
1167         /* First try the efficient write zeroes operation */
1168         if (drv->bdrv_co_pwrite_zeroes) {
1169             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1170                                              flags & bs->supported_zero_flags);
1171             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1172                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1173                 need_flush = true;
1174             }
1175         } else {
1176             assert(!bs->supported_zero_flags);
1177         }
1178 
1179         if (ret == -ENOTSUP) {
1180             /* Fall back to bounce buffer if write zeroes is unsupported */
1181             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1182                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1183             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1184 
1185             if ((flags & BDRV_REQ_FUA) &&
1186                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1187                 /* No need for bdrv_driver_pwritev() to do a fallback
1188                  * flush on each chunk; use just one at the end */
1189                 write_flags &= ~BDRV_REQ_FUA;
1190                 need_flush = true;
1191             }
1192             num = MIN(num, max_xfer_len << BDRV_SECTOR_BITS);
1193             iov.iov_len = num;
1194             if (iov.iov_base == NULL) {
1195                 iov.iov_base = qemu_try_blockalign(bs, num);
1196                 if (iov.iov_base == NULL) {
1197                     ret = -ENOMEM;
1198                     goto fail;
1199                 }
1200                 memset(iov.iov_base, 0, num);
1201             }
1202             qemu_iovec_init_external(&qiov, &iov, 1);
1203 
1204             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1205 
1206             /* Keep bounce buffer around if it is big enough for all
1207              * future requests.
1208              */
1209             if (num < max_xfer_len << BDRV_SECTOR_BITS) {
1210                 qemu_vfree(iov.iov_base);
1211                 iov.iov_base = NULL;
1212             }
1213         }
1214 
1215         offset += num;
1216         count -= num;
1217     }
1218 
1219 fail:
1220     if (ret == 0 && need_flush) {
1221         ret = bdrv_co_flush(bs);
1222     }
1223     qemu_vfree(iov.iov_base);
1224     return ret;
1225 }
1226 
1227 /*
1228  * Forwards an already correctly aligned write request to the BlockDriver.
1229  */
1230 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1231     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1232     QEMUIOVector *qiov, int flags)
1233 {
1234     BlockDriver *drv = bs->drv;
1235     bool waited;
1236     int ret;
1237 
1238     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1239     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1240 
1241     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1242     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1243     assert(!qiov || bytes == qiov->size);
1244     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1245 
1246     waited = wait_serialising_requests(req);
1247     assert(!waited || !req->serialising);
1248     assert(req->overlap_offset <= offset);
1249     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1250 
1251     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1252 
1253     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1254         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1255         qemu_iovec_is_zero(qiov)) {
1256         flags |= BDRV_REQ_ZERO_WRITE;
1257         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1258             flags |= BDRV_REQ_MAY_UNMAP;
1259         }
1260     }
1261 
1262     if (ret < 0) {
1263         /* Do nothing, write notifier decided to fail this request */
1264     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1265         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1266         ret = bdrv_co_do_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS,
1267                                        nb_sectors << BDRV_SECTOR_BITS, flags);
1268     } else {
1269         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1270         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1271     }
1272     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1273 
1274     bdrv_set_dirty(bs, sector_num, nb_sectors);
1275 
1276     if (bs->wr_highest_offset < offset + bytes) {
1277         bs->wr_highest_offset = offset + bytes;
1278     }
1279 
1280     if (ret >= 0) {
1281         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1282     }
1283 
1284     return ret;
1285 }
1286 
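/* Zero-write helper used by bdrv_co_pwritev() when no qiov is given: performs
 * a read-modify-write of the unaligned head and tail and a plain zero write
 * for the aligned middle part.
 */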
1287 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1288                                                 int64_t offset,
1289                                                 unsigned int bytes,
1290                                                 BdrvRequestFlags flags,
1291                                                 BdrvTrackedRequest *req)
1292 {
1293     uint8_t *buf = NULL;
1294     QEMUIOVector local_qiov;
1295     struct iovec iov;
1296     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1297     unsigned int head_padding_bytes, tail_padding_bytes;
1298     int ret = 0;
1299 
1300     head_padding_bytes = offset & (align - 1);
1301     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1302 
1303 
1304     assert(flags & BDRV_REQ_ZERO_WRITE);
1305     if (head_padding_bytes || tail_padding_bytes) {
1306         buf = qemu_blockalign(bs, align);
1307         iov = (struct iovec) {
1308             .iov_base   = buf,
1309             .iov_len    = align,
1310         };
1311         qemu_iovec_init_external(&local_qiov, &iov, 1);
1312     }
1313     if (head_padding_bytes) {
1314         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1315 
1316         /* RMW the unaligned part before head. */
1317         mark_request_serialising(req, align);
1318         wait_serialising_requests(req);
1319         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1320         ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1321                                   align, &local_qiov, 0);
1322         if (ret < 0) {
1323             goto fail;
1324         }
1325         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1326 
1327         memset(buf + head_padding_bytes, 0, zero_bytes);
1328         ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1329                                    &local_qiov,
1330                                    flags & ~BDRV_REQ_ZERO_WRITE);
1331         if (ret < 0) {
1332             goto fail;
1333         }
1334         offset += zero_bytes;
1335         bytes -= zero_bytes;
1336     }
1337 
1338     assert(!bytes || (offset & (align - 1)) == 0);
1339     if (bytes >= align) {
1340         /* Write the aligned part in the middle. */
1341         uint64_t aligned_bytes = bytes & ~(align - 1);
1342         ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1343                                    NULL, flags);
1344         if (ret < 0) {
1345             goto fail;
1346         }
1347         bytes -= aligned_bytes;
1348         offset += aligned_bytes;
1349     }
1350 
1351     assert(!bytes || (offset & (align - 1)) == 0);
1352     if (bytes) {
1353         assert(align == tail_padding_bytes + bytes);
1354         /* RMW the unaligned part after tail. */
1355         mark_request_serialising(req, align);
1356         wait_serialising_requests(req);
1357         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1358         ret = bdrv_aligned_preadv(bs, req, offset, align,
1359                                   align, &local_qiov, 0);
1360         if (ret < 0) {
1361             goto fail;
1362         }
1363         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1364 
1365         memset(buf, 0, bytes);
1366         ret = bdrv_aligned_pwritev(bs, req, offset, align,
1367                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1368     }
1369 fail:
1370     qemu_vfree(buf);
1371     return ret;
1372 
1373 }
1374 
1375 /*
1376  * Handle a write request in coroutine context
1377  */
1378 int coroutine_fn bdrv_co_pwritev(BlockDriverState *bs,
1379     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1380     BdrvRequestFlags flags)
1381 {
1382     BdrvTrackedRequest req;
1383     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1384     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1385     uint8_t *head_buf = NULL;
1386     uint8_t *tail_buf = NULL;
1387     QEMUIOVector local_qiov;
1388     bool use_local_qiov = false;
1389     int ret;
1390 
1391     if (!bs->drv) {
1392         return -ENOMEDIUM;
1393     }
1394     if (bs->read_only) {
1395         return -EPERM;
1396     }
1397     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1398 
1399     ret = bdrv_check_byte_request(bs, offset, bytes);
1400     if (ret < 0) {
1401         return ret;
1402     }
1403 
1404     /*
1405      * Align write if necessary by performing a read-modify-write cycle.
1406      * Pad qiov with the read parts and be sure to have a tracked request not
1407      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1408      */
1409     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1410 
1411     if (!qiov) {
1412         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1413         goto out;
1414     }
1415 
1416     if (offset & (align - 1)) {
1417         QEMUIOVector head_qiov;
1418         struct iovec head_iov;
1419 
1420         mark_request_serialising(&req, align);
1421         wait_serialising_requests(&req);
1422 
1423         head_buf = qemu_blockalign(bs, align);
1424         head_iov = (struct iovec) {
1425             .iov_base   = head_buf,
1426             .iov_len    = align,
1427         };
1428         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1429 
1430         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1431         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1432                                   align, &head_qiov, 0);
1433         if (ret < 0) {
1434             goto fail;
1435         }
1436         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1437 
1438         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1439         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1440         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1441         use_local_qiov = true;
1442 
1443         bytes += offset & (align - 1);
1444         offset = offset & ~(align - 1);
1445 
1446         /* We have read the tail already if the request is smaller
1447          * than one aligned block.
1448          */
1449         if (bytes < align) {
1450             qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1451             bytes = align;
1452         }
1453     }
1454 
1455     if ((offset + bytes) & (align - 1)) {
1456         QEMUIOVector tail_qiov;
1457         struct iovec tail_iov;
1458         size_t tail_bytes;
1459         bool waited;
1460 
1461         mark_request_serialising(&req, align);
1462         waited = wait_serialising_requests(&req);
1463         assert(!waited || !use_local_qiov);
1464 
1465         tail_buf = qemu_blockalign(bs, align);
1466         tail_iov = (struct iovec) {
1467             .iov_base   = tail_buf,
1468             .iov_len    = align,
1469         };
1470         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1471 
1472         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1473         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1474                                   align, &tail_qiov, 0);
1475         if (ret < 0) {
1476             goto fail;
1477         }
1478         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1479 
1480         if (!use_local_qiov) {
1481             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1482             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1483             use_local_qiov = true;
1484         }
1485 
1486         tail_bytes = (offset + bytes) & (align - 1);
1487         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1488 
1489         bytes = ROUND_UP(bytes, align);
1490     }
1491 
1492     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1493                                use_local_qiov ? &local_qiov : qiov,
1494                                flags);
1495 
1496 fail:
1497 
1498     if (use_local_qiov) {
1499         qemu_iovec_destroy(&local_qiov);
1500     }
1501     qemu_vfree(head_buf);
1502     qemu_vfree(tail_buf);
1503 out:
1504     tracked_request_end(&req);
1505     return ret;
1506 }
1507 
1508 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1509     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1510     BdrvRequestFlags flags)
1511 {
1512     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1513         return -EINVAL;
1514     }
1515 
1516     return bdrv_co_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1517                            nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1518 }
1519 
1520 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1521     int nb_sectors, QEMUIOVector *qiov)
1522 {
1523     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1524 
1525     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1526 }
1527 
1528 int coroutine_fn bdrv_co_pwrite_zeroes(BlockDriverState *bs,
1529                                        int64_t offset, int count,
1530                                        BdrvRequestFlags flags)
1531 {
1532     trace_bdrv_co_pwrite_zeroes(bs, offset, count, flags);
1533 
1534     if (!(bs->open_flags & BDRV_O_UNMAP)) {
1535         flags &= ~BDRV_REQ_MAY_UNMAP;
1536     }
1537 
1538     return bdrv_co_pwritev(bs, offset, count, NULL,
1539                            BDRV_REQ_ZERO_WRITE | flags);
1540 }
1541 
1542 typedef struct BdrvCoGetBlockStatusData {
1543     BlockDriverState *bs;
1544     BlockDriverState *base;
1545     BlockDriverState **file;
1546     int64_t sector_num;
1547     int nb_sectors;
1548     int *pnum;
1549     int64_t ret;
1550     bool done;
1551 } BdrvCoGetBlockStatusData;
1552 
1553 /*
1554  * Returns the allocation status of the specified sectors.
1555  * Drivers not implementing the functionality are assumed to not support
1556  * backing files, hence all their sectors are reported as allocated.
1557  *
1558  * If 'sector_num' is beyond the end of the disk image the return value is 0
1559  * and 'pnum' is set to 0.
1560  *
1561  * 'pnum' is set to the number of sectors (including and immediately following
1562  * the specified sector) that are known to be in the same
1563  * allocated/unallocated state.
1564  *
1565  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1566  * beyond the end of the disk image it will be clamped.
1567  *
1568  * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
1569  * points to the BDS which the sector range is allocated in.
1570  */
1571 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1572                                                      int64_t sector_num,
1573                                                      int nb_sectors, int *pnum,
1574                                                      BlockDriverState **file)
1575 {
1576     int64_t total_sectors;
1577     int64_t n;
1578     int64_t ret, ret2;
1579 
1580     total_sectors = bdrv_nb_sectors(bs);
1581     if (total_sectors < 0) {
1582         return total_sectors;
1583     }
1584 
1585     if (sector_num >= total_sectors) {
1586         *pnum = 0;
1587         return 0;
1588     }
1589 
1590     n = total_sectors - sector_num;
1591     if (n < nb_sectors) {
1592         nb_sectors = n;
1593     }
1594 
1595     if (!bs->drv->bdrv_co_get_block_status) {
1596         *pnum = nb_sectors;
1597         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1598         if (bs->drv->protocol_name) {
1599             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1600         }
1601         return ret;
1602     }
1603 
1604     *file = NULL;
1605     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1606                                             file);
1607     if (ret < 0) {
1608         *pnum = 0;
1609         return ret;
1610     }
1611 
1612     if (ret & BDRV_BLOCK_RAW) {
1613         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1614         return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1615                                      *pnum, pnum, file);
1616     }
1617 
1618     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1619         ret |= BDRV_BLOCK_ALLOCATED;
1620     } else {
1621         if (bdrv_unallocated_blocks_are_zero(bs)) {
1622             ret |= BDRV_BLOCK_ZERO;
1623         } else if (bs->backing) {
1624             BlockDriverState *bs2 = bs->backing->bs;
1625             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1626             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1627                 ret |= BDRV_BLOCK_ZERO;
1628             }
1629         }
1630     }
1631 
1632     if (*file && *file != bs &&
1633         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1634         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1635         BlockDriverState *file2;
1636         int file_pnum;
1637 
1638         ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1639                                         *pnum, &file_pnum, &file2);
1640         if (ret2 >= 0) {
1641             /* Ignore errors.  This is just providing extra information; it
1642              * is useful but not necessary.
1643              */
1644             if (!file_pnum) {
1645                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1646                  * perfectly valid for the format block driver to point to such
1647                  * offsets, so catch it and mark everything as zero */
1648                 ret |= BDRV_BLOCK_ZERO;
1649             } else {
1650                 /* Limit request to the range reported by the protocol driver */
1651                 *pnum = file_pnum;
1652                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1653             }
1654         }
1655     }
1656 
1657     return ret;
1658 }
1659 
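/*
 * Illustrative caller sketch (not part of the original file): how the status
 * bits returned above are typically consumed.  The surrounding logic is
 * hypothetical; only the called function and the BDRV_BLOCK_* flags come from
 * this file's API.
 *
 *     int pnum;
 *     BlockDriverState *file;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors,
 *                                         &pnum, &file);
 *     if (ret < 0) {
 *         // error
 *     } else if (ret & BDRV_BLOCK_ZERO) {
 *         // the first pnum sectors read as zeroes; no I/O needed
 *     } else if (ret & BDRV_BLOCK_OFFSET_VALID) {
 *         // data lives in 'file' at offset (ret & BDRV_BLOCK_OFFSET_MASK)
 *     }
 */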
1660 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1661         BlockDriverState *base,
1662         int64_t sector_num,
1663         int nb_sectors,
1664         int *pnum,
1665         BlockDriverState **file)
1666 {
1667     BlockDriverState *p;
1668     int64_t ret = 0;
1669 
1670     assert(bs != base);
1671     for (p = bs; p != base; p = backing_bs(p)) {
1672         ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1673         if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1674             break;
1675         }
1676         /* [sector_num, *pnum] is unallocated on this layer; it may cover only
1677          * the first part of the requested [sector_num, nb_sectors] range.  */
1678         nb_sectors = MIN(nb_sectors, *pnum);
1679     }
1680     return ret;
1681 }
1682 
1683 /* Coroutine wrapper for bdrv_get_block_status_above() */
1684 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1685 {
1686     BdrvCoGetBlockStatusData *data = opaque;
1687 
1688     data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1689                                                data->sector_num,
1690                                                data->nb_sectors,
1691                                                data->pnum,
1692                                                data->file);
1693     data->done = true;
1694 }
1695 
1696 /*
1697  * Synchronous wrapper around bdrv_co_get_block_status_above().
1698  *
1699  * See bdrv_co_get_block_status_above() for details.
1700  */
1701 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1702                                     BlockDriverState *base,
1703                                     int64_t sector_num,
1704                                     int nb_sectors, int *pnum,
1705                                     BlockDriverState **file)
1706 {
1707     Coroutine *co;
1708     BdrvCoGetBlockStatusData data = {
1709         .bs = bs,
1710         .base = base,
1711         .file = file,
1712         .sector_num = sector_num,
1713         .nb_sectors = nb_sectors,
1714         .pnum = pnum,
1715         .done = false,
1716     };
1717 
1718     if (qemu_in_coroutine()) {
1719         /* Fast-path if already in coroutine context */
1720         bdrv_get_block_status_above_co_entry(&data);
1721     } else {
1722         AioContext *aio_context = bdrv_get_aio_context(bs);
1723 
1724         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
1725         qemu_coroutine_enter(co, &data);
1726         while (!data.done) {
1727             aio_poll(aio_context, true);
1728         }
1729     }
1730     return data.ret;
1731 }
1732 
1733 int64_t bdrv_get_block_status(BlockDriverState *bs,
1734                               int64_t sector_num,
1735                               int nb_sectors, int *pnum,
1736                               BlockDriverState **file)
1737 {
1738     return bdrv_get_block_status_above(bs, backing_bs(bs),
1739                                        sector_num, nb_sectors, pnum, file);
1740 }
1741 
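/* Return 1 if the given sectors are allocated in this layer (i.e. reading them
 * does not fall through to the backing file), 0 if not, or a negative errno.
 * '*pnum' is set as described for bdrv_get_block_status(). */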
1742 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1743                                    int nb_sectors, int *pnum)
1744 {
1745     BlockDriverState *file;
1746     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1747                                         &file);
1748     if (ret < 0) {
1749         return ret;
1750     }
1751     return !!(ret & BDRV_BLOCK_ALLOCATED);
1752 }
1753 
1754 /*
1755  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1756  *
1757  * Return true if the given sector is allocated in any image between
1758  * BASE (exclusive) and TOP (inclusive).  BASE can be NULL to check if the
1759  * given sector is allocated in any image of the chain.  Return false otherwise.
1760  *
1761  * 'pnum' is set to the number of sectors (including and immediately following
1762  *  the specified sector) that are known to be in the same
1763  *  allocated/unallocated state.
1764  *
1765  */
1766 int bdrv_is_allocated_above(BlockDriverState *top,
1767                             BlockDriverState *base,
1768                             int64_t sector_num,
1769                             int nb_sectors, int *pnum)
1770 {
1771     BlockDriverState *intermediate;
1772     int ret, n = nb_sectors;
1773 
1774     intermediate = top;
1775     while (intermediate && intermediate != base) {
1776         int pnum_inter;
1777         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1778                                 &pnum_inter);
1779         if (ret < 0) {
1780             return ret;
1781         } else if (ret) {
1782             *pnum = pnum_inter;
1783             return 1;
1784         }
1785 
1786         /*
1787          * [sector_num, nb_sectors] is unallocated on top, but an intermediate
1788          * image might still have
1789          *
1790          * [sector_num + x, nb_sectors - x] allocated.
1791          */
1792         if (n > pnum_inter &&
1793             (intermediate == top ||
1794              sector_num + pnum_inter < intermediate->total_sectors)) {
1795             n = pnum_inter;
1796         }
1797 
1798         intermediate = backing_bs(intermediate);
1799     }
1800 
1801     *pnum = n;
1802     return 0;
1803 }
1804 
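/* Synchronous compressed write.  Returns -ENOMEDIUM without a driver and
 * -ENOTSUP if the driver does not provide bdrv_write_compressed; it also
 * asserts that no dirty bitmaps are in use, since compressed writes are not
 * tracked here. */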
1805 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1806                           const uint8_t *buf, int nb_sectors)
1807 {
1808     BlockDriver *drv = bs->drv;
1809     int ret;
1810 
1811     if (!drv) {
1812         return -ENOMEDIUM;
1813     }
1814     if (!drv->bdrv_write_compressed) {
1815         return -ENOTSUP;
1816     }
1817     ret = bdrv_check_request(bs, sector_num, nb_sectors);
1818     if (ret < 0) {
1819         return ret;
1820     }
1821 
1822     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1823 
1824     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1825 }
1826 
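/* Convenience wrapper: packs 'buf' into a single-element QEMUIOVector and
 * forwards to bdrv_writev_vmstate(). */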
1827 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1828                       int64_t pos, int size)
1829 {
1830     QEMUIOVector qiov;
1831     struct iovec iov = {
1832         .iov_base   = (void *) buf,
1833         .iov_len    = size,
1834     };
1835 
1836     qemu_iovec_init_external(&qiov, &iov, 1);
1837     return bdrv_writev_vmstate(bs, &qiov, pos);
1838 }
1839 
1840 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1841 {
1842     BlockDriver *drv = bs->drv;
1843 
1844     if (!drv) {
1845         return -ENOMEDIUM;
1846     } else if (drv->bdrv_save_vmstate) {
1847         return drv->bdrv_save_vmstate(bs, qiov, pos);
1848     } else if (bs->file) {
1849         return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
1850     }
1851 
1852     return -ENOTSUP;
1853 }
1854 
1855 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1856                       int64_t pos, int size)
1857 {
1858     BlockDriver *drv = bs->drv;
1859     if (!drv)
1860         return -ENOMEDIUM;
1861     if (drv->bdrv_load_vmstate)
1862         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1863     if (bs->file)
1864         return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
1865     return -ENOTSUP;
1866 }
1867 
1868 /**************************************************************/
1869 /* async I/Os */
1870 
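/*
 * Illustrative caller pattern for the AIO entry points below (a sketch, not
 * part of the original file).  'my_read_cb' and 'my_state' are hypothetical;
 * the entry point and the callback signature are the ones defined here.
 *
 *     static void my_read_cb(void *opaque, int ret)
 *     {
 *         // ret is 0 on success or a negative errno
 *     }
 *
 *     acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
 *                          my_read_cb, my_state);
 */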
1871 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1872                            QEMUIOVector *qiov, int nb_sectors,
1873                            BlockCompletionFunc *cb, void *opaque)
1874 {
1875     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1876 
1877     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1878                                  cb, opaque, false);
1879 }
1880 
1881 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1882                             QEMUIOVector *qiov, int nb_sectors,
1883                             BlockCompletionFunc *cb, void *opaque)
1884 {
1885     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1886 
1887     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1888                                  cb, opaque, true);
1889 }
1890 
1891 void bdrv_aio_cancel(BlockAIOCB *acb)
1892 {
1893     qemu_aio_ref(acb);
1894     bdrv_aio_cancel_async(acb);
1895     while (acb->refcnt > 1) {
1896         if (acb->aiocb_info->get_aio_context) {
1897             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
1898         } else if (acb->bs) {
1899             aio_poll(bdrv_get_aio_context(acb->bs), true);
1900         } else {
1901             abort();
1902         }
1903     }
1904     qemu_aio_unref(acb);
1905 }
1906 
1907 /* Async version of aio cancel. The caller is not blocked if the acb implements
1908  * cancel_async, otherwise we do nothing and let the request complete normally.
1909  * In either case the completion callback must be called. */
1910 void bdrv_aio_cancel_async(BlockAIOCB *acb)
1911 {
1912     if (acb->aiocb_info->cancel_async) {
1913         acb->aiocb_info->cancel_async(acb);
1914     }
1915 }
1916 
1917 /**************************************************************/
1918 /* async block device emulation */
1919 
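/* Per-request state for the coroutine-based AIO emulation below.  The union
 * distinguishes the sector-based read/write/discard form from the ioctl
 * form. */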
1920 typedef struct BlockRequest {
1921     union {
1922         /* Used during read, write, trim */
1923         struct {
1924             int64_t sector;
1925             int nb_sectors;
1926             int flags;
1927             QEMUIOVector *qiov;
1928         };
1929         /* Used during ioctl */
1930         struct {
1931             int req;
1932             void *buf;
1933         };
1934     };
1935     BlockCompletionFunc *cb;
1936     void *opaque;
1937 
1938     int error;
1939 } BlockRequest;
1940 
1941 typedef struct BlockAIOCBCoroutine {
1942     BlockAIOCB common;
1943     BlockRequest req;
1944     bool is_write;
1945     bool need_bh;
1946     bool *done;
1947     QEMUBH* bh;
1948 } BlockAIOCBCoroutine;
1949 
1950 static const AIOCBInfo bdrv_em_co_aiocb_info = {
1951     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
1952 };
1953 
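/* Deliver the completion callback.  While 'need_bh' is set the request is
 * still inside bdrv_co_aio_rw_vector() (or one of its siblings), so completion
 * is deferred: bdrv_co_maybe_schedule_bh() later clears the flag and, if the
 * request has already finished, schedules a bottom half that calls back into
 * bdrv_co_complete().  This guarantees the caller's callback never runs before
 * it has received the BlockAIOCB pointer. */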
1954 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
1955 {
1956     if (!acb->need_bh) {
1957         acb->common.cb(acb->common.opaque, acb->req.error);
1958         qemu_aio_unref(acb);
1959     }
1960 }
1961 
1962 static void bdrv_co_em_bh(void *opaque)
1963 {
1964     BlockAIOCBCoroutine *acb = opaque;
1965 
1966     assert(!acb->need_bh);
1967     qemu_bh_delete(acb->bh);
1968     bdrv_co_complete(acb);
1969 }
1970 
1971 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
1972 {
1973     acb->need_bh = false;
1974     if (acb->req.error != -EINPROGRESS) {
1975         BlockDriverState *bs = acb->common.bs;
1976 
1977         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
1978         qemu_bh_schedule(acb->bh);
1979     }
1980 }
1981 
1982 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
1983 static void coroutine_fn bdrv_co_do_rw(void *opaque)
1984 {
1985     BlockAIOCBCoroutine *acb = opaque;
1986     BlockDriverState *bs = acb->common.bs;
1987 
1988     if (!acb->is_write) {
1989         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
1990             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
1991     } else {
1992         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
1993             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
1994     }
1995 
1996     bdrv_co_complete(acb);
1997 }
1998 
1999 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2000                                          int64_t sector_num,
2001                                          QEMUIOVector *qiov,
2002                                          int nb_sectors,
2003                                          BdrvRequestFlags flags,
2004                                          BlockCompletionFunc *cb,
2005                                          void *opaque,
2006                                          bool is_write)
2007 {
2008     Coroutine *co;
2009     BlockAIOCBCoroutine *acb;
2010 
2011     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2012     acb->need_bh = true;
2013     acb->req.error = -EINPROGRESS;
2014     acb->req.sector = sector_num;
2015     acb->req.nb_sectors = nb_sectors;
2016     acb->req.qiov = qiov;
2017     acb->req.flags = flags;
2018     acb->is_write = is_write;
2019 
2020     co = qemu_coroutine_create(bdrv_co_do_rw);
2021     qemu_coroutine_enter(co, acb);
2022 
2023     bdrv_co_maybe_schedule_bh(acb);
2024     return &acb->common;
2025 }
2026 
2027 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2028 {
2029     BlockAIOCBCoroutine *acb = opaque;
2030     BlockDriverState *bs = acb->common.bs;
2031 
2032     acb->req.error = bdrv_co_flush(bs);
2033     bdrv_co_complete(acb);
2034 }
2035 
2036 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2037         BlockCompletionFunc *cb, void *opaque)
2038 {
2039     trace_bdrv_aio_flush(bs, opaque);
2040 
2041     Coroutine *co;
2042     BlockAIOCBCoroutine *acb;
2043 
2044     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2045     acb->need_bh = true;
2046     acb->req.error = -EINPROGRESS;
2047 
2048     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2049     qemu_coroutine_enter(co, acb);
2050 
2051     bdrv_co_maybe_schedule_bh(acb);
2052     return &acb->common;
2053 }
2054 
2055 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2056 {
2057     BlockAIOCBCoroutine *acb = opaque;
2058     BlockDriverState *bs = acb->common.bs;
2059 
2060     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2061     bdrv_co_complete(acb);
2062 }
2063 
2064 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2065         int64_t sector_num, int nb_sectors,
2066         BlockCompletionFunc *cb, void *opaque)
2067 {
2068     Coroutine *co;
2069     BlockAIOCBCoroutine *acb;
2070 
2071     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2072 
2073     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2074     acb->need_bh = true;
2075     acb->req.error = -EINPROGRESS;
2076     acb->req.sector = sector_num;
2077     acb->req.nb_sectors = nb_sectors;
2078     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2079     qemu_coroutine_enter(co, acb);
2080 
2081     bdrv_co_maybe_schedule_bh(acb);
2082     return &acb->common;
2083 }
2084 
2085 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2086                    BlockCompletionFunc *cb, void *opaque)
2087 {
2088     BlockAIOCB *acb;
2089 
2090     acb = g_malloc(aiocb_info->aiocb_size);
2091     acb->aiocb_info = aiocb_info;
2092     acb->bs = bs;
2093     acb->cb = cb;
2094     acb->opaque = opaque;
2095     acb->refcnt = 1;
2096     return acb;
2097 }
2098 
2099 void qemu_aio_ref(void *p)
2100 {
2101     BlockAIOCB *acb = p;
2102     acb->refcnt++;
2103 }
2104 
2105 void qemu_aio_unref(void *p)
2106 {
2107     BlockAIOCB *acb = p;
2108     assert(acb->refcnt > 0);
2109     if (--acb->refcnt == 0) {
2110         g_free(acb);
2111     }
2112 }
2113 
2114 /**************************************************************/
2115 /* Coroutine block device emulation */
2116 
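/* Coroutine entry point for the synchronous bdrv_flush() wrapper below. */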
2117 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2118 {
2119     RwCo *rwco = opaque;
2120 
2121     rwco->ret = bdrv_co_flush(rwco->bs);
2122 }
2123 
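/* Flush completed writes in stages: a single driver-level flush if the driver
 * provides bdrv_co_flush; otherwise flush internal caches to the OS
 * (bdrv_co_flush_to_os), then, unless BDRV_O_NO_FLUSH (cache=unsafe) is set,
 * flush the host cache to the disk (bdrv_co_flush_to_disk or bdrv_aio_flush),
 * and finally recurse into the protocol layer via bs->file. */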
2124 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2125 {
2126     int ret;
2127     BdrvTrackedRequest req;
2128 
2129     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2130         bdrv_is_sg(bs)) {
2131         return 0;
2132     }
2133 
2134     tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2135 
2136     /* Write back all layers by calling one driver function */
2137     if (bs->drv->bdrv_co_flush) {
2138         ret = bs->drv->bdrv_co_flush(bs);
2139         goto out;
2140     }
2141 
2142     /* Write back cached data to the OS even with cache=unsafe */
2143     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2144     if (bs->drv->bdrv_co_flush_to_os) {
2145         ret = bs->drv->bdrv_co_flush_to_os(bs);
2146         if (ret < 0) {
2147             goto out;
2148         }
2149     }
2150 
2151     /* But don't actually force it to the disk with cache=unsafe */
2152     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2153         goto flush_parent;
2154     }
2155 
2156     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2157     if (bs->drv->bdrv_co_flush_to_disk) {
2158         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2159     } else if (bs->drv->bdrv_aio_flush) {
2160         BlockAIOCB *acb;
2161         CoroutineIOCompletion co = {
2162             .coroutine = qemu_coroutine_self(),
2163         };
2164 
2165         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2166         if (acb == NULL) {
2167             ret = -EIO;
2168         } else {
2169             qemu_coroutine_yield();
2170             ret = co.ret;
2171         }
2172     } else {
2173         /*
2174          * Some block drivers always operate in either writethrough or unsafe
2175          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2176          * know how the server works (because the behaviour is hardcoded or
2177          * depends on server-side configuration), so we can't ensure that
2178          * everything is safe on disk. Returning an error doesn't work because
2179          * that would break guests even if the server operates in writethrough
2180          * mode.
2181          *
2182          * Let's hope the user knows what they're doing.
2183          */
2184         ret = 0;
2185     }
2186     if (ret < 0) {
2187         goto out;
2188     }
2189 
2190     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2191      * in the case of cache=unsafe, so there are no useless flushes.
2192      */
2193 flush_parent:
2194     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2195 out:
2196     tracked_request_end(&req);
2197     return ret;
2198 }
2199 
2200 int bdrv_flush(BlockDriverState *bs)
2201 {
2202     Coroutine *co;
2203     RwCo rwco = {
2204         .bs = bs,
2205         .ret = NOT_DONE,
2206     };
2207 
2208     if (qemu_in_coroutine()) {
2209         /* Fast-path if already in coroutine context */
2210         bdrv_flush_co_entry(&rwco);
2211     } else {
2212         AioContext *aio_context = bdrv_get_aio_context(bs);
2213 
2214         co = qemu_coroutine_create(bdrv_flush_co_entry);
2215         qemu_coroutine_enter(co, &rwco);
2216         while (rwco.ret == NOT_DONE) {
2217             aio_poll(aio_context, true);
2218         }
2219     }
2220 
2221     return rwco.ret;
2222 }
2223 
2224 typedef struct DiscardCo {
2225     BlockDriverState *bs;
2226     int64_t sector_num;
2227     int nb_sectors;
2228     int ret;
2229 } DiscardCo;
2230 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2231 {
2232     DiscardCo *rwco = opaque;
2233 
2234     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2235 }
2236 
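/* Discard (unmap) a sector range.  The request is silently ignored (returns 0)
 * when BDRV_O_UNMAP is not set or the driver implements neither
 * bdrv_co_discard nor bdrv_aio_discard.  Otherwise the range is marked dirty
 * and split into pieces honouring bl.discard_alignment and bl.max_discard
 * before being handed to the driver. */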
2237 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2238                                  int nb_sectors)
2239 {
2240     BdrvTrackedRequest req;
2241     int max_discard, ret;
2242 
2243     if (!bs->drv) {
2244         return -ENOMEDIUM;
2245     }
2246 
2247     ret = bdrv_check_request(bs, sector_num, nb_sectors);
2248     if (ret < 0) {
2249         return ret;
2250     } else if (bs->read_only) {
2251         return -EPERM;
2252     }
2253     assert(!(bs->open_flags & BDRV_O_INACTIVE));
2254 
2255     /* Do nothing if disabled.  */
2256     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2257         return 0;
2258     }
2259 
2260     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2261         return 0;
2262     }
2263 
2264     tracked_request_begin(&req, bs, sector_num, nb_sectors,
2265                           BDRV_TRACKED_DISCARD);
2266     bdrv_set_dirty(bs, sector_num, nb_sectors);
2267 
2268     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2269     while (nb_sectors > 0) {
2270         int ret;
2271         int num = nb_sectors;
2272 
2273         /* align request */
2274         if (bs->bl.discard_alignment &&
2275             num >= bs->bl.discard_alignment &&
2276             sector_num % bs->bl.discard_alignment) {
2277             if (num > bs->bl.discard_alignment) {
2278                 num = bs->bl.discard_alignment;
2279             }
2280             num -= sector_num % bs->bl.discard_alignment;
2281         }
2282 
2283         /* limit request size */
2284         if (num > max_discard) {
2285             num = max_discard;
2286         }
2287 
2288         if (bs->drv->bdrv_co_discard) {
2289             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2290         } else {
2291             BlockAIOCB *acb;
2292             CoroutineIOCompletion co = {
2293                 .coroutine = qemu_coroutine_self(),
2294             };
2295 
2296             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
2297                                             bdrv_co_io_em_complete, &co);
2298             if (acb == NULL) {
2299                 ret = -EIO;
2300                 goto out;
2301             } else {
2302                 qemu_coroutine_yield();
2303                 ret = co.ret;
2304             }
2305         }
2306         if (ret && ret != -ENOTSUP) {
2307             goto out;
2308         }
2309 
2310         sector_num += num;
2311         nb_sectors -= num;
2312     }
2313     ret = 0;
2314 out:
2315     tracked_request_end(&req);
2316     return ret;
2317 }
2318 
2319 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2320 {
2321     Coroutine *co;
2322     DiscardCo rwco = {
2323         .bs = bs,
2324         .sector_num = sector_num,
2325         .nb_sectors = nb_sectors,
2326         .ret = NOT_DONE,
2327     };
2328 
2329     if (qemu_in_coroutine()) {
2330         /* Fast-path if already in coroutine context */
2331         bdrv_discard_co_entry(&rwco);
2332     } else {
2333         AioContext *aio_context = bdrv_get_aio_context(bs);
2334 
2335         co = qemu_coroutine_create(bdrv_discard_co_entry);
2336         qemu_coroutine_enter(co, &rwco);
2337         while (rwco.ret == NOT_DONE) {
2338             aio_poll(aio_context, true);
2339         }
2340     }
2341 
2342     return rwco.ret;
2343 }
2344 
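/* Issue a driver ioctl from coroutine context.  The request is tracked like
 * other I/O; drivers without bdrv_aio_ioctl produce -ENOTSUP. */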
2345 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2346 {
2347     BlockDriver *drv = bs->drv;
2348     BdrvTrackedRequest tracked_req;
2349     CoroutineIOCompletion co = {
2350         .coroutine = qemu_coroutine_self(),
2351     };
2352     BlockAIOCB *acb;
2353 
2354     tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2355     if (!drv || !drv->bdrv_aio_ioctl) {
2356         co.ret = -ENOTSUP;
2357         goto out;
2358     }
2359 
2360     acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2361     if (!acb) {
2362         co.ret = -ENOTSUP;
2363         goto out;
2364     }
2365     qemu_coroutine_yield();
2366 out:
2367     tracked_request_end(&tracked_req);
2368     return co.ret;
2369 }
2370 
2371 typedef struct {
2372     BlockDriverState *bs;
2373     int req;
2374     void *buf;
2375     int ret;
2376 } BdrvIoctlCoData;
2377 
2378 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2379 {
2380     BdrvIoctlCoData *data = opaque;
2381     data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2382 }
2383 
2384 /* needed for generic scsi interface */
2385 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2386 {
2387     BdrvIoctlCoData data = {
2388         .bs = bs,
2389         .req = req,
2390         .buf = buf,
2391         .ret = -EINPROGRESS,
2392     };
2393 
2394     if (qemu_in_coroutine()) {
2395         /* Fast-path if already in coroutine context */
2396         bdrv_co_ioctl_entry(&data);
2397     } else {
2398         Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
2399 
2400         qemu_coroutine_enter(co, &data);
2401         while (data.ret == -EINPROGRESS) {
2402             aio_poll(bdrv_get_aio_context(bs), true);
2403         }
2404     }
2405     return data.ret;
2406 }
2407 
2408 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2409 {
2410     BlockAIOCBCoroutine *acb = opaque;
2411     acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2412                                       acb->req.req, acb->req.buf);
2413     bdrv_co_complete(acb);
2414 }
2415 
2416 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2417         unsigned long int req, void *buf,
2418         BlockCompletionFunc *cb, void *opaque)
2419 {
2420     BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2421                                             bs, cb, opaque);
2422     Coroutine *co;
2423 
2424     acb->need_bh = true;
2425     acb->req.error = -EINPROGRESS;
2426     acb->req.req = req;
2427     acb->req.buf = buf;
2428     co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
2429     qemu_coroutine_enter(co, acb);
2430 
2431     bdrv_co_maybe_schedule_bh(acb);
2432     return &acb->common;
2433 }
2434 
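/* Buffer allocation helpers: qemu_blockalign() aligns to bdrv_opt_mem_align()
 * and aborts on failure, the *0 variants additionally zero the buffer, and the
 * qemu_try_* variants return NULL instead of aborting. */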
2435 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2436 {
2437     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2438 }
2439 
2440 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2441 {
2442     return memset(qemu_blockalign(bs, size), 0, size);
2443 }
2444 
2445 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2446 {
2447     size_t align = bdrv_opt_mem_align(bs);
2448 
2449     /* Ensure that NULL is never returned on success */
2450     assert(align > 0);
2451     if (size == 0) {
2452         size = align;
2453     }
2454 
2455     return qemu_try_memalign(align, size);
2456 }
2457 
2458 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2459 {
2460     void *mem = qemu_try_blockalign(bs, size);
2461 
2462     if (mem) {
2463         memset(mem, 0, size);
2464     }
2465 
2466     return mem;
2467 }
2468 
2469 /*
2470  * Check if all memory in this vector is sector aligned.
2471  */
2472 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2473 {
2474     int i;
2475     size_t alignment = bdrv_min_mem_align(bs);
2476 
2477     for (i = 0; i < qiov->niov; i++) {
2478         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2479             return false;
2480         }
2481         if (qiov->iov[i].iov_len % alignment) {
2482             return false;
2483         }
2484     }
2485 
2486     return true;
2487 }
2488 
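/* Register a notifier that runs before each write request on this BDS; a
 * notifier returning an error fails the write request. */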
2489 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2490                                     NotifierWithReturn *notifier)
2491 {
2492     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2493 }
2494 
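/* I/O plugging: while bs->io_plugged is non-zero (and plugging has not been
 * temporarily disabled via bdrv_io_unplugged_begin()), drivers may batch
 * queued requests and submit them in one go on unplug.  Both counters are
 * propagated recursively to all children. */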
2495 void bdrv_io_plug(BlockDriverState *bs)
2496 {
2497     BdrvChild *child;
2498 
2499     QLIST_FOREACH(child, &bs->children, next) {
2500         bdrv_io_plug(child->bs);
2501     }
2502 
2503     if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) {
2504         BlockDriver *drv = bs->drv;
2505         if (drv && drv->bdrv_io_plug) {
2506             drv->bdrv_io_plug(bs);
2507         }
2508     }
2509 }
2510 
2511 void bdrv_io_unplug(BlockDriverState *bs)
2512 {
2513     BdrvChild *child;
2514 
2515     assert(bs->io_plugged);
2516     if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) {
2517         BlockDriver *drv = bs->drv;
2518         if (drv && drv->bdrv_io_unplug) {
2519             drv->bdrv_io_unplug(bs);
2520         }
2521     }
2522 
2523     QLIST_FOREACH(child, &bs->children, next) {
2524         bdrv_io_unplug(child->bs);
2525     }
2526 }
2527 
2528 void bdrv_io_unplugged_begin(BlockDriverState *bs)
2529 {
2530     BdrvChild *child;
2531 
2532     if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) {
2533         BlockDriver *drv = bs->drv;
2534         if (drv && drv->bdrv_io_unplug) {
2535             drv->bdrv_io_unplug(bs);
2536         }
2537     }
2538 
2539     QLIST_FOREACH(child, &bs->children, next) {
2540         bdrv_io_unplugged_begin(child->bs);
2541     }
2542 }
2543 
2544 void bdrv_io_unplugged_end(BlockDriverState *bs)
2545 {
2546     BdrvChild *child;
2547 
2548     assert(bs->io_plug_disabled);
2549     QLIST_FOREACH(child, &bs->children, next) {
2550         bdrv_io_unplugged_end(child->bs);
2551     }
2552 
2553     if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) {
2554         BlockDriver *drv = bs->drv;
2555         if (drv && drv->bdrv_io_plug) {
2556             drv->bdrv_io_plug(bs);
2557         }
2558     }
2559 }
2560