xref: /qemu/hw/block/virtio-blk.c (revision 226419d6)
1 /*
2  * Virtio Block Device
3  *
4  * Copyright IBM, Corp. 2007
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qemu-common.h"
16 #include "qemu/iov.h"
17 #include "qemu/error-report.h"
18 #include "trace.h"
19 #include "hw/block/block.h"
20 #include "sysemu/block-backend.h"
21 #include "sysemu/blockdev.h"
22 #include "hw/virtio/virtio-blk.h"
23 #include "dataplane/virtio-blk.h"
24 #include "block/scsi.h"
25 #ifdef __linux__
26 # include <scsi/sg.h>
27 #endif
28 #include "hw/virtio/virtio-bus.h"
29 #include "hw/virtio/virtio-access.h"
30 
31 void virtio_blk_init_request(VirtIOBlock *s, VirtIOBlockReq *req)
32 {
33     req->dev = s;
34     req->qiov.size = 0;
35     req->in_len = 0;
36     req->next = NULL;
37     req->mr_next = NULL;
38 }
39 
40 void virtio_blk_free_request(VirtIOBlockReq *req)
41 {
42     if (req) {
43         g_free(req);
44     }
45 }
46 
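/* Fill in the status byte of the request's in header, push the element back
 * onto the virtqueue, and notify the guest either through the dataplane
 * notifier or the regular virtio interrupt path. */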
47 static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
48 {
49     VirtIOBlock *s = req->dev;
50     VirtIODevice *vdev = VIRTIO_DEVICE(s);
51 
52     trace_virtio_blk_req_complete(req, status);
53 
54     stb_p(&req->in->status, status);
55     virtqueue_push(s->vq, &req->elem, req->in_len);
56     if (s->dataplane) {
57         virtio_blk_data_plane_notify(s->dataplane);
58     } else {
59         virtio_notify(vdev, s->vq);
60     }
61 }
62 
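/* Apply the configured rerror/werror policy to a failed request.  Returns
 * nonzero if the caller must not complete the request (it was either parked
 * on s->rq for a later retry or already completed with VIRTIO_BLK_S_IOERR),
 * and zero if the error is ignored and the request completes normally. */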
63 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
64     bool is_read)
65 {
66     BlockErrorAction action = blk_get_error_action(req->dev->blk,
67                                                    is_read, error);
68     VirtIOBlock *s = req->dev;
69 
70     if (action == BLOCK_ERROR_ACTION_STOP) {
71         /* Break the link as the next request is going to be parsed from the
72          * ring again. Otherwise we may end up doing a double completion! */
73         req->mr_next = NULL;
74         req->next = s->rq;
75         s->rq = req;
76     } else if (action == BLOCK_ERROR_ACTION_REPORT) {
77         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
78         block_acct_failed(blk_get_stats(s->blk), &req->acct);
79         virtio_blk_free_request(req);
80     }
81 
82     blk_error_action(s->blk, action, is_read, error);
83     return action != BLOCK_ERROR_ACTION_IGNORE;
84 }
85 
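/* Completion callback for reads and writes.  A single AIO may cover several
 * merged guest requests, so walk the mr_next chain and complete each request
 * with the same return value. */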
86 static void virtio_blk_rw_complete(void *opaque, int ret)
87 {
88     VirtIOBlockReq *next = opaque;
89 
90     while (next) {
91         VirtIOBlockReq *req = next;
92         next = req->mr_next;
93         trace_virtio_blk_rw_complete(req, ret);
94 
95         if (req->qiov.nalloc != -1) {
96             /* If nalloc is != -1 req->qiov is a local copy of the original
97              * external iovec. It was allocated in submit_requests
98              * to be able to merge requests. */
99             qemu_iovec_destroy(&req->qiov);
100         }
101 
102         if (ret) {
103             int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
104             bool is_read = !(p & VIRTIO_BLK_T_OUT);
105             /* Note that memory may be dirtied on read failure.  If the
106              * virtio request is not completed here, as is the case for
107              * BLOCK_ERROR_ACTION_STOP, the memory may not be copied
108              * correctly during live migration.  While this is ugly,
109              * it is acceptable because the device is free to write to
110              * the memory until the request is completed (which will
111              * happen on the other side of the migration).
112              */
113             if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
114                 continue;
115             }
116         }
117 
118         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
119         block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
120         virtio_blk_free_request(req);
121     }
122 }
123 
124 static void virtio_blk_flush_complete(void *opaque, int ret)
125 {
126     VirtIOBlockReq *req = opaque;
127 
128     if (ret) {
129         if (virtio_blk_handle_rw_error(req, -ret, 0)) {
130             return;
131         }
132     }
133 
134     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
135     block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
136     virtio_blk_free_request(req);
137 }
138 
139 #ifdef __linux__
140 
141 typedef struct {
142     VirtIOBlockReq *req;
143     struct sg_io_hdr hdr;
144 } VirtIOBlockIoctlReq;
145 
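/* Completion callback for SG_IO ioctls.  Translate the sg_io_hdr result into
 * the virtio_scsi_inhdr that sits in the second-to-last input segment and
 * complete the request. */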
146 static void virtio_blk_ioctl_complete(void *opaque, int status)
147 {
148     VirtIOBlockIoctlReq *ioctl_req = opaque;
149     VirtIOBlockReq *req = ioctl_req->req;
150     VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
151     struct virtio_scsi_inhdr *scsi;
152     struct sg_io_hdr *hdr;
153 
154     scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;
155 
156     if (status) {
157         status = VIRTIO_BLK_S_UNSUPP;
158         virtio_stl_p(vdev, &scsi->errors, 255);
159         goto out;
160     }
161 
162     hdr = &ioctl_req->hdr;
163     /*
164      * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi)
165      * clear the masked_status field [hence status gets cleared too, see
166      * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED
167      * status has occurred.  However they do set DRIVER_SENSE in driver_status
168      * field. Also a (sb_len_wr > 0) indicates there is a sense buffer.
169      */
170     if (hdr->status == 0 && hdr->sb_len_wr > 0) {
171         hdr->status = CHECK_CONDITION;
172     }
173 
174     virtio_stl_p(vdev, &scsi->errors,
175                  hdr->status | (hdr->msg_status << 8) |
176                  (hdr->host_status << 16) | (hdr->driver_status << 24));
177     virtio_stl_p(vdev, &scsi->residual, hdr->resid);
178     virtio_stl_p(vdev, &scsi->sense_len, hdr->sb_len_wr);
179     virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
180 
181 out:
182     virtio_blk_req_complete(req, status);
183     virtio_blk_free_request(req);
184     g_free(ioctl_req);
185 }
186 
187 #endif
188 
189 static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
190 {
191     VirtIOBlockReq *req = virtqueue_pop(s->vq, sizeof(VirtIOBlockReq));
192 
193     if (req) {
194         virtio_blk_init_request(s, req);
195     }
196     return req;
197 }
198 
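/* Handle a VIRTIO_BLK_T_SCSI_CMD request.  The guest places the
 * virtio_blk_outhdr and the CDB in the first two output segments (followed by
 * an optional write payload), and the sense buffer, virtio_scsi_inhdr and
 * virtio_blk_inhdr in the last three input segments (preceded by an optional
 * read payload).  Returns -EINPROGRESS if the SG_IO ioctl was issued
 * asynchronously, otherwise a VIRTIO_BLK_S_* status for immediate completion. */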
199 static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
200 {
201     int status = VIRTIO_BLK_S_OK;
202     struct virtio_scsi_inhdr *scsi = NULL;
203     VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
204     VirtQueueElement *elem = &req->elem;
205     VirtIOBlock *blk = req->dev;
206 
207 #ifdef __linux__
208     int i;
209     VirtIOBlockIoctlReq *ioctl_req;
210     BlockAIOCB *acb;
211 #endif
212 
213     /*
214      * We require at least one output segment each for the virtio_blk_outhdr
215      * and the SCSI command block.
216      *
217      * We also require at least the virtio_blk_inhdr, the virtio_scsi_inhdr
218      * and the sense buffer pointer in the input segments.
219      */
220     if (elem->out_num < 2 || elem->in_num < 3) {
221         status = VIRTIO_BLK_S_IOERR;
222         goto fail;
223     }
224 
225     /*
226      * The scsi inhdr is placed in the second-to-last input segment, just
227      * before the regular inhdr.
228      */
229     scsi = (void *)elem->in_sg[elem->in_num - 2].iov_base;
230 
231     if (!blk->conf.scsi) {
232         status = VIRTIO_BLK_S_UNSUPP;
233         goto fail;
234     }
235 
236     /*
237      * No support for bidirectional commands yet.
238      */
239     if (elem->out_num > 2 && elem->in_num > 3) {
240         status = VIRTIO_BLK_S_UNSUPP;
241         goto fail;
242     }
243 
244 #ifdef __linux__
245     ioctl_req = g_new0(VirtIOBlockIoctlReq, 1);
246     ioctl_req->req = req;
247     ioctl_req->hdr.interface_id = 'S';
248     ioctl_req->hdr.cmd_len = elem->out_sg[1].iov_len;
249     ioctl_req->hdr.cmdp = elem->out_sg[1].iov_base;
250     ioctl_req->hdr.dxfer_len = 0;
251 
252     if (elem->out_num > 2) {
253         /*
254          * If there are more than the minimally required 2 output segments
255          * there is a write payload starting from the third iovec.
256          */
257         ioctl_req->hdr.dxfer_direction = SG_DXFER_TO_DEV;
258         ioctl_req->hdr.iovec_count = elem->out_num - 2;
259 
260         for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
261             ioctl_req->hdr.dxfer_len += elem->out_sg[i + 2].iov_len;
262         }
263 
264         ioctl_req->hdr.dxferp = elem->out_sg + 2;
265 
266     } else if (elem->in_num > 3) {
267         /*
268          * If we have more than 3 input segments the guest wants to actually
269          * read data.
270          */
271         ioctl_req->hdr.dxfer_direction = SG_DXFER_FROM_DEV;
272         ioctl_req->hdr.iovec_count = elem->in_num - 3;
273         for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
274             ioctl_req->hdr.dxfer_len += elem->in_sg[i].iov_len;
275         }
276 
277         ioctl_req->hdr.dxferp = elem->in_sg;
278     } else {
279         /*
280          * Some SCSI commands don't actually transfer any data.
281          */
282         ioctl_req->hdr.dxfer_direction = SG_DXFER_NONE;
283     }
284 
285     ioctl_req->hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
286     ioctl_req->hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;
287 
288     acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->hdr,
289                         virtio_blk_ioctl_complete, ioctl_req);
290     if (!acb) {
291         g_free(ioctl_req);
292         status = VIRTIO_BLK_S_UNSUPP;
293         goto fail;
294     }
295     return -EINPROGRESS;
296 #else
297     abort();
298 #endif
299 
300 fail:
301     /* Just put anything nonzero so that the ioctl fails in the guest.  */
302     if (scsi) {
303         virtio_stl_p(vdev, &scsi->errors, 255);
304     }
305     return status;
306 }
307 
308 static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
309 {
310     int status;
311 
312     status = virtio_blk_handle_scsi_req(req);
313     if (status != -EINPROGRESS) {
314         virtio_blk_req_complete(req, status);
315         virtio_blk_free_request(req);
316     }
317 }
318 
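/* Issue a single read or write covering the num_reqs requests starting at
 * mrb->reqs[start].  If more than one request is merged, build a new qiov
 * that concatenates the individual iovecs and link the requests through
 * mr_next so that virtio_blk_rw_complete can complete them all. */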
319 static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb,
320                                    int start, int num_reqs, int niov)
321 {
322     QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
323     int64_t sector_num = mrb->reqs[start]->sector_num;
324     int nb_sectors = mrb->reqs[start]->qiov.size / BDRV_SECTOR_SIZE;
325     bool is_write = mrb->is_write;
326 
327     if (num_reqs > 1) {
328         int i;
329         struct iovec *tmp_iov = qiov->iov;
330         int tmp_niov = qiov->niov;
331 
332         /* mrb->reqs[start]->qiov was initialized from an external iovec, so we
333          * can't modify it here. We need to initialize it locally and then add the
334          * external iovecs. */
335         qemu_iovec_init(qiov, niov);
336 
337         for (i = 0; i < tmp_niov; i++) {
338             qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
339         }
340 
341         for (i = start + 1; i < start + num_reqs; i++) {
342             qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
343                               mrb->reqs[i]->qiov.size);
344             mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
345             nb_sectors += mrb->reqs[i]->qiov.size / BDRV_SECTOR_SIZE;
346         }
347         assert(nb_sectors == qiov->size / BDRV_SECTOR_SIZE);
348 
349         trace_virtio_blk_submit_multireq(mrb, start, num_reqs, sector_num,
350                                          nb_sectors, is_write);
351         block_acct_merge_done(blk_get_stats(blk),
352                               is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
353                               num_reqs - 1);
354     }
355 
356     if (is_write) {
357         blk_aio_writev(blk, sector_num, qiov, nb_sectors,
358                        virtio_blk_rw_complete, mrb->reqs[start]);
359     } else {
360         blk_aio_readv(blk, sector_num, qiov, nb_sectors,
361                       virtio_blk_rw_complete, mrb->reqs[start]);
362     }
363 }
364 
365 static int multireq_compare(const void *a, const void *b)
366 {
367     const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
368                          *req2 = *(VirtIOBlockReq **)b;
369 
370     /*
371      * Note that we can't simply subtract sector_num1 from sector_num2
372      * here as that could overflow the return value.
373      */
374     if (req1->sector_num > req2->sector_num) {
375         return 1;
376     } else if (req1->sector_num < req2->sector_num) {
377         return -1;
378     } else {
379         return 0;
380     }
381 }
382 
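/* Sort the queued requests by sector number and submit them, merging runs of
 * sequential requests as long as the combined request stays within the
 * backend's maximum iovec count and maximum transfer length. */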
383 void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
384 {
385     int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
386     int max_xfer_len = 0;
387     int64_t sector_num = 0;
388 
389     if (mrb->num_reqs == 1) {
390         submit_requests(blk, mrb, 0, 1, -1);
391         mrb->num_reqs = 0;
392         return;
393     }
394 
395     max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk);
396     max_xfer_len = MIN_NON_ZERO(max_xfer_len, BDRV_REQUEST_MAX_SECTORS);
397 
398     qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
399           &multireq_compare);
400 
401     for (i = 0; i < mrb->num_reqs; i++) {
402         VirtIOBlockReq *req = mrb->reqs[i];
403         if (num_reqs > 0) {
404             /*
405              * NOTE: We cannot merge the requests in the following situations:
406              * 1. requests are not sequential
407              * 2. merge would exceed maximum number of IOVs
408              * 3. merge would exceed maximum transfer length of backend device
409              */
410             if (sector_num + nb_sectors != req->sector_num ||
411                 niov > blk_get_max_iov(blk) - req->qiov.niov ||
412                 req->qiov.size / BDRV_SECTOR_SIZE > max_xfer_len ||
413                 nb_sectors > max_xfer_len - req->qiov.size / BDRV_SECTOR_SIZE) {
414                 submit_requests(blk, mrb, start, num_reqs, niov);
415                 num_reqs = 0;
416             }
417         }
418 
419         if (num_reqs == 0) {
420             sector_num = req->sector_num;
421             nb_sectors = niov = 0;
422             start = i;
423         }
424 
425         nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
426         niov += req->qiov.niov;
427         num_reqs++;
428     }
429 
430     submit_requests(blk, mrb, start, num_reqs, niov);
431     mrb->num_reqs = 0;
432 }
433 
434 static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
435 {
436     block_acct_start(blk_get_stats(req->dev->blk), &req->acct, 0,
437                      BLOCK_ACCT_FLUSH);
438 
439     /*
440      * Make sure all outstanding writes are posted to the backing device.
441      */
442     if (mrb->is_write && mrb->num_reqs > 0) {
443         virtio_blk_submit_multireq(req->dev->blk, mrb);
444     }
445     blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
446 }
447 
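/* Sanity-check a guest-supplied sector range: reject oversized transfers,
 * requests that are not aligned to the logical block size, and accesses
 * beyond the end of the device. */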
448 static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
449                                      uint64_t sector, size_t size)
450 {
451     uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
452     uint64_t total_sectors;
453 
454     if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
455         return false;
456     }
457     if (sector & dev->sector_mask) {
458         return false;
459     }
460     if (size % dev->conf.conf.logical_block_size) {
461         return false;
462     }
463     blk_get_geometry(dev->blk, &total_sectors);
464     if (sector > total_sectors || nb_sectors > total_sectors - sector) {
465         return false;
466     }
467     return true;
468 }
469 
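/* Parse and dispatch one request popped from the virtqueue.  A request
 * consists of a virtio_blk_outhdr in the output segments, the data payload,
 * and a one-byte virtio_blk_inhdr status at the end of the input segments.
 * Reads and writes are queued in mrb for possible merging; other request
 * types are handled immediately. */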
470 void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
471 {
472     uint32_t type;
473     struct iovec *in_iov = req->elem.in_sg;
474     struct iovec *iov = req->elem.out_sg;
475     unsigned in_num = req->elem.in_num;
476     unsigned out_num = req->elem.out_num;
477 
478     if (req->elem.out_num < 1 || req->elem.in_num < 1) {
479         error_report("virtio-blk missing headers");
480         exit(1);
481     }
482 
483     if (unlikely(iov_to_buf(iov, out_num, 0, &req->out,
484                             sizeof(req->out)) != sizeof(req->out))) {
485         error_report("virtio-blk request outhdr too short");
486         exit(1);
487     }
488 
489     iov_discard_front(&iov, &out_num, sizeof(req->out));
490 
491     if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
492         error_report("virtio-blk request inhdr too short");
493         exit(1);
494     }
495 
496     /* We always touch the last byte, so just see how big in_iov is.  */
497     req->in_len = iov_size(in_iov, in_num);
498     req->in = (void *)in_iov[in_num - 1].iov_base
499               + in_iov[in_num - 1].iov_len
500               - sizeof(struct virtio_blk_inhdr);
501     iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
502 
503     type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
504 
505     /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER
506      * is an optional flag. Although a guest should not send this flag if it
507      * was not negotiated, we have ignored it in the past; keep ignoring it. */
508     switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
509     case VIRTIO_BLK_T_IN:
510     {
511         bool is_write = type & VIRTIO_BLK_T_OUT;
512         req->sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev),
513                                        &req->out.sector);
514 
515         if (is_write) {
516             qemu_iovec_init_external(&req->qiov, iov, out_num);
517             trace_virtio_blk_handle_write(req, req->sector_num,
518                                           req->qiov.size / BDRV_SECTOR_SIZE);
519         } else {
520             qemu_iovec_init_external(&req->qiov, in_iov, in_num);
521             trace_virtio_blk_handle_read(req, req->sector_num,
522                                          req->qiov.size / BDRV_SECTOR_SIZE);
523         }
524 
525         if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
526                                       req->qiov.size)) {
527             virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
528             block_acct_invalid(blk_get_stats(req->dev->blk),
529                                is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
530             virtio_blk_free_request(req);
531             return;
532         }
533 
534         block_acct_start(blk_get_stats(req->dev->blk),
535                          &req->acct, req->qiov.size,
536                          is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
537 
538         /* submit pending requests if the merge would exceed the maximum
539          * number of requests, the IO direction changes, or merging is disabled */
540         if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
541                                   is_write != mrb->is_write ||
542                                   !req->dev->conf.request_merging)) {
543             virtio_blk_submit_multireq(req->dev->blk, mrb);
544         }
545 
546         assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
547         mrb->reqs[mrb->num_reqs++] = req;
548         mrb->is_write = is_write;
549         break;
550     }
551     case VIRTIO_BLK_T_FLUSH:
552         virtio_blk_handle_flush(req, mrb);
553         break;
554     case VIRTIO_BLK_T_SCSI_CMD:
555         virtio_blk_handle_scsi(req);
556         break;
557     case VIRTIO_BLK_T_GET_ID:
558     {
559         VirtIOBlock *s = req->dev;
560 
561         /*
562          * NB: per existing s/n string convention the string is
563          * terminated by '\0' only when shorter than the buffer.
564          */
565         const char *serial = s->conf.serial ? s->conf.serial : "";
566         size_t size = MIN(strlen(serial) + 1,
567                           MIN(iov_size(in_iov, in_num),
568                               VIRTIO_BLK_ID_BYTES));
569         iov_from_buf(in_iov, in_num, 0, serial, size);
570         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
571         virtio_blk_free_request(req);
572         break;
573     }
574     default:
575         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
576         virtio_blk_free_request(req);
577     }
578 }
579 
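/* Virtqueue handler for the non-dataplane path: drain the queue, batching
 * reads and writes into mrb, and submit whatever is left at the end. */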
580 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
581 {
582     VirtIOBlock *s = VIRTIO_BLK(vdev);
583     VirtIOBlockReq *req;
584     MultiReqBuffer mrb = {};
585 
586     /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
587      * dataplane here instead of waiting for .set_status().
588      */
589     if (s->dataplane && !s->dataplane_started) {
590         virtio_blk_data_plane_start(s->dataplane);
591         return;
592     }
593 
594     blk_io_plug(s->blk);
595 
596     while ((req = virtio_blk_get_request(s))) {
597         virtio_blk_handle_request(req, &mrb);
598     }
599 
600     if (mrb.num_reqs) {
601         virtio_blk_submit_multireq(s->blk, &mrb);
602     }
603 
604     blk_io_unplug(s->blk);
605 }
606 
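/* Bottom half scheduled on VM resume: take the requests parked on s->rq
 * (by the rerror/werror=stop policy or restored by virtio_blk_load_device)
 * and resubmit them. */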
607 static void virtio_blk_dma_restart_bh(void *opaque)
608 {
609     VirtIOBlock *s = opaque;
610     VirtIOBlockReq *req = s->rq;
611     MultiReqBuffer mrb = {};
612 
613     qemu_bh_delete(s->bh);
614     s->bh = NULL;
615 
616     s->rq = NULL;
617 
618     while (req) {
619         VirtIOBlockReq *next = req->next;
620         virtio_blk_handle_request(req, &mrb);
621         req = next;
622     }
623 
624     if (mrb.num_reqs) {
625         virtio_blk_submit_multireq(s->blk, &mrb);
626     }
627 }
628 
629 static void virtio_blk_dma_restart_cb(void *opaque, int running,
630                                       RunState state)
631 {
632     VirtIOBlock *s = opaque;
633 
634     if (!running) {
635         return;
636     }
637 
638     if (!s->bh) {
639         s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk),
640                            virtio_blk_dma_restart_bh, s);
641         qemu_bh_schedule(s->bh);
642     }
643 }
644 
645 static void virtio_blk_reset(VirtIODevice *vdev)
646 {
647     VirtIOBlock *s = VIRTIO_BLK(vdev);
648     AioContext *ctx;
649 
650     /*
651      * This should cancel pending requests, but can't do so cleanly until there
652      * are per-device request lists.
653      */
654     ctx = blk_get_aio_context(s->blk);
655     aio_context_acquire(ctx);
656     blk_drain(s->blk);
657 
658     if (s->dataplane) {
659         virtio_blk_data_plane_stop(s->dataplane);
660     }
661     aio_context_release(ctx);
662 
663     blk_set_enable_write_cache(s->blk, s->original_wce);
664 }
665 
666 /* Coalesce internal state, copy to PCI I/O region 0.
667  */
668 static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
669 {
670     VirtIOBlock *s = VIRTIO_BLK(vdev);
671     BlockConf *conf = &s->conf.conf;
672     struct virtio_blk_config blkcfg;
673     uint64_t capacity;
674     int blk_size = conf->logical_block_size;
675 
676     blk_get_geometry(s->blk, &capacity);
677     memset(&blkcfg, 0, sizeof(blkcfg));
678     virtio_stq_p(vdev, &blkcfg.capacity, capacity);
679     virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
680     virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
681     virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
682     virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
683     virtio_stw_p(vdev, &blkcfg.opt_io_size, conf->opt_io_size / blk_size);
684     blkcfg.geometry.heads = conf->heads;
685     /*
686      * We must ensure that the block device capacity is a multiple of
687      * the logical block size. If that is not the case, let's use
688      * sector_mask to adapt the geometry to have a correct picture.
689      * For those devices where the capacity is ok for the given geometry
690      * we don't touch the sector value of the geometry, since some devices
691      * (like s390 dasd) need a specific value. Here the capacity is already
692      * cyls*heads*secs*blk_size and the sector value is not block size
693      * divided by 512 - instead it is the amount of blk_size blocks
694      * per track (cylinder).
695      */
696     if (blk_getlength(s->blk) /  conf->heads / conf->secs % blk_size) {
697         blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
698     } else {
699         blkcfg.geometry.sectors = conf->secs;
700     }
701     blkcfg.size_max = 0;
702     blkcfg.physical_block_exp = get_physical_block_exp(conf);
703     blkcfg.alignment_offset = 0;
704     blkcfg.wce = blk_enable_write_cache(s->blk);
705     memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
706 }
707 
708 static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
709 {
710     VirtIOBlock *s = VIRTIO_BLK(vdev);
711     struct virtio_blk_config blkcfg;
712 
713     memcpy(&blkcfg, config, sizeof(blkcfg));
714 
715     aio_context_acquire(blk_get_aio_context(s->blk));
716     blk_set_enable_write_cache(s->blk, blkcfg.wce != 0);
717     aio_context_release(blk_get_aio_context(s->blk));
718 }
719 
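/* Advertise the device's feature bits.  VIRTIO_BLK_F_SCSI is only offered to
 * legacy (pre-1.0) guests; with VIRTIO_F_VERSION_1 the scsi=on configuration
 * is rejected. */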
720 static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
721                                         Error **errp)
722 {
723     VirtIOBlock *s = VIRTIO_BLK(vdev);
724 
725     virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
726     virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
727     virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
728     virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
729     if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
730         if (s->conf.scsi) {
731             error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
732             return 0;
733         }
734     } else {
735         virtio_clear_feature(&features, VIRTIO_F_ANY_LAYOUT);
736         virtio_add_feature(&features, VIRTIO_BLK_F_SCSI);
737     }
738 
739     if (s->conf.config_wce) {
740         virtio_add_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
741     }
742     if (blk_enable_write_cache(s->blk)) {
743         virtio_add_feature(&features, VIRTIO_BLK_F_WCE);
744     }
745     if (blk_is_read_only(s->blk)) {
746         virtio_add_feature(&features, VIRTIO_BLK_F_RO);
747     }
748 
749     return features;
750 }
751 
752 static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
753 {
754     VirtIOBlock *s = VIRTIO_BLK(vdev);
755 
756     if (s->dataplane && !(status & (VIRTIO_CONFIG_S_DRIVER |
757                                     VIRTIO_CONFIG_S_DRIVER_OK))) {
758         virtio_blk_data_plane_stop(s->dataplane);
759     }
760 
761     if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
762         return;
763     }
764 
765     /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
766      * cache flushes.  Thus, the "auto writethrough" behavior is never
767      * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature.
768      * Leaving it enabled would break the following sequence:
769      *
770      *     Guest started with "-drive cache=writethrough"
771      *     Guest sets status to 0
772      *     Guest sets DRIVER bit in status field
773      *     Guest reads host features (WCE=0, CONFIG_WCE=1)
774      *     Guest writes guest features (WCE=0, CONFIG_WCE=1)
775      *     Guest writes 1 to the WCE configuration field (writeback mode)
776      *     Guest sets DRIVER_OK bit in status field
777      *
778      * s->blk would erroneously be placed in writethrough mode.
779      */
780     if (!virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
781         aio_context_acquire(blk_get_aio_context(s->blk));
782         blk_set_enable_write_cache(s->blk,
783                                    virtio_vdev_has_feature(vdev,
784                                                            VIRTIO_BLK_F_WCE));
785         aio_context_release(blk_get_aio_context(s->blk));
786     }
787 }
788 
789 static void virtio_blk_save(QEMUFile *f, void *opaque)
790 {
791     VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
792     VirtIOBlock *s = VIRTIO_BLK(vdev);
793 
794     if (s->dataplane) {
795         virtio_blk_data_plane_stop(s->dataplane);
796     }
797 
798     virtio_save(vdev, f);
799 }
800 
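/* Serialize the list of requests waiting to be restarted; each entry is
 * preceded by a one-byte marker, and a trailing zero terminates the list. */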
801 static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
802 {
803     VirtIOBlock *s = VIRTIO_BLK(vdev);
804     VirtIOBlockReq *req = s->rq;
805 
806     while (req) {
807         qemu_put_sbyte(f, 1);
808         qemu_put_virtqueue_element(f, &req->elem);
809         req = req->next;
810     }
811     qemu_put_sbyte(f, 0);
812 }
813 
814 static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id)
815 {
816     VirtIOBlock *s = opaque;
817     VirtIODevice *vdev = VIRTIO_DEVICE(s);
818 
819     if (version_id != 2)
820         return -EINVAL;
821 
822     return virtio_load(vdev, f, version_id);
823 }
824 
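/* Counterpart of virtio_blk_save_device: rebuild s->rq from the saved
 * virtqueue elements so that the DMA restart callback can resubmit them. */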
825 static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
826                                   int version_id)
827 {
828     VirtIOBlock *s = VIRTIO_BLK(vdev);
829 
830     while (qemu_get_sbyte(f)) {
831         VirtIOBlockReq *req;
832         req = qemu_get_virtqueue_element(f, sizeof(VirtIOBlockReq));
833         virtio_blk_init_request(s, req);
834         req->next = s->rq;
835         s->rq = req;
836     }
837 
838     return 0;
839 }
840 
841 static void virtio_blk_resize(void *opaque)
842 {
843     VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
844 
845     virtio_notify_config(vdev);
846 }
847 
848 static const BlockDevOps virtio_block_ops = {
849     .resize_cb = virtio_blk_resize,
850 };
851 
852 static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
853 {
854     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
855     VirtIOBlock *s = VIRTIO_BLK(dev);
856     VirtIOBlkConf *conf = &s->conf;
857     Error *err = NULL;
858     static int virtio_blk_id;
859 
860     if (!conf->conf.blk) {
861         error_setg(errp, "drive property not set");
862         return;
863     }
864     if (!blk_is_inserted(conf->conf.blk)) {
865         error_setg(errp, "Device needs media, but drive is empty");
866         return;
867     }
868 
869     blkconf_serial(&conf->conf, &conf->serial);
870     s->original_wce = blk_enable_write_cache(conf->conf.blk);
871     blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, &err);
872     if (err) {
873         error_propagate(errp, err);
874         return;
875     }
876     blkconf_blocksizes(&conf->conf);
877 
878     virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
879                 sizeof(struct virtio_blk_config));
880 
881     s->blk = conf->conf.blk;
882     s->rq = NULL;
883     s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
884 
885     s->vq = virtio_add_queue(vdev, 128, virtio_blk_handle_output);
886     virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
887     if (err != NULL) {
888         error_propagate(errp, err);
889         virtio_cleanup(vdev);
890         return;
891     }
892 
893     s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
894     register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
895                     virtio_blk_save, virtio_blk_load, s);
896     blk_set_dev_ops(s->blk, &virtio_block_ops, s);
897     blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);
898 
899     blk_iostatus_enable(s->blk);
900 }
901 
902 static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp)
903 {
904     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
905     VirtIOBlock *s = VIRTIO_BLK(dev);
906 
907     virtio_blk_data_plane_destroy(s->dataplane);
908     s->dataplane = NULL;
909     qemu_del_vm_change_state_handler(s->change);
910     unregister_savevm(dev, "virtio-blk", s);
911     blockdev_mark_auto_del(s->blk);
912     virtio_cleanup(vdev);
913 }
914 
915 static void virtio_blk_instance_init(Object *obj)
916 {
917     VirtIOBlock *s = VIRTIO_BLK(obj);
918 
919     object_property_add_link(obj, "iothread", TYPE_IOTHREAD,
920                              (Object **)&s->conf.iothread,
921                              qdev_prop_allow_set_link_before_realize,
922                              OBJ_PROP_LINK_UNREF_ON_RELEASE, NULL);
923     device_add_bootindex_property(obj, &s->conf.conf.bootindex,
924                                   "bootindex", "/disk@0,0",
925                                   DEVICE(obj), NULL);
926 }
927 
928 static Property virtio_blk_properties[] = {
929     DEFINE_BLOCK_PROPERTIES(VirtIOBlock, conf.conf),
930     DEFINE_BLOCK_CHS_PROPERTIES(VirtIOBlock, conf.conf),
931     DEFINE_PROP_STRING("serial", VirtIOBlock, conf.serial),
932     DEFINE_PROP_BIT("config-wce", VirtIOBlock, conf.config_wce, 0, true),
933 #ifdef __linux__
934     DEFINE_PROP_BIT("scsi", VirtIOBlock, conf.scsi, 0, false),
935 #endif
936     DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
937                     true),
938     DEFINE_PROP_END_OF_LIST(),
939 };
940 
941 static void virtio_blk_class_init(ObjectClass *klass, void *data)
942 {
943     DeviceClass *dc = DEVICE_CLASS(klass);
944     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
945 
946     dc->props = virtio_blk_properties;
947     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
948     vdc->realize = virtio_blk_device_realize;
949     vdc->unrealize = virtio_blk_device_unrealize;
950     vdc->get_config = virtio_blk_update_config;
951     vdc->set_config = virtio_blk_set_config;
952     vdc->get_features = virtio_blk_get_features;
953     vdc->set_status = virtio_blk_set_status;
954     vdc->reset = virtio_blk_reset;
955     vdc->save = virtio_blk_save_device;
956     vdc->load = virtio_blk_load_device;
957 }
958 
959 static const TypeInfo virtio_device_info = {
960     .name = TYPE_VIRTIO_BLK,
961     .parent = TYPE_VIRTIO_DEVICE,
962     .instance_size = sizeof(VirtIOBlock),
963     .instance_init = virtio_blk_instance_init,
964     .class_init = virtio_blk_class_init,
965 };
966 
967 static void virtio_register_types(void)
968 {
969     type_register_static(&virtio_device_info);
970 }
971 
972 type_init(virtio_register_types)
973