1 /*
2  * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License version
6  * 2 as published by the Free Software Foundation.
7  *
8  * You should have received a copy of the GNU General Public License
9  * along with this program. If not, see <http://www.gnu.org/licenses/>.
10  *
11  * Contributions after 2012-01-13 are licensed under the terms of the
12  * GNU GPL, version 2 or (at your option) any later version.
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qemu-common.h"
17 #include "qapi/error.h"
18 #include "qapi/qapi-visit-sockets.h"
19 #include "qapi/qapi-visit-block-core.h"
20 #include "qapi/qmp/qdict.h"
21 #include "qapi/qobject-input-visitor.h"
22 #include "qapi/qobject-output-visitor.h"
23 #include "qemu/uri.h"
24 #include "qemu/error-report.h"
25 #include "qemu/main-loop.h"
26 #include "qemu/module.h"
27 #include "qemu/option.h"
28 #include "qemu/sockets.h"
29 #include "block/block_int.h"
30 #include "block/qdict.h"
31 #include "sysemu/block-backend.h"
32 #include "qemu/bitops.h"
33 #include "qemu/cutils.h"
34 #include "trace.h"
35 
/* Protocol version constant. */
#define SD_PROTO_VER 0x01

/* Default server address and port. */
#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT 7000

/* Operation codes acting on objects. */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ       0x02
#define SD_OP_WRITE_OBJ      0x03
/* 0x04 is used internally by Sheepdog */

/* Operation codes acting on VDIs and on the cluster. */
#define SD_OP_NEW_VDI        0x11
#define SD_OP_LOCK_VDI       0x12
#define SD_OP_RELEASE_VDI    0x13
#define SD_OP_GET_VDI_INFO   0x14
#define SD_OP_READ_VDIS      0x15
#define SD_OP_FLUSH_VDI      0x16
#define SD_OP_DEL_VDI        0x17
#define SD_OP_GET_CLUSTER_DEFAULT   0x18

/* Command flag bits; each value is a distinct single bit. */
#define SD_FLAG_CMD_WRITE    0x01
#define SD_FLAG_CMD_COW      0x02
#define SD_FLAG_CMD_CACHE    0x04 /* Writeback mode for cache */
#define SD_FLAG_CMD_DIRECT   0x08 /* Don't use cache */
59 
/*
 * Result codes carried in the `result` field of a response.
 * sd_strerror() translates them into human-readable messages.
 */
#define SD_RES_SUCCESS       0x00 /* Success */
#define SD_RES_UNKNOWN       0x01 /* Unknown error */
#define SD_RES_NO_OBJ        0x02 /* No object found */
#define SD_RES_EIO           0x03 /* I/O error */
#define SD_RES_VDI_EXIST     0x04 /* VDI already exists */
#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR  0x06 /* System error */
#define SD_RES_VDI_LOCKED    0x07 /* VDI is locked */
#define SD_RES_NO_VDI        0x08 /* No VDI found */
#define SD_RES_NO_BASE_VDI   0x09 /* No base VDI found */
#define SD_RES_VDI_READ      0x0A /* Cannot read requested VDI */
#define SD_RES_VDI_WRITE     0x0B /* Cannot write requested VDI */
#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base VDI */
#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base VDI */
#define SD_RES_NO_TAG        0x0E /* Requested tag is not found */
#define SD_RES_STARTUP       0x0F /* Sheepdog is starting up */
#define SD_RES_VDI_NOT_LOCKED   0x10 /* VDI is not locked */
#define SD_RES_SHUTDOWN      0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM        0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI      0x13 /* The maximum number of VDIs is reached */
#define SD_RES_VER_MISMATCH  0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE      0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes to join */
#define SD_RES_JOIN_FAILED   0x18 /* Target node failed to join sheepdog */
#define SD_RES_HALT          0x19 /* Sheepdog has stopped serving I/O requests */
#define SD_RES_READONLY      0x1A /* Object is read-only */
87 
/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

#define VDI_SPACE_SHIFT   32
/* Object type bits: bit 63 marks a vdi object, bit 62 a vmstate object. */
#define VDI_BIT (UINT64_C(1) << 63)
#define VMSTATE_BIT (UINT64_C(1) << 62)
/* 2^20 data objects per vdi, matching the 20-bit data object space above. */
#define MAX_DATA_OBJS (UINT64_C(1) << 20)
#define MAX_CHILDREN 1024
#define SD_MAX_VDI_LEN 256
#define SD_MAX_VDI_TAG_LEN 256
/* 2^24 vdis, matching the 24-bit vdi object space above. */
#define SD_NR_VDIS   (1U << 24)
/* 2^22 bytes (4 MiB) per data object; 2^22 * 2^20 = 2^42 bytes (4 TiB). */
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
/*
 * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
 * (SD_EC_MAX_STRIP - 1) for parity strips
 *
 * SD_MAX_COPIES is sum of number of data strips and parity strips.
 */
#define SD_EC_MAX_STRIP 16
#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)

#define SD_INODE_SIZE (sizeof(SheepdogInode))
#define CURRENT_VDI_ID 0

#define LOCK_TYPE_NORMAL 0
#define LOCK_TYPE_SHARED 1      /* for iSCSI multipath */
123 
/*
 * Generic request header.
 *
 * The first six fields (proto_ver .. data_length, 16 bytes) are repeated at
 * the start of every request/response struct in this file; the remaining
 * 32 bytes are opcode-specific.  The fields add up to 48 bytes.
 * do_co_req() also receives the response header into the same buffer.
 */
typedef struct SheepdogReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t opcode_specific[8];
} SheepdogReq;
133 
/*
 * Generic response header: the common 16-byte prefix, a result code
 * (one of the SD_RES_* values) and 28 opcode-specific bytes.
 * The fields add up to 48 bytes, the same as SheepdogReq.
 */
typedef struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint32_t opcode_specific[7];
} SheepdogRsp;
144 
/*
 * Object request: the common 16-byte prefix followed by the object id,
 * a second object id (cow_oid), copy settings and a byte offset.
 * The fields add up to 48 bytes.
 */
typedef struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t oid;
    uint64_t cow_oid;
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t reserved[6];
    uint64_t offset;
} SheepdogObjReq;
159 
/*
 * Object response, as read by aio_read_response().
 *
 * `id` is matched against AIOReq.id to find the request being answered,
 * `result` is an SD_RES_* code and `data_length` is the number of payload
 * bytes that follow for reads.  The fields add up to 48 bytes.
 */
typedef struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t reserved[2];
    uint32_t pad[6];
} SheepdogObjRsp;
173 
/*
 * VDI request: the common 16-byte prefix followed by VDI parameters
 * (size, base vdi id, copy/store policy, block size shift, snapshot id,
 * type).  The fields add up to 48 bytes.
 */
typedef struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;
    uint32_t base_vdi_id;
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t store_policy;
    uint8_t block_size_shift;
    uint32_t snapid;
    uint32_t type;
    uint32_t pad[2];
} SheepdogVdiReq;
191 
/*
 * VDI response: the common 16-byte prefix, a result code, a reserved word
 * and the vdi id.  The fields add up to 48 bytes.
 */
typedef struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint32_t rsvd;
    uint32_t vdi_id;
    uint32_t pad[5];
} SheepdogVdiRsp;
204 
/*
 * Cluster response: the common 16-byte prefix, a result code and the
 * cluster's copy count, copy policy and block size shift.
 * The fields add up to 48 bytes.
 */
typedef struct SheepdogClusterRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint8_t nr_copies;
    uint8_t copy_policy;
    uint8_t block_size_shift;
    uint8_t __pad1;
    uint32_t __pad2[6];
} SheepdogClusterRsp;
219 
/*
 * Inode (metadata) of a vdi.
 *
 * Fields used by the helpers in this file:
 *  - snap_ctime: non-zero means the inode is a snapshot (is_snapshot()).
 *  - vdi_size / block_size_shift: count_data_objs() divides the former by
 *    (1 << block_size_shift), rounding up.
 *  - vdi_id / data_vdi_id[]: data object idx is writable when
 *    data_vdi_id[idx] equals vdi_id (is_data_obj_writable()).
 */
typedef struct SheepdogInode {
    char name[SD_MAX_VDI_LEN];
    char tag[SD_MAX_VDI_TAG_LEN];
    uint64_t ctime;
    uint64_t snap_ctime;
    uint64_t vm_clock_nsec;
    uint64_t vdi_size;
    uint64_t vm_state_size;
    uint16_t copy_policy;
    uint8_t nr_copies;
    uint8_t block_size_shift;
    uint32_t snap_id;
    uint32_t vdi_id;
    uint32_t parent_vdi_id;
    uint32_t child_vdi_id[MAX_CHILDREN];
    uint32_t data_vdi_id[MAX_DATA_OBJS];
} SheepdogInode;

/* Size of the inode up to, but not including, the data_vdi_id[] table. */
#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id)
239 
/*
 * 64 bit FNV-1a non-zero initial basis
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash.
 *
 * Folds the @len bytes at @buf into the running hash @hval and returns the
 * updated hash.  Pass FNV1A_64_INIT as @hval for the first buffer.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        hval ^= (uint64_t)bytes[i];
        /*
         * Multiply by the 64 bit FNV prime 2^40 + 2^8 + 0xb3; this is the
         * shift-and-add sequence (1, 4, 5, 7, 8, 40) written as one product.
         */
        hval *= UINT64_C(0x100000001b3);
    }
    return hval;
}
259 
/*
 * Tell whether data object @idx may be written in place: true when the vdi
 * id recorded for that object equals the inode's own vdi id.
 */
static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
{
    uint32_t owner_vid = inode->data_vdi_id[idx];

    return owner_vid == inode->vdi_id;
}
264 
is_data_obj(uint64_t oid)265 static inline bool is_data_obj(uint64_t oid)
266 {
267     return !(VDI_BIT & oid);
268 }
269 
data_oid_to_idx(uint64_t oid)270 static inline uint64_t data_oid_to_idx(uint64_t oid)
271 {
272     return oid & (MAX_DATA_OBJS - 1);
273 }
274 
oid_to_vid(uint64_t oid)275 static inline uint32_t oid_to_vid(uint64_t oid)
276 {
277     return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
278 }
279 
vid_to_vdi_oid(uint32_t vid)280 static inline uint64_t vid_to_vdi_oid(uint32_t vid)
281 {
282     return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
283 }
284 
vid_to_vmstate_oid(uint32_t vid,uint32_t idx)285 static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
286 {
287     return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
288 }
289 
vid_to_data_oid(uint32_t vid,uint32_t idx)290 static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
291 {
292     return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
293 }
294 
is_snapshot(struct SheepdogInode * inode)295 static inline bool is_snapshot(struct SheepdogInode *inode)
296 {
297     return !!inode->snap_ctime;
298 }
299 
count_data_objs(const struct SheepdogInode * inode)300 static inline size_t count_data_objs(const struct SheepdogInode *inode)
301 {
302     return DIV_ROUND_UP(inode->vdi_size,
303                         (1UL << inode->block_size_shift));
304 }
305 
306 typedef struct SheepdogAIOCB SheepdogAIOCB;
307 typedef struct BDRVSheepdogState BDRVSheepdogState;
308 
/*
 * One request sent to the server on behalf of a SheepdogAIOCB.
 * Allocated and filled in by alloc_aio_req().
 */
typedef struct AIOReq {
    SheepdogAIOCB *aiocb;       /* owning AIOCB; its nr_pending counts us */
    unsigned int iov_offset;    /* offset into aiocb->qiov for read data */

    uint64_t oid;
    uint64_t base_oid;
    uint64_t offset;
    unsigned int data_len;
    uint8_t flags;
    uint32_t id;                /* taken from s->aioreq_seq_num; matched
                                 * against the response's id */
    bool create;                /* on a successful write, makes
                                 * aio_read_response() record the object in
                                 * the inode and the dirty range */

    /* Links the request into inflight_aio_head or failed_aio_head. */
    QLIST_ENTRY(AIOReq) aio_siblings;
} AIOReq;
323 
/*
 * Kind of operation an AIOCB performs.  aio_read_response() switches on
 * this value to decide how to process a response.
 */
enum AIOCBState {
    AIOCB_WRITE_UDATA,
    AIOCB_READ_UDATA,
    AIOCB_FLUSH_CACHE,
    AIOCB_DISCARD_OBJ,
};
330 
/*
 * True when the closed object-index ranges
 * [min_affect_data_idx, max_affect_data_idx] of @x and @y intersect.
 *
 * @x and @y are pointers to structures with those two members.  The
 * arguments are parenthesized so that any pointer expression (e.g. `&acb`)
 * can be passed, not just a plain identifier.
 */
#define AIOCBOverlapping(x, y)                                     \
    (!((x)->max_affect_data_idx < (y)->min_affect_data_idx          \
       || (y)->max_affect_data_idx < (x)->min_affect_data_idx))
334 
/*
 * State of one block-layer request.  Initialized by sd_aio_setup().
 */
struct SheepdogAIOCB {
    BDRVSheepdogState *s;

    QEMUIOVector *qiov;

    int64_t sector_num;
    int nb_sectors;

    int ret;                    /* 0, or -EIO once any AIOReq has failed */
    enum AIOCBState aiocb_type;

    Coroutine *coroutine;       /* woken when nr_pending drops to zero */
    int nr_pending;             /* AIOReqs allocated and not yet finished */

    /*
     * Range of object indexes touched by the request; compared by
     * AIOCBOverlapping() to serialize overlapping requests.
     */
    uint32_t min_affect_data_idx;
    uint32_t max_affect_data_idx;

    /*
     * The difference between affect_data_idx and dirty_data_idx:
     * affect_data_idx represents range of index of all request types.
     * dirty_data_idx represents range of index updated by COW requests.
     * dirty_data_idx is used for updating an inode object.
     */
    uint32_t min_dirty_data_idx;
    uint32_t max_dirty_data_idx;

    /* Links the AIOCB into BDRVSheepdogState.inflight_aiocb_head. */
    QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
};
363 
/*
 * Per-device driver state.
 */
struct BDRVSheepdogState {
    BlockDriverState *bs;
    AioContext *aio_context;    /* context the fd handlers are registered in */

    SheepdogInode inode;        /* in-memory copy of the vdi's inode */

    char name[SD_MAX_VDI_LEN];
    bool is_snapshot;
    uint32_t cache_flags;       /* forced to SD_FLAG_CMD_DIRECT when a flush
                                 * is answered with SD_RES_INVALID_PARMS */
    bool discard_supported;     /* cleared when a discard is answered with
                                 * SD_RES_INVALID_PARMS */

    SocketAddress *addr;        /* server address used by connect_to_sdog() */
    int fd;                     /* I/O socket from get_sheep_fd(); -1 while
                                 * reconnecting */

    CoMutex lock;
    Coroutine *co_send;         /* coroutine woken by co_write_request() */
    Coroutine *co_recv;         /* coroutine running aio_read_response();
                                 * NULL when no response is being read */

    uint32_t aioreq_seq_num;    /* source of AIOReq.id values */

    /* Every aio request must be linked to either of these queues. */
    QLIST_HEAD(, AIOReq) inflight_aio_head;
    QLIST_HEAD(, AIOReq) failed_aio_head;

    /* Held while the request lists in this struct are modified. */
    CoMutex queue_lock;
    /* Requests waiting for an overlapping in-flight AIOCB to finish. */
    CoQueue overlapping_queue;
    QLIST_HEAD(, SheepdogAIOCB) inflight_aiocb_head;
};
392 
/*
 * State kept for a reopen; holds a socket descriptor and cache flags.
 * NOTE(review): presumably the values that replace BDRVSheepdogState.fd and
 * .cache_flags when the reopen is committed -- confirm against the reopen
 * callbacks.
 */
typedef struct BDRVSheepdogReopenState {
    int fd;
    int cache_flags;
} BDRVSheepdogReopenState;
397 
sd_strerror(int err)398 static const char *sd_strerror(int err)
399 {
400     int i;
401 
402     static const struct {
403         int err;
404         const char *desc;
405     } errors[] = {
406         {SD_RES_SUCCESS, "Success"},
407         {SD_RES_UNKNOWN, "Unknown error"},
408         {SD_RES_NO_OBJ, "No object found"},
409         {SD_RES_EIO, "I/O error"},
410         {SD_RES_VDI_EXIST, "VDI exists already"},
411         {SD_RES_INVALID_PARMS, "Invalid parameters"},
412         {SD_RES_SYSTEM_ERROR, "System error"},
413         {SD_RES_VDI_LOCKED, "VDI is already locked"},
414         {SD_RES_NO_VDI, "No vdi found"},
415         {SD_RES_NO_BASE_VDI, "No base VDI found"},
416         {SD_RES_VDI_READ, "Failed read the requested VDI"},
417         {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
418         {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
419         {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
420         {SD_RES_NO_TAG, "Failed to find the requested tag"},
421         {SD_RES_STARTUP, "The system is still booting"},
422         {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
423         {SD_RES_SHUTDOWN, "The system is shutting down"},
424         {SD_RES_NO_MEM, "Out of memory on the server"},
425         {SD_RES_FULL_VDI, "We already have the maximum vdis"},
426         {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
427         {SD_RES_NO_SPACE, "Server has no space for new objects"},
428         {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
429         {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
430         {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
431         {SD_RES_HALT, "Sheepdog is stopped serving IO request"},
432         {SD_RES_READONLY, "Object is read-only"},
433     };
434 
435     for (i = 0; i < ARRAY_SIZE(errors); ++i) {
436         if (errors[i].err == err) {
437             return errors[i].desc;
438         }
439     }
440 
441     return "Invalid error code";
442 }
443 
/*
 * Sheepdog I/O handling:
 *
 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
 *    link the requests to the inflight_aio_head list in the
 *    BDRVSheepdogState.  The function yields while waiting for
 *    the response.
 *
 * 2. We receive the response in aio_read_response, which runs in a
 *    coroutine entered from the fd handler of the sheepdog connection.
 *    We switch back to sd_co_readv/sd_co_writev after all the requests
 *    belonging to the AIOCB are finished.  If needed, sd_co_writev will
 *    send further requests for the vdi object.
 */
457 
alloc_aio_req(BDRVSheepdogState * s,SheepdogAIOCB * acb,uint64_t oid,unsigned int data_len,uint64_t offset,uint8_t flags,bool create,uint64_t base_oid,unsigned int iov_offset)458 static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
459                                     uint64_t oid, unsigned int data_len,
460                                     uint64_t offset, uint8_t flags, bool create,
461                                     uint64_t base_oid, unsigned int iov_offset)
462 {
463     AIOReq *aio_req;
464 
465     aio_req = g_malloc(sizeof(*aio_req));
466     aio_req->aiocb = acb;
467     aio_req->iov_offset = iov_offset;
468     aio_req->oid = oid;
469     aio_req->base_oid = base_oid;
470     aio_req->offset = offset;
471     aio_req->data_len = data_len;
472     aio_req->flags = flags;
473     aio_req->id = s->aioreq_seq_num++;
474     aio_req->create = create;
475 
476     acb->nr_pending++;
477     return aio_req;
478 }
479 
/*
 * Block the caller until no AIOCB on s->inflight_aiocb_head overlaps @acb.
 *
 * Whenever an overlap is found, the caller sleeps on s->overlapping_queue
 * (passing s->queue_lock to qemu_co_queue_wait()) and the list is then
 * rescanned from the start.
 */
static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
{
    SheepdogAIOCB *other;
    bool waited;

    do {
        waited = false;
        QLIST_FOREACH(other, &s->inflight_aiocb_head, aiocb_siblings) {
            if (AIOCBOverlapping(acb, other)) {
                qemu_co_queue_wait(&s->overlapping_queue, &s->queue_lock);
                waited = true;
                break;
            }
        }
    } while (waited);
}
492 
/*
 * Initialize @acb for a request of kind @type covering @nb_sectors sectors
 * starting at @sector_num.
 *
 * Unless @type is AIOCB_FLUSH_CACHE, the function also waits (under
 * s->queue_lock) until no in-flight AIOCB overlaps the new one and then
 * links @acb into s->inflight_aiocb_head.
 */
static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
                         QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
                         int type)
{
    const uint32_t obj_bytes = UINT32_C(1) << s->inode.block_size_shift;

    /* Request description. */
    acb->s = s;
    acb->qiov = qiov;
    acb->aiocb_type = type;
    acb->sector_num = sector_num;
    acb->nb_sectors = nb_sectors;

    /* Completion bookkeeping. */
    acb->coroutine = qemu_coroutine_self();
    acb->nr_pending = 0;
    acb->ret = 0;

    /* Object indexes of the first byte and of the byte just past the end. */
    acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / obj_bytes;
    acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
                                acb->nb_sectors * BDRV_SECTOR_SIZE) / obj_bytes;

    /* Start with an empty dirty range (min > max). */
    acb->min_dirty_data_idx = UINT32_MAX;
    acb->max_dirty_data_idx = 0;

    if (type != AIOCB_FLUSH_CACHE) {
        qemu_co_mutex_lock(&s->queue_lock);
        wait_for_overlapping_aiocb(s, acb);
        QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
        qemu_co_mutex_unlock(&s->queue_lock);
    }
}
529 
sd_server_config(QDict * options,Error ** errp)530 static SocketAddress *sd_server_config(QDict *options, Error **errp)
531 {
532     QDict *server = NULL;
533     Visitor *iv = NULL;
534     SocketAddress *saddr = NULL;
535     Error *local_err = NULL;
536 
537     qdict_extract_subqdict(options, &server, "server.");
538 
539     iv = qobject_input_visitor_new_flat_confused(server, errp);
540     if (!iv) {
541         goto done;
542     }
543 
544     visit_type_SocketAddress(iv, NULL, &saddr, &local_err);
545     if (local_err) {
546         error_propagate(errp, local_err);
547         goto done;
548     }
549 
550 done:
551     visit_free(iv);
552     qobject_unref(server);
553     return saddr;
554 }
555 
556 /* Return -EIO in case of error, file descriptor on success */
connect_to_sdog(BDRVSheepdogState * s,Error ** errp)557 static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
558 {
559     int fd;
560 
561     fd = socket_connect(s->addr, errp);
562 
563     if (s->addr->type == SOCKET_ADDRESS_TYPE_INET && fd >= 0) {
564         int ret = socket_set_nodelay(fd);
565         if (ret < 0) {
566             warn_report("can't set TCP_NODELAY: %s", strerror(errno));
567         }
568     }
569 
570     if (fd >= 0) {
571         qemu_set_nonblock(fd);
572     } else {
573         fd = -EIO;
574     }
575 
576     return fd;
577 }
578 
579 /* Return 0 on success and -errno in case of error */
send_co_req(int sockfd,SheepdogReq * hdr,void * data,unsigned int * wlen)580 static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
581                                     unsigned int *wlen)
582 {
583     int ret;
584 
585     ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
586     if (ret != sizeof(*hdr)) {
587         error_report("failed to send a req, %s", strerror(errno));
588         return -errno;
589     }
590 
591     ret = qemu_co_send(sockfd, data, *wlen);
592     if (ret != *wlen) {
593         error_report("failed to send a req, %s", strerror(errno));
594         return -errno;
595     }
596 
597     return ret;
598 }
599 
/*
 * Arguments and result of one synchronous request, shared between do_req()
 * and the do_co_req() coroutine.
 */
typedef struct SheepdogReqCo {
    int sockfd;
    BlockDriverState *bs;       /* may be NULL; woken via bdrv_wakeup() */
    AioContext *aio_context;    /* context the fd handlers are set in */
    SheepdogReq *hdr;           /* request header; overwritten with the
                                 * response header */
    void *data;                 /* payload to send / buffer to receive into */
    unsigned int *wlen;         /* payload bytes to send */
    unsigned int *rlen;         /* in: receive buffer size; out: bytes
                                 * received (capped by hdr->data_length) */
    int ret;                    /* result stored by do_co_req() */
    bool finished;              /* set by do_co_req() when it is done */
    Coroutine *co;              /* coroutine woken by restart_co_req() */
} SheepdogReqCo;
612 
restart_co_req(void * opaque)613 static void restart_co_req(void *opaque)
614 {
615     SheepdogReqCo *srco = opaque;
616 
617     aio_co_wake(srco->co);
618 }
619 
do_co_req(void * opaque)620 static coroutine_fn void do_co_req(void *opaque)
621 {
622     int ret;
623     SheepdogReqCo *srco = opaque;
624     int sockfd = srco->sockfd;
625     SheepdogReq *hdr = srco->hdr;
626     void *data = srco->data;
627     unsigned int *wlen = srco->wlen;
628     unsigned int *rlen = srco->rlen;
629 
630     srco->co = qemu_coroutine_self();
631     aio_set_fd_handler(srco->aio_context, sockfd, false,
632                        NULL, restart_co_req, NULL, srco);
633 
634     ret = send_co_req(sockfd, hdr, data, wlen);
635     if (ret < 0) {
636         goto out;
637     }
638 
639     aio_set_fd_handler(srco->aio_context, sockfd, false,
640                        restart_co_req, NULL, NULL, srco);
641 
642     ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
643     if (ret != sizeof(*hdr)) {
644         error_report("failed to get a rsp, %s", strerror(errno));
645         ret = -errno;
646         goto out;
647     }
648 
649     if (*rlen > hdr->data_length) {
650         *rlen = hdr->data_length;
651     }
652 
653     if (*rlen) {
654         ret = qemu_co_recv(sockfd, data, *rlen);
655         if (ret != *rlen) {
656             error_report("failed to get the data, %s", strerror(errno));
657             ret = -errno;
658             goto out;
659         }
660     }
661     ret = 0;
662 out:
663     /* there is at most one request for this sockfd, so it is safe to
664      * set each handler to NULL. */
665     aio_set_fd_handler(srco->aio_context, sockfd, false,
666                        NULL, NULL, NULL, NULL);
667 
668     srco->co = NULL;
669     srco->ret = ret;
670     /* Set srco->finished before reading bs->wakeup.  */
671     atomic_mb_set(&srco->finished, true);
672     if (srco->bs) {
673         bdrv_wakeup(srco->bs);
674     }
675 }
676 
677 /*
678  * Send the request to the sheep in a synchronous manner.
679  *
680  * Return 0 on success, -errno in case of error.
681  */
do_req(int sockfd,BlockDriverState * bs,SheepdogReq * hdr,void * data,unsigned int * wlen,unsigned int * rlen)682 static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
683                   void *data, unsigned int *wlen, unsigned int *rlen)
684 {
685     Coroutine *co;
686     SheepdogReqCo srco = {
687         .sockfd = sockfd,
688         .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
689         .bs = bs,
690         .hdr = hdr,
691         .data = data,
692         .wlen = wlen,
693         .rlen = rlen,
694         .ret = 0,
695         .finished = false,
696     };
697 
698     if (qemu_in_coroutine()) {
699         do_co_req(&srco);
700     } else {
701         co = qemu_coroutine_create(do_co_req, &srco);
702         if (bs) {
703             bdrv_coroutine_enter(bs, co);
704             BDRV_POLL_WHILE(bs, !srco.finished);
705         } else {
706             qemu_coroutine_enter(co);
707             while (!srco.finished) {
708                 aio_poll(qemu_get_aio_context(), true);
709             }
710         }
711     }
712 
713     return srco.ret;
714 }
715 
716 static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
717                                          struct iovec *iov, int niov,
718                                          enum AIOCBState aiocb_type);
719 static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
720 static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
721 static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
722 static void co_write_request(void *opaque);
723 
/*
 * Drop the current connection, reconnect to the server and resend every
 * request that was in flight.  @opaque is the BDRVSheepdogState.
 *
 * Called from the error path of aio_read_response().
 */
static coroutine_fn void reconnect_to_sdog(void *opaque)
{
    BDRVSheepdogState *s = opaque;
    AIOReq *aio_req, *next;

    /* Unregister the handlers before the descriptor is closed. */
    aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
                       NULL, NULL, NULL);
    close(s->fd);
    s->fd = -1;

    /* Wait for outstanding write requests to be completed. */
    while (s->co_send != NULL) {
        co_write_request(opaque);
    }

    /* Try to reconnect the sheepdog server every one second. */
    while (s->fd < 0) {
        Error *local_err = NULL;
        s->fd = get_sheep_fd(s, &local_err);
        if (s->fd < 0) {
            trace_sheepdog_reconnect_to_sdog();
            error_report_err(local_err);
            qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000000ULL);
        }
    };

    /*
     * Now we have to resend all the request in the inflight queue.  However,
     * resend_aioreq() can yield and newly created requests can be added to the
     * inflight queue before the coroutine is resumed.  To avoid mixing them, we
     * have to move all the inflight requests to the failed queue before
     * resend_aioreq() is called.
     */
    qemu_co_mutex_lock(&s->queue_lock);
    QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) {
        QLIST_REMOVE(aio_req, aio_siblings);
        QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings);
    }

    /* Resend all the failed aio requests. */
    while (!QLIST_EMPTY(&s->failed_aio_head)) {
        aio_req = QLIST_FIRST(&s->failed_aio_head);
        QLIST_REMOVE(aio_req, aio_siblings);
        /* queue_lock is released around resend_aioreq(), which can yield. */
        qemu_co_mutex_unlock(&s->queue_lock);
        resend_aioreq(s, aio_req);
        qemu_co_mutex_lock(&s->queue_lock);
    }
    qemu_co_mutex_unlock(&s->queue_lock);
}
773 
/*
 * Receive responses of the I/O requests.
 *
 * This function runs in the coroutine that co_read_response(), the read
 * handler registered for s->fd, creates and enters when s->fd is ready for
 * reading responses.  @opaque is the BDRVSheepdogState.
 *
 * It reads one response header, finds the matching in-flight AIOReq by id,
 * performs the processing specific to the AIOCB type and then either
 * completes the request, resends it (SD_RES_READONLY) or, on a receive or
 * lookup failure, reconnects to the server.
 */
static void coroutine_fn aio_read_response(void *opaque)
{
    SheepdogObjRsp rsp;
    BDRVSheepdogState *s = opaque;
    int fd = s->fd;
    int ret;
    AIOReq *aio_req = NULL;
    SheepdogAIOCB *acb;
    uint64_t idx;

    /* read a header */
    ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
    if (ret != sizeof(rsp)) {
        error_report("failed to get the header, %s", strerror(errno));
        goto err;
    }

    /* find the right aio_req from the inflight aio list */
    QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
        if (aio_req->id == rsp.id) {
            break;
        }
    }
    /* aio_req is NULL here when the loop ran to the end without a match. */
    if (!aio_req) {
        error_report("cannot find aio_req %x", rsp.id);
        goto err;
    }

    acb = aio_req->aiocb;

    switch (acb->aiocb_type) {
    case AIOCB_WRITE_UDATA:
        /* Only writes to data objects can change the inode's object table. */
        if (!is_data_obj(aio_req->oid)) {
            break;
        }
        idx = data_oid_to_idx(aio_req->oid);

        if (aio_req->create) {
            /*
             * If the object is newly created one, we need to update
             * the vdi object (metadata object).  min_dirty_data_idx
             * and max_dirty_data_idx are changed to include updated
             * index between them.
             */
            if (rsp.result == SD_RES_SUCCESS) {
                s->inode.data_vdi_id[idx] = s->inode.vdi_id;
                acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
                acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
            }
        }
        break;
    case AIOCB_READ_UDATA:
        /* The payload follows the header; store it at this request's offset. */
        ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
                            aio_req->iov_offset, rsp.data_length);
        if (ret != rsp.data_length) {
            error_report("failed to get the data, %s", strerror(errno));
            goto err;
        }
        break;
    case AIOCB_FLUSH_CACHE:
        /*
         * A flush rejected with "invalid parameters" is not an error:
         * switch to SD_FLAG_CMD_DIRECT and report success.
         */
        if (rsp.result == SD_RES_INVALID_PARMS) {
            trace_sheepdog_aio_read_response();
            s->cache_flags = SD_FLAG_CMD_DIRECT;
            rsp.result = SD_RES_SUCCESS;
        }
        break;
    case AIOCB_DISCARD_OBJ:
        switch (rsp.result) {
        case SD_RES_INVALID_PARMS:
            /* Remember that discard is unsupported and report success. */
            error_report("server doesn't support discard command");
            rsp.result = SD_RES_SUCCESS;
            s->discard_supported = false;
            break;
        default:
            break;
        }
    }

    /* No more data for this aio_req (reload_inode below uses its own file
     * descriptor handler which doesn't use co_recv).
    */
    s->co_recv = NULL;

    qemu_co_mutex_lock(&s->queue_lock);
    QLIST_REMOVE(aio_req, aio_siblings);
    qemu_co_mutex_unlock(&s->queue_lock);

    switch (rsp.result) {
    case SD_RES_SUCCESS:
        break;
    case SD_RES_READONLY:
        /*
         * The object is read-only.  If it belongs to the vdi we have open,
         * reload the inode first; then retarget the request at the
         * (possibly new) vdi id and resend it.  The request stays pending,
         * so it is neither freed nor counted as finished here.
         */
        if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) {
            ret = reload_inode(s, 0, "");
            if (ret < 0) {
                goto err;
            }
        }
        if (is_data_obj(aio_req->oid)) {
            aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
                                           data_oid_to_idx(aio_req->oid));
        } else {
            aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
        }
        resend_aioreq(s, aio_req);
        return;
    default:
        /* Any other result fails the whole AIOCB with -EIO. */
        acb->ret = -EIO;
        error_report("%s", sd_strerror(rsp.result));
        break;
    }

    g_free(aio_req);

    if (!--acb->nr_pending) {
        /*
         * We've finished all requests which belong to the AIOCB, so
         * we can switch back to sd_co_readv/writev now.
         */
        aio_co_wake(acb->coroutine);
    }

    return;

err:
    reconnect_to_sdog(opaque);
}
906 
co_read_response(void * opaque)907 static void co_read_response(void *opaque)
908 {
909     BDRVSheepdogState *s = opaque;
910 
911     if (!s->co_recv) {
912         s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
913     }
914 
915     aio_co_enter(s->aio_context, s->co_recv);
916 }
917 
co_write_request(void * opaque)918 static void co_write_request(void *opaque)
919 {
920     BDRVSheepdogState *s = opaque;
921 
922     aio_co_wake(s->co_send);
923 }
924 
925 /*
926  * Return a socket descriptor to read/write objects.
927  *
928  * We cannot use this descriptor for other operations because
929  * the block driver may be on waiting response from the server.
930  */
get_sheep_fd(BDRVSheepdogState * s,Error ** errp)931 static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
932 {
933     int fd;
934 
935     fd = connect_to_sdog(s, errp);
936     if (fd < 0) {
937         return fd;
938     }
939 
940     aio_set_fd_handler(s->aio_context, fd, false,
941                        co_read_response, NULL, NULL, s);
942     return fd;
943 }
944 
/*
 * Parse @str as a decimal snapshot ID.
 *
 * Returns false when @str cannot be parsed as a number.
 * Otherwise returns true and stores the result in *@snapid: the parsed
 * number, or zero when the number is too large for a snapshot ID.
 * (A parsed zero is stored as zero, too.)
 */
static bool sd_parse_snapid(const char *str, uint32_t *snapid)
{
    unsigned long value;
    int rc = qemu_strtoul(str, NULL, 10, &value);

    if (rc == -ERANGE) {
        /* Out of range counts as "too large", not as a parse failure */
        value = 0;
    } else if (rc) {
        return false;
    }

    *snapid = value > UINT32_MAX ? 0 : value;
    return true;
}
971 
sd_parse_snapid_or_tag(const char * str,uint32_t * snapid,char tag[])972 static bool sd_parse_snapid_or_tag(const char *str,
973                                    uint32_t *snapid, char tag[])
974 {
975     if (!sd_parse_snapid(str, snapid)) {
976         *snapid = 0;
977         if (g_strlcpy(tag, str, SD_MAX_VDI_TAG_LEN) >= SD_MAX_VDI_TAG_LEN) {
978             return false;
979         }
980     } else if (!*snapid) {
981         return false;
982     } else {
983         tag[0] = 0;
984     }
985     return true;
986 }
987 
/*
 * Connection and VDI parameters extracted from a Sheepdog URI by
 * sd_parse_uri().  The string pointers refer into @uri / @qp, so they
 * are only valid until sd_config_done() is called.
 */
typedef struct {
    const char *path;           /* non-null iff transport is unix */
    const char *host;           /* valid when transport is tcp */
    int port;                   /* valid when transport is tcp */
    char vdi[SD_MAX_VDI_LEN];   /* VDI name, taken from the URI path */
    char tag[SD_MAX_VDI_TAG_LEN]; /* snapshot tag, empty if none given */
    uint32_t snap_id;           /* snapshot ID, CURRENT_VDI_ID if no fragment */
    /* Remainder is only for sd_config_done() */
    URI *uri;
    QueryParams *qp;
} SheepdogConfig;
999 
/*
 * Release the parsed query parameters (if any) and the parsed URI that
 * back @cfg.  The string pointers in @cfg must not be used afterwards.
 */
static void sd_config_done(SheepdogConfig *cfg)
{
    if (cfg->qp != NULL) {
        query_params_free(cfg->qp);
    }

    uri_free(cfg->uri);
}
1007 
/*
 * Parse the Sheepdog URI @filename into @cfg.
 *
 * Accepted forms:
 *   sheepdog[+tcp]://[host:port]/vdiname[#snapid-or-tag]
 *   sheepdog+unix:///vdiname?socket=path[#snapid-or-tag]
 *
 * @cfg is zeroed first.  On success, the caller has to release it with
 * sd_config_done() once done, since @cfg's string pointers refer into
 * the parsed URI and query parameters.  On failure, an error is stored
 * in @errp and @cfg has already been released.
 */
static void sd_parse_uri(SheepdogConfig *cfg, const char *filename,
                         Error **errp)
{
    Error *local_err = NULL;
    QueryParams *params;
    URI *parsed;
    bool unix_transport;

    memset(cfg, 0, sizeof(*cfg));

    parsed = uri_parse(filename);
    cfg->uri = parsed;
    if (parsed == NULL) {
        error_setg(&local_err, "invalid URI '%s'", filename);
        goto fail;
    }

    /* transport */
    if (g_strcmp0(parsed->scheme, "sheepdog+unix") == 0) {
        unix_transport = true;
    } else if (g_strcmp0(parsed->scheme, "sheepdog") == 0 ||
               g_strcmp0(parsed->scheme, "sheepdog+tcp") == 0) {
        unix_transport = false;
    } else {
        error_setg(&local_err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
                   " or 'sheepdog+unix'");
        goto fail;
    }

    /* VDI name: the path with its first character skipped */
    if (parsed->path == NULL || strcmp(parsed->path, "/") == 0) {
        error_setg(&local_err, "missing file path in URI");
        goto fail;
    }
    if (g_strlcpy(cfg->vdi, parsed->path + 1, SD_MAX_VDI_LEN)
        >= SD_MAX_VDI_LEN) {
        error_setg(&local_err, "VDI name is too long");
        goto fail;
    }

    params = query_params_parse(parsed->query);
    cfg->qp = params;

    if (unix_transport) {
        /* sheepdog+unix:///vdiname?socket=path */
        if (parsed->server || parsed->port) {
            error_setg(&local_err,
                       "URI scheme %s doesn't accept a server address",
                       parsed->scheme);
            goto fail;
        }
        if (params->n == 0) {
            error_setg(&local_err,
                       "URI scheme %s requires query parameter 'socket'",
                       parsed->scheme);
            goto fail;
        }
        if (params->n != 1 || strcmp(params->p[0].name, "socket") != 0) {
            error_setg(&local_err, "unexpected query parameters");
            goto fail;
        }
        cfg->path = params->p[0].value;
    } else {
        /* sheepdog[+tcp]://[host:port]/vdiname */
        if (params->n != 0) {
            error_setg(&local_err, "unexpected query parameters");
            goto fail;
        }
        cfg->host = parsed->server;
        cfg->port = parsed->port;
    }

    /* snapshot tag */
    if (parsed->fragment == NULL) {
        cfg->snap_id = CURRENT_VDI_ID; /* search current vdi */
        return;
    }
    if (!sd_parse_snapid_or_tag(parsed->fragment,
                                &cfg->snap_id, cfg->tag)) {
        error_setg(&local_err, "'%s' is not a valid snapshot ID",
                   parsed->fragment);
        goto fail;
    }
    return;

fail:
    /* Only reached with local_err set */
    error_propagate(errp, local_err);
    sd_config_done(cfg);
}
1095 
1096 /*
1097  * Parse a filename (old syntax)
1098  *
1099  * filename must be one of the following formats:
1100  *   1. [vdiname]
1101  *   2. [vdiname]:[snapid]
1102  *   3. [vdiname]:[tag]
1103  *   4. [hostname]:[port]:[vdiname]
1104  *   5. [hostname]:[port]:[vdiname]:[snapid]
1105  *   6. [hostname]:[port]:[vdiname]:[tag]
1106  *
1107  * You can boot from the snapshot images by specifying `snapid` or
1108  * `tag'.
1109  *
1110  * You can run VMs outside the Sheepdog cluster by specifying
1111  * `hostname' and `port' (experimental).
1112  */
parse_vdiname(SheepdogConfig * cfg,const char * filename,Error ** errp)1113 static void parse_vdiname(SheepdogConfig *cfg, const char *filename,
1114                           Error **errp)
1115 {
1116     Error *err = NULL;
1117     char *p, *q, *uri;
1118     const char *host_spec, *vdi_spec;
1119     int nr_sep;
1120 
1121     strstart(filename, "sheepdog:", &filename);
1122     p = q = g_strdup(filename);
1123 
1124     /* count the number of separators */
1125     nr_sep = 0;
1126     while (*p) {
1127         if (*p == ':') {
1128             nr_sep++;
1129         }
1130         p++;
1131     }
1132     p = q;
1133 
1134     /* use the first two tokens as host_spec. */
1135     if (nr_sep >= 2) {
1136         host_spec = p;
1137         p = strchr(p, ':');
1138         p++;
1139         p = strchr(p, ':');
1140         *p++ = '\0';
1141     } else {
1142         host_spec = "";
1143     }
1144 
1145     vdi_spec = p;
1146 
1147     p = strchr(vdi_spec, ':');
1148     if (p) {
1149         *p++ = '#';
1150     }
1151 
1152     uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
1153 
1154     /*
1155      * FIXME We to escape URI meta-characters, e.g. "x?y=z"
1156      * produces "sheepdog://x?y=z".  Because of that ...
1157      */
1158     sd_parse_uri(cfg, uri, &err);
1159     if (err) {
1160         /*
1161          * ... this can fail, but the error message is misleading.
1162          * Replace it by the traditional useless one until the
1163          * escaping is fixed.
1164          */
1165         error_free(err);
1166         error_setg(errp, "Can't parse filename");
1167     }
1168 
1169     g_free(q);
1170     g_free(uri);
1171 }
1172 
/*
 * Convert @filename into driver options.
 *
 * @filename is parsed as a URI if it contains "://", else in the old
 * colon-separated syntax.  The result is stored in @options as defaults
 * ("server.*", "vdi", "tag" and, for a non-zero snapshot ID, "snap-id").
 * For the inet transport, a missing host or port falls back to
 * SD_DEFAULT_ADDR / SD_DEFAULT_PORT.
 *
 * On parse failure, the error is stored in @errp and @options is left
 * untouched.
 */
static void sd_parse_filename(const char *filename, QDict *options,
                              Error **errp)
{
    Error *err = NULL;
    SheepdogConfig cfg;
    char buf[32];

    if (strstr(filename, "://")) {
        sd_parse_uri(&cfg, filename, &err);
    } else {
        parse_vdiname(&cfg, filename, &err);
    }
    if (err) {
        /* sd_parse_uri() has already released @cfg on failure */
        error_propagate(errp, err);
        return;
    }

    if (cfg.path) {
        qdict_set_default_str(options, "server.path", cfg.path);
        qdict_set_default_str(options, "server.type", "unix");
    } else {
        qdict_set_default_str(options, "server.type", "inet");
        qdict_set_default_str(options, "server.host",
                              cfg.host ?: SD_DEFAULT_ADDR);
        snprintf(buf, sizeof(buf), "%d", cfg.port ?: SD_DEFAULT_PORT);
        qdict_set_default_str(options, "server.port", buf);
    }
    qdict_set_default_str(options, "vdi", cfg.vdi);
    qdict_set_default_str(options, "tag", cfg.tag);
    if (cfg.snap_id) {
        /*
         * snap_id is a uint32_t: format it as unsigned, so that IDs
         * above INT_MAX are not rendered as negative numbers.
         */
        snprintf(buf, sizeof(buf), "%" PRIu32, cfg.snap_id);
        qdict_set_default_str(options, "snap-id", buf);
    }

    sd_config_done(&cfg);
}
1209 
/*
 * Look up the VDI named @filename, with snapshot ID @snapid and snapshot
 * tag @tag, on a freshly opened connection, and store its ID in *@vid.
 *
 * If @lock is true, the request is SD_OP_LOCK_VDI (normal lock type)
 * rather than SD_OP_GET_VDI_INFO.
 *
 * Returns 0 on success.  On failure, returns a negative value with
 * @errp set: the connect or request error, -ENOENT if the sheep reports
 * no such VDI, -EBUSY if it reports the VDI locked, -EIO for any other
 * result.
 */
static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
                         uint32_t snapid, const char *tag, uint32_t *vid,
                         bool lock, Error **errp)
{
    SheepdogVdiReq req;
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&req;
    char payload[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN] QEMU_NONSTRING;
    unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
    unsigned int rlen = 0;
    int sock, ret;

    sock = connect_to_sdog(s, errp);
    if (sock < 0) {
        return sock;
    }

    /*
     * strncpy() pads the destination with zeros, so these two calls
     * fill the whole payload.  That matters because all of it is sent,
     * and no uninitialized bytes must go out.
     */
    strncpy(payload, filename, SD_MAX_VDI_LEN);
    strncpy(payload + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);

    memset(&req, 0, sizeof(req));
    req.proto_ver = SD_PROTO_VER;
    req.opcode = lock ? SD_OP_LOCK_VDI : SD_OP_GET_VDI_INFO;
    if (lock) {
        req.type = LOCK_TYPE_NORMAL;
    }
    req.data_length = wlen;
    req.snapid = snapid;
    req.flags = SD_FLAG_CMD_WRITE;

    ret = do_req(sock, s->bs, (SheepdogReq *)&req, payload, &wlen, &rlen);
    if (ret) {
        error_setg_errno(errp, -ret, "cannot get vdi info");
    } else if (rsp->result != SD_RES_SUCCESS) {
        error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s",
                   sd_strerror(rsp->result), filename, snapid, tag);
        switch (rsp->result) {
        case SD_RES_NO_VDI:
            ret = -ENOENT;
            break;
        case SD_RES_VDI_LOCKED:
            ret = -EBUSY;
            break;
        default:
            ret = -EIO;
            break;
        }
    } else {
        *vid = rsp->vdi_id;
    }

    closesocket(sock);
    return ret;
}
1270 
add_aio_request(BDRVSheepdogState * s,AIOReq * aio_req,struct iovec * iov,int niov,enum AIOCBState aiocb_type)1271 static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
1272                                          struct iovec *iov, int niov,
1273                                          enum AIOCBState aiocb_type)
1274 {
1275     int nr_copies = s->inode.nr_copies;
1276     SheepdogObjReq hdr;
1277     unsigned int wlen = 0;
1278     int ret;
1279     uint64_t oid = aio_req->oid;
1280     unsigned int datalen = aio_req->data_len;
1281     uint64_t offset = aio_req->offset;
1282     uint8_t flags = aio_req->flags;
1283     uint64_t old_oid = aio_req->base_oid;
1284     bool create = aio_req->create;
1285 
1286     qemu_co_mutex_lock(&s->queue_lock);
1287     QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
1288     qemu_co_mutex_unlock(&s->queue_lock);
1289 
1290     if (!nr_copies) {
1291         error_report("bug");
1292     }
1293 
1294     memset(&hdr, 0, sizeof(hdr));
1295 
1296     switch (aiocb_type) {
1297     case AIOCB_FLUSH_CACHE:
1298         hdr.opcode = SD_OP_FLUSH_VDI;
1299         break;
1300     case AIOCB_READ_UDATA:
1301         hdr.opcode = SD_OP_READ_OBJ;
1302         hdr.flags = flags;
1303         break;
1304     case AIOCB_WRITE_UDATA:
1305         if (create) {
1306             hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1307         } else {
1308             hdr.opcode = SD_OP_WRITE_OBJ;
1309         }
1310         wlen = datalen;
1311         hdr.flags = SD_FLAG_CMD_WRITE | flags;
1312         break;
1313     case AIOCB_DISCARD_OBJ:
1314         hdr.opcode = SD_OP_WRITE_OBJ;
1315         hdr.flags = SD_FLAG_CMD_WRITE | flags;
1316         s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
1317         offset = offsetof(SheepdogInode,
1318                           data_vdi_id[data_oid_to_idx(oid)]);
1319         oid = vid_to_vdi_oid(s->inode.vdi_id);
1320         wlen = datalen = sizeof(uint32_t);
1321         break;
1322     }
1323 
1324     if (s->cache_flags) {
1325         hdr.flags |= s->cache_flags;
1326     }
1327 
1328     hdr.oid = oid;
1329     hdr.cow_oid = old_oid;
1330     hdr.copies = s->inode.nr_copies;
1331 
1332     hdr.data_length = datalen;
1333     hdr.offset = offset;
1334 
1335     hdr.id = aio_req->id;
1336 
1337     qemu_co_mutex_lock(&s->lock);
1338     s->co_send = qemu_coroutine_self();
1339     aio_set_fd_handler(s->aio_context, s->fd, false,
1340                        co_read_response, co_write_request, NULL, s);
1341     socket_set_cork(s->fd, 1);
1342 
1343     /* send a header */
1344     ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
1345     if (ret != sizeof(hdr)) {
1346         error_report("failed to send a req, %s", strerror(errno));
1347         goto out;
1348     }
1349 
1350     if (wlen) {
1351         ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
1352         if (ret != wlen) {
1353             error_report("failed to send a data, %s", strerror(errno));
1354         }
1355     }
1356 out:
1357     socket_set_cork(s->fd, 0);
1358     aio_set_fd_handler(s->aio_context, s->fd, false,
1359                        co_read_response, NULL, NULL, s);
1360     s->co_send = NULL;
1361     qemu_co_mutex_unlock(&s->lock);
1362 }
1363 
/*
 * Read or write @datalen bytes at @offset of object @oid over @fd.
 *
 * If @write is false, the data is read into @buf.  If @write is true,
 * the data is taken from @buf, and @create selects
 * SD_OP_CREATE_AND_WRITE_OBJ over SD_OP_WRITE_OBJ.  @cache_flags is
 * OR-ed into the request flags; @copies goes into the request as is.
 *
 * Returns 0 on success, the error from do_req() if the request could
 * not be carried out, or -EIO if the sheep reports a failure.  Errors
 * are reported with error_report().
 */
static int read_write_object(int fd, BlockDriverState *bs, char *buf,
                             uint64_t oid, uint8_t copies,
                             unsigned int datalen, uint64_t offset,
                             bool write, bool create, uint32_t cache_flags)
{
    SheepdogObjReq hdr;
    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
    unsigned int wlen = write ? datalen : 0;
    unsigned int rlen = write ? 0 : datalen;
    int ret;

    memset(&hdr, 0, sizeof(hdr));

    if (!write) {
        hdr.opcode = SD_OP_READ_OBJ;
    } else {
        hdr.opcode = create ? SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE;
    }
    hdr.flags |= cache_flags;

    hdr.oid = oid;
    hdr.copies = copies;
    hdr.offset = offset;
    hdr.data_length = datalen;

    ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
    if (ret) {
        error_report("failed to send a request to the sheep");
        return ret;
    }

    if (rsp->result != SD_RES_SUCCESS) {
        error_report("%s", sd_strerror(rsp->result));
        return -EIO;
    }
    return 0;
}
1412 
/*
 * Read @datalen bytes at @offset of object @oid into @buf.
 * Thin wrapper around read_write_object(); see there for the return
 * value.
 */
static int read_object(int fd, BlockDriverState *bs, char *buf,
                       uint64_t oid, uint8_t copies,
                       unsigned int datalen, uint64_t offset,
                       uint32_t cache_flags)
{
    const bool write = false;
    const bool create = false;

    return read_write_object(fd, bs, buf, oid, copies, datalen, offset,
                             write, create, cache_flags);
}
1422 
/*
 * Write @datalen bytes from @buf at @offset of object @oid, creating
 * the object if @create is true.
 * Thin wrapper around read_write_object(); see there for the return
 * value.
 */
static int write_object(int fd, BlockDriverState *bs, char *buf,
                        uint64_t oid, uint8_t copies,
                        unsigned int datalen, uint64_t offset, bool create,
                        uint32_t cache_flags)
{
    const bool write = true;

    return read_write_object(fd, bs, buf, oid, copies, datalen, offset,
                             write, create, cache_flags);
}
1432 
1433 /* update inode with the latest state */
reload_inode(BDRVSheepdogState * s,uint32_t snapid,const char * tag)1434 static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
1435 {
1436     Error *local_err = NULL;
1437     SheepdogInode *inode;
1438     int ret = 0, fd;
1439     uint32_t vid = 0;
1440 
1441     fd = connect_to_sdog(s, &local_err);
1442     if (fd < 0) {
1443         error_report_err(local_err);
1444         return -EIO;
1445     }
1446 
1447     inode = g_malloc(SD_INODE_HEADER_SIZE);
1448 
1449     ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err);
1450     if (ret) {
1451         error_report_err(local_err);
1452         goto out;
1453     }
1454 
1455     ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
1456                       s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
1457                       s->cache_flags);
1458     if (ret < 0) {
1459         goto out;
1460     }
1461 
1462     if (inode->vdi_id != s->inode.vdi_id) {
1463         memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE);
1464     }
1465 
1466 out:
1467     g_free(inode);
1468     closesocket(fd);
1469 
1470     return ret;
1471 }
1472 
resend_aioreq(BDRVSheepdogState * s,AIOReq * aio_req)1473 static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
1474 {
1475     SheepdogAIOCB *acb = aio_req->aiocb;
1476 
1477     aio_req->create = false;
1478 
1479     /* check whether this request becomes a CoW one */
1480     if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) {
1481         int idx = data_oid_to_idx(aio_req->oid);
1482 
1483         if (is_data_obj_writable(&s->inode, idx)) {
1484             goto out;
1485         }
1486 
1487         if (s->inode.data_vdi_id[idx]) {
1488             aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
1489             aio_req->flags |= SD_FLAG_CMD_COW;
1490         }
1491         aio_req->create = true;
1492     }
1493 out:
1494     if (is_data_obj(aio_req->oid)) {
1495         add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1496                         acb->aiocb_type);
1497     } else {
1498         struct iovec iov;
1499         iov.iov_base = &s->inode;
1500         iov.iov_len = sizeof(s->inode);
1501         add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
1502     }
1503 }
1504 
/*
 * Clear all handlers of the driver's socket in its current AioContext.
 */
static void sd_detach_aio_context(BlockDriverState *bs)
{
    BDRVSheepdogState *state = bs->opaque;

    aio_set_fd_handler(state->aio_context, state->fd, false,
                       NULL, NULL, NULL, NULL);
}
1512 
/*
 * Record @new_context as the driver's AioContext and install
 * co_read_response() as handler of the driver's socket there.
 */
static void sd_attach_aio_context(BlockDriverState *bs,
                                  AioContext *new_context)
{
    BDRVSheepdogState *state = bs->opaque;

    state->aio_context = new_context;
    aio_set_fd_handler(state->aio_context, state->fd, false,
                       co_read_response, NULL, NULL, state);
}
1522 
/*
 * Runtime options sd_open() reads from its options QDict (in addition
 * to the server address, which sd_open() obtains separately).
 */
static QemuOptsList runtime_opts = {
    .name = "sheepdog",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            /* VDI name; mandatory, see sd_open() */
            .name = "vdi",
            .type = QEMU_OPT_STRING,
        },
        {
            /* numeric snapshot ID; sd_open() rejects zero */
            .name = "snap-id",
            .type = QEMU_OPT_NUMBER,
        },
        {
            /* snapshot tag; sd_open() treats a missing tag as "" */
            .name = "tag",
            .type = QEMU_OPT_STRING,
        },
        { /* end of list */ }
    },
};
1542 
/*
 * Open a Sheepdog VDI.
 *
 * Reads "vdi", "snap-id" and "tag" from @options (see runtime_opts) plus
 * the server address, connects to the sheep, looks up and locks the VDI,
 * and loads its inode into the driver state.
 *
 * @flags: BDRV_O_NOCACHE selects SD_FLAG_CMD_DIRECT instead of the
 * default SD_FLAG_CMD_CACHE.
 *
 * Returns 0 on success, a negative errno-style value with @errp set on
 * failure.
 *
 * NOTE(review): s->addr is not released on the error paths below;
 * confirm whether something else frees it when open fails.
 */
static int sd_open(BlockDriverState *bs, QDict *options, int flags,
                   Error **errp)
{
    int ret, fd;
    uint32_t vid = 0;
    BDRVSheepdogState *s = bs->opaque;
    const char *vdi, *snap_id_str, *tag;
    uint64_t snap_id;
    char *buf = NULL;
    QemuOpts *opts;
    Error *local_err = NULL;

    s->bs = bs;
    s->aio_context = bdrv_get_aio_context(bs);

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto err_no_fd;
    }

    s->addr = sd_server_config(options, errp);
    if (!s->addr) {
        ret = -EINVAL;
        goto err_no_fd;
    }

    /* These strings belong to @opts; they stay valid until it is deleted */
    vdi = qemu_opt_get(opts, "vdi");
    snap_id_str = qemu_opt_get(opts, "snap-id");
    snap_id = qemu_opt_get_number(opts, "snap-id", CURRENT_VDI_ID);
    tag = qemu_opt_get(opts, "tag");

    if (!vdi) {
        error_setg(errp, "parameter 'vdi' is missing");
        ret = -EINVAL;
        goto err_no_fd;
    }
    if (strlen(vdi) >= SD_MAX_VDI_LEN) {
        error_setg(errp, "value of parameter 'vdi' is too long");
        ret = -EINVAL;
        goto err_no_fd;
    }

    /* Map out-of-range IDs to zero, which is rejected right below */
    if (snap_id > UINT32_MAX) {
        snap_id = 0;
    }
    if (snap_id_str && !snap_id) {
        error_setg(errp, "'snap-id=%s' is not a valid snapshot ID",
                   snap_id_str);
        ret = -EINVAL;
        goto err_no_fd;
    }

    if (!tag) {
        tag = "";
    }
    if (strlen(tag) >= SD_MAX_VDI_TAG_LEN) {
        error_setg(errp, "value of parameter 'tag' is too long");
        ret = -EINVAL;
        goto err_no_fd;
    }

    QLIST_INIT(&s->inflight_aio_head);
    QLIST_INIT(&s->failed_aio_head);
    QLIST_INIT(&s->inflight_aiocb_head);

    /* Long-lived socket for object I/O; from here on, fail via "err" */
    s->fd = get_sheep_fd(s, errp);
    if (s->fd < 0) {
        ret = s->fd;
        goto err_no_fd;
    }

    /* Look up the VDI and take its lock (lock == true) */
    ret = find_vdi_name(s, vdi, (uint32_t)snap_id, tag, &vid, true, errp);
    if (ret) {
        goto err;
    }

    /*
     * QEMU block layer emulates writethrough cache as 'writeback + flush', so
     * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
     */
    s->cache_flags = SD_FLAG_CMD_CACHE;
    if (flags & BDRV_O_NOCACHE) {
        s->cache_flags = SD_FLAG_CMD_DIRECT;
    }
    s->discard_supported = true;

    /* A snapshot ID or tag means we are opening a snapshot */
    if (snap_id || tag[0]) {
        trace_sheepdog_open(vid);
        s->is_snapshot = true;
    }

    /* Separate short-lived connection, used only to read the inode */
    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        ret = fd;
        goto err;
    }

    buf = g_malloc(SD_INODE_SIZE);
    ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
                      0, SD_INODE_SIZE, 0, s->cache_flags);

    closesocket(fd);

    if (ret) {
        error_setg(errp, "Can't read snapshot inode");
        goto err;
    }

    memcpy(&s->inode, buf, sizeof(s->inode));

    bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
    pstrcpy(s->name, sizeof(s->name), vdi);
    qemu_co_mutex_init(&s->lock);
    qemu_co_mutex_init(&s->queue_lock);
    qemu_co_queue_init(&s->overlapping_queue);
    qemu_opts_del(opts);
    g_free(buf);
    return 0;

err:
    /* Undo get_sheep_fd(): clear the handlers, then close the socket */
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
                       false, NULL, NULL, NULL, NULL);
    closesocket(s->fd);
err_no_fd:
    qemu_opts_del(opts);
    g_free(buf);
    return ret;
}
1674 
/*
 * Prepare a reopen: allocate the reopen state in @state->opaque, derive
 * the new cache flags from @state->flags (BDRV_O_NOCACHE selects
 * SD_FLAG_CMD_DIRECT, else SD_FLAG_CMD_CACHE), and open a new connection
 * to the sheep.
 *
 * Returns 0 on success.  On failure, returns the negative value from
 * get_sheep_fd() with @errp set, and leaves @state->opaque NULL: the
 * reopen state is released right here, so that it is neither leaked nor
 * seen by sd_reopen_abort() with a failed descriptor in it.
 */
static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
                             Error **errp)
{
    BDRVSheepdogState *s = state->bs->opaque;
    BDRVSheepdogReopenState *re_s;
    int ret = 0;

    re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);

    re_s->cache_flags = SD_FLAG_CMD_CACHE;
    if (state->flags & BDRV_O_NOCACHE) {
        re_s->cache_flags = SD_FLAG_CMD_DIRECT;
    }

    re_s->fd = get_sheep_fd(s, errp);
    if (re_s->fd < 0) {
        ret = re_s->fd;
        g_free(re_s);
        state->opaque = NULL;
        return ret;
    }

    return ret;
}
1697 
/*
 * Commit a prepared reopen: retire the current socket (clear its
 * handlers and close it), switch the driver over to the socket and
 * cache flags from the reopen state, and free the reopen state.
 */
static void sd_reopen_commit(BDRVReopenState *state)
{
    BDRVSheepdogState *s = state->bs->opaque;
    BDRVSheepdogReopenState *pending = state->opaque;

    if (s->fd != 0) {
        aio_set_fd_handler(s->aio_context, s->fd, false,
                           NULL, NULL, NULL, NULL);
        closesocket(s->fd);
    }

    s->fd = pending->fd;
    s->cache_flags = pending->cache_flags;

    g_free(pending);
    state->opaque = NULL;
}
1717 
/*
 * Abort a reopen: close the socket opened by sd_reopen_prepare(), if
 * any, and free the reopen state.  Does nothing when there is no reopen
 * state.
 */
static void sd_reopen_abort(BDRVReopenState *state)
{
    BDRVSheepdogReopenState *re_s = state->opaque;
    BDRVSheepdogState *s = state->bs->opaque;

    if (re_s == NULL) {
        return;
    }

    /*
     * Zero is what the zero-initialized reopen state holds before a
     * socket is opened, and a negative value is a failed
     * get_sheep_fd().  Neither is a socket to tear down.
     */
    if (re_s->fd > 0) {
        aio_set_fd_handler(s->aio_context, re_s->fd, false,
                           NULL, NULL, NULL, NULL);
        closesocket(re_s->fd);
    }

    g_free(state->opaque);
    state->opaque = NULL;

    return;
}
1738 
/*
 * Ask the sheep to create a new VDI named s->name on a freshly opened
 * connection.
 *
 * Size, copy policy, number of copies and block size shift are taken
 * from s->inode, whose VDI ID is sent as the base VDI.  @snapshot is
 * sent as the request's snapid.  If @vdi_id is non-null, the new VDI's
 * ID is stored there on success.
 *
 * Returns 0 on success.  On failure, returns a negative value with
 * @errp set: the connect or request error, or -EIO if the sheep reports
 * a failure.
 */
static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
                        Error **errp)
{
    SheepdogVdiReq req;
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&req;
    char name[SD_MAX_VDI_LEN] = { 0 };
    unsigned int wlen = SD_MAX_VDI_LEN;
    unsigned int rlen = 0;
    int sock, ret;

    sock = connect_to_sdog(s, errp);
    if (sock < 0) {
        return sock;
    }

    /* FIXME: would it be better to fail (e.g., return -EIO) when filename
     * does not fit in buf?  For now, just truncate and avoid buffer overrun.
     */
    pstrcpy(name, sizeof(name), s->name);

    memset(&req, 0, sizeof(req));
    req.opcode = SD_OP_NEW_VDI;
    req.flags = SD_FLAG_CMD_WRITE;
    req.data_length = wlen;
    req.snapid = snapshot;
    req.base_vdi_id = s->inode.vdi_id;
    req.vdi_size = s->inode.vdi_size;
    req.copy_policy = s->inode.copy_policy;
    req.copies = s->inode.nr_copies;
    req.block_size_shift = s->inode.block_size_shift;

    ret = do_req(sock, NULL, (SheepdogReq *)&req, name, &wlen, &rlen);

    closesocket(sock);

    if (ret) {
        error_setg_errno(errp, -ret, "create failed");
        return ret;
    }
    if (rsp->result != SD_RES_SUCCESS) {
        error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name);
        return -EIO;
    }

    if (vdi_id != NULL) {
        *vdi_id = rsp->vdi_id;
    }
    return 0;
}
1794 
/*
 * Pre-allocate @bs from @old_size up to @new_size.
 *
 * The range is processed in chunks of the inode's object size, capped
 * at SD_DATA_OBJ_SIZE.  Each chunk is read and written back unchanged
 * through a temporary BlockBackend that allows writes beyond EOF.
 *
 * Returns 0 on success, a negative errno-style value with @errp set on
 * failure.
 */
static int sd_prealloc(BlockDriverState *bs, int64_t old_size, int64_t new_size,
                       Error **errp)
{
    BlockBackend *blk = NULL;
    BDRVSheepdogState *base = bs->opaque;
    unsigned long buf_size;
    uint32_t idx, max_idx;
    uint32_t object_size;
    void *buf = NULL;
    int ret;

    blk = blk_new(bdrv_get_aio_context(bs),
                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
                  BLK_PERM_ALL);

    ret = blk_insert_bs(blk, bs, errp);
    if (ret < 0) {
        /* blk_insert_bs() was handed @errp; don't set it again */
        goto out_with_err_set;
    }

    blk_set_allow_write_beyond_eof(blk, true);

    object_size = (UINT32_C(1) << base->inode.block_size_shift);
    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
    buf = g_malloc0(buf_size);

    max_idx = DIV_ROUND_UP(new_size, buf_size);

    for (idx = old_size / buf_size; idx < max_idx; idx++) {
        /*
         * Widen before multiplying: where unsigned long is 32 bits
         * wide, idx * buf_size would wrap around for offsets of 4 GiB
         * and beyond.
         */
        int64_t offset = (int64_t)idx * buf_size;

        /*
         * The created image can be a cloned image, so we need to read
         * a data from the source image.
         */
        ret = blk_pread(blk, offset, buf, buf_size);
        if (ret < 0) {
            goto out;
        }
        ret = blk_pwrite(blk, offset, buf, buf_size, 0);
        if (ret < 0) {
            goto out;
        }
    }

    ret = 0;
out:
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Can't pre-allocate");
    }
out_with_err_set:
    blk_unref(blk);
    g_free(buf);

    return ret;
}
1849 
/*
 * Open the image described by @location with the "sheepdog" driver and
 * pre-allocate its first @size bytes with sd_prealloc().
 *
 * @location is converted to a flattened QDict of driver options first.
 *
 * Returns 0 on success.  On failure, returns a negative value with
 * @errp set: -EINVAL if the conversion fails, -EIO if the image cannot
 * be opened, else the error from sd_prealloc().
 *
 * NOTE(review): no visit_complete() call is visible between the visit
 * and the use of @obj; confirm @obj is populated at that point.
 */
static int sd_create_prealloc(BlockdevOptionsSheepdog *location, int64_t size,
                              Error **errp)
{
    Error *local_err = NULL;
    QObject *obj = NULL;
    BlockDriverState *bs;
    QDict *opts;
    Visitor *v;
    int ret;

    v = qobject_output_visitor_new(&obj);
    visit_type_BlockdevOptionsSheepdog(v, NULL, &location, &local_err);
    visit_free(v);

    if (local_err) {
        error_propagate(errp, local_err);
        qobject_unref(obj);
        return -EINVAL;
    }

    opts = qobject_to(QDict, obj);
    qdict_flatten(opts);

    qdict_put_str(opts, "driver", "sheepdog");

    bs = bdrv_open(NULL, NULL, opts, BDRV_O_PROTOCOL | BDRV_O_RDWR, errp);
    if (bs == NULL) {
        ret = -EIO;
    } else {
        ret = sd_prealloc(bs, 0, size, errp);
    }

    bdrv_unref(bs);
    qobject_unref(opts);
    return ret;
}
1887 
/*
 * Validate the redundancy setting @opt and store it in @s->inode as
 * copy_policy and nr_copies.
 *
 * Full replication: 1 <= copies <= SD_MAX_COPIES; copy_policy is 0.
 * Erasure coding: data strips must be one of 2, 4, 8, 16, and
 * 1 <= parity strips < SD_EC_MAX_STRIP; nr_copies is their sum.
 *
 * Returns 0 on success, -EINVAL if a value is out of range (the inode
 * is left untouched then).
 */
static int parse_redundancy(BDRVSheepdogState *s, SheepdogRedundancy *opt)
{
    struct SheepdogInode *inode = &s->inode;

    if (opt->type == SHEEPDOG_REDUNDANCY_TYPE_FULL) {
        if (opt->u.full.copies < 1 || opt->u.full.copies > SD_MAX_COPIES) {
            return -EINVAL;
        }
        inode->copy_policy = 0;
        inode->nr_copies = opt->u.full.copies;
        return 0;
    }

    if (opt->type == SHEEPDOG_REDUNDANCY_TYPE_ERASURE_CODED) {
        int64_t data = opt->u.erasure_coded.data_strips;
        int64_t parity = opt->u.erasure_coded.parity_strips;
        bool data_ok = data == 2 || data == 4 || data == 8 || data == 16;

        if (!data_ok) {
            return -EINVAL;
        }
        if (parity < 1 || parity >= SD_EC_MAX_STRIP) {
            return -EINVAL;
        }

        /*
         * 4 bits for parity and 4 bits for data.
         * The data strip count is stored halved, because 16 itself
         * does not fit into 4 bits.
         */
        inode->copy_policy = ((data / 2) << 4) + parity;
        inode->nr_copies = data + parity;
        return 0;
    }

    g_assert_not_reached();

    return -EINVAL;
}
1929 
1930 /*
1931  * Sheepdog support two kinds of redundancy, full replication and erasure
1932  * coding.
1933  *
1934  * # create a fully replicated vdi with x copies
1935  * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
1936  *
1937  * # create a erasure coded vdi with x data strips and y parity strips
1938  * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
1939  */
parse_redundancy_str(const char * opt)1940 static SheepdogRedundancy *parse_redundancy_str(const char *opt)
1941 {
1942     SheepdogRedundancy *redundancy;
1943     const char *n1, *n2;
1944     long copy, parity;
1945     char p[10];
1946     int ret;
1947 
1948     pstrcpy(p, sizeof(p), opt);
1949     n1 = strtok(p, ":");
1950     n2 = strtok(NULL, ":");
1951 
1952     if (!n1) {
1953         return NULL;
1954     }
1955 
1956     ret = qemu_strtol(n1, NULL, 10, &copy);
1957     if (ret < 0) {
1958         return NULL;
1959     }
1960 
1961     redundancy = g_new0(SheepdogRedundancy, 1);
1962     if (!n2) {
1963         *redundancy = (SheepdogRedundancy) {
1964             .type               = SHEEPDOG_REDUNDANCY_TYPE_FULL,
1965             .u.full.copies      = copy,
1966         };
1967     } else {
1968         ret = qemu_strtol(n2, NULL, 10, &parity);
1969         if (ret < 0) {
1970             g_free(redundancy);
1971             return NULL;
1972         }
1973 
1974         *redundancy = (SheepdogRedundancy) {
1975             .type               = SHEEPDOG_REDUNDANCY_TYPE_ERASURE_CODED,
1976             .u.erasure_coded    = {
1977                 .data_strips    = copy,
1978                 .parity_strips  = parity,
1979             },
1980         };
1981     }
1982 
1983     return redundancy;
1984 }
1985 
/*
 * Convert the optional object size in @opts into the block_size_shift of
 * the inode of @s.
 *
 * When no object size was given the inode is left untouched.  Otherwise the
 * size must be a power of two whose exponent lies in [20, 31].
 *
 * Returns 0 on success (or when there is nothing to do), -EINVAL otherwise.
 */
static int parse_block_size_shift(BDRVSheepdogState *s,
                                  BlockdevCreateOptionsSheepdog *opts)
{
    uint64_t size;
    int shift;

    if (!opts->has_object_size) {
        return 0;
    }

    size = opts->object_size;
    if (size & (size - 1)) {
        /* more than one bit set: not a power of 2 */
        return -EINVAL;
    }

    shift = ctz32(size);
    if (shift < 20 || shift > 31) {
        return -EINVAL;
    }

    s->inode.block_size_shift = (uint8_t)shift;
    return 0;
}
2008 
sd_co_create(BlockdevCreateOptions * options,Error ** errp)2009 static int sd_co_create(BlockdevCreateOptions *options, Error **errp)
2010 {
2011     BlockdevCreateOptionsSheepdog *opts = &options->u.sheepdog;
2012     int ret = 0;
2013     uint32_t vid = 0;
2014     char *backing_file = NULL;
2015     char *buf = NULL;
2016     BDRVSheepdogState *s;
2017     uint64_t max_vdi_size;
2018     bool prealloc = false;
2019 
2020     assert(options->driver == BLOCKDEV_DRIVER_SHEEPDOG);
2021 
2022     s = g_new0(BDRVSheepdogState, 1);
2023 
2024     /* Steal SocketAddress from QAPI, set NULL to prevent double free */
2025     s->addr = opts->location->server;
2026     opts->location->server = NULL;
2027 
2028     if (strlen(opts->location->vdi) >= sizeof(s->name)) {
2029         error_setg(errp, "'vdi' string too long");
2030         ret = -EINVAL;
2031         goto out;
2032     }
2033     pstrcpy(s->name, sizeof(s->name), opts->location->vdi);
2034 
2035     s->inode.vdi_size = opts->size;
2036     backing_file = opts->backing_file;
2037 
2038     if (!opts->has_preallocation) {
2039         opts->preallocation = PREALLOC_MODE_OFF;
2040     }
2041     switch (opts->preallocation) {
2042     case PREALLOC_MODE_OFF:
2043         prealloc = false;
2044         break;
2045     case PREALLOC_MODE_FULL:
2046         prealloc = true;
2047         break;
2048     default:
2049         error_setg(errp, "Preallocation mode not supported for Sheepdog");
2050         ret = -EINVAL;
2051         goto out;
2052     }
2053 
2054     if (opts->has_redundancy) {
2055         ret = parse_redundancy(s, opts->redundancy);
2056         if (ret < 0) {
2057             error_setg(errp, "Invalid redundancy mode");
2058             goto out;
2059         }
2060     }
2061     ret = parse_block_size_shift(s, opts);
2062     if (ret < 0) {
2063         error_setg(errp, "Invalid object_size."
2064                          " obect_size needs to be power of 2"
2065                          " and be limited from 2^20 to 2^31");
2066         goto out;
2067     }
2068 
2069     if (opts->has_backing_file) {
2070         BlockBackend *blk;
2071         BDRVSheepdogState *base;
2072         BlockDriver *drv;
2073 
2074         /* Currently, only Sheepdog backing image is supported. */
2075         drv = bdrv_find_protocol(opts->backing_file, true, NULL);
2076         if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
2077             error_setg(errp, "backing_file must be a sheepdog image");
2078             ret = -EINVAL;
2079             goto out;
2080         }
2081 
2082         blk = blk_new_open(opts->backing_file, NULL, NULL,
2083                            BDRV_O_PROTOCOL, errp);
2084         if (blk == NULL) {
2085             ret = -EIO;
2086             goto out;
2087         }
2088 
2089         base = blk_bs(blk)->opaque;
2090 
2091         if (!is_snapshot(&base->inode)) {
2092             error_setg(errp, "cannot clone from a non snapshot vdi");
2093             blk_unref(blk);
2094             ret = -EINVAL;
2095             goto out;
2096         }
2097         s->inode.vdi_id = base->inode.vdi_id;
2098         blk_unref(blk);
2099     }
2100 
2101     s->aio_context = qemu_get_aio_context();
2102 
2103     /* if block_size_shift is not specified, get cluster default value */
2104     if (s->inode.block_size_shift == 0) {
2105         SheepdogVdiReq hdr;
2106         SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
2107         int fd;
2108         unsigned int wlen = 0, rlen = 0;
2109 
2110         fd = connect_to_sdog(s, errp);
2111         if (fd < 0) {
2112             ret = fd;
2113             goto out;
2114         }
2115 
2116         memset(&hdr, 0, sizeof(hdr));
2117         hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
2118         hdr.proto_ver = SD_PROTO_VER;
2119 
2120         ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
2121                      NULL, &wlen, &rlen);
2122         closesocket(fd);
2123         if (ret) {
2124             error_setg_errno(errp, -ret, "failed to get cluster default");
2125             goto out;
2126         }
2127         if (rsp->result == SD_RES_SUCCESS) {
2128             s->inode.block_size_shift = rsp->block_size_shift;
2129         } else {
2130             s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
2131         }
2132     }
2133 
2134     max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
2135 
2136     if (s->inode.vdi_size > max_vdi_size) {
2137         error_setg(errp, "An image is too large."
2138                          " The maximum image size is %"PRIu64 "GB",
2139                          max_vdi_size / 1024 / 1024 / 1024);
2140         ret = -EINVAL;
2141         goto out;
2142     }
2143 
2144     ret = do_sd_create(s, &vid, 0, errp);
2145     if (ret) {
2146         goto out;
2147     }
2148 
2149     if (prealloc) {
2150         ret = sd_create_prealloc(opts->location, opts->size, errp);
2151     }
2152 out:
2153     g_free(backing_file);
2154     g_free(buf);
2155     g_free(s->addr);
2156     g_free(s);
2157     return ret;
2158 }
2159 
sd_co_create_opts(const char * filename,QemuOpts * opts,Error ** errp)2160 static int coroutine_fn sd_co_create_opts(const char *filename, QemuOpts *opts,
2161                                           Error **errp)
2162 {
2163     BlockdevCreateOptions *create_options = NULL;
2164     QDict *qdict, *location_qdict;
2165     Visitor *v;
2166     char *redundancy;
2167     Error *local_err = NULL;
2168     int ret;
2169 
2170     redundancy = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
2171 
2172     qdict = qemu_opts_to_qdict(opts, NULL);
2173     qdict_put_str(qdict, "driver", "sheepdog");
2174 
2175     location_qdict = qdict_new();
2176     qdict_put(qdict, "location", location_qdict);
2177 
2178     sd_parse_filename(filename, location_qdict, &local_err);
2179     if (local_err) {
2180         error_propagate(errp, local_err);
2181         ret = -EINVAL;
2182         goto fail;
2183     }
2184 
2185     qdict_flatten(qdict);
2186 
2187     /* Change legacy command line options into QMP ones */
2188     static const QDictRenames opt_renames[] = {
2189         { BLOCK_OPT_BACKING_FILE,       "backing-file" },
2190         { BLOCK_OPT_OBJECT_SIZE,        "object-size" },
2191         { NULL, NULL },
2192     };
2193 
2194     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
2195         ret = -EINVAL;
2196         goto fail;
2197     }
2198 
2199     /* Get the QAPI object */
2200     v = qobject_input_visitor_new_flat_confused(qdict, errp);
2201     if (!v) {
2202         ret = -EINVAL;
2203         goto fail;
2204     }
2205 
2206     visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
2207     visit_free(v);
2208 
2209     if (local_err) {
2210         error_propagate(errp, local_err);
2211         ret = -EINVAL;
2212         goto fail;
2213     }
2214 
2215     assert(create_options->driver == BLOCKDEV_DRIVER_SHEEPDOG);
2216     create_options->u.sheepdog.size =
2217         ROUND_UP(create_options->u.sheepdog.size, BDRV_SECTOR_SIZE);
2218 
2219     if (redundancy) {
2220         create_options->u.sheepdog.has_redundancy = true;
2221         create_options->u.sheepdog.redundancy =
2222             parse_redundancy_str(redundancy);
2223         if (create_options->u.sheepdog.redundancy == NULL) {
2224             error_setg(errp, "Invalid redundancy mode");
2225             ret = -EINVAL;
2226             goto fail;
2227         }
2228     }
2229 
2230     ret = sd_co_create(create_options, errp);
2231 fail:
2232     qapi_free_BlockdevCreateOptions(create_options);
2233     qobject_unref(qdict);
2234     g_free(redundancy);
2235     return ret;
2236 }
2237 
/*
 * Close a Sheepdog VDI.
 *
 * A SD_OP_RELEASE_VDI request for the VDI is sent over a fresh connection.
 * A connection failure or an unexpected result code is reported, but never
 * prevents the local teardown: the fd handler registered for s->fd is
 * removed, s->fd is closed and the server address is freed in every case.
 */
static void sd_close(BlockDriverState *bs)
{
    Error *local_err = NULL;
    BDRVSheepdogState *s = bs->opaque;
    SheepdogVdiReq hdr;
    /* The response is written over the request header */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    unsigned int wlen, rlen = 0;
    int fd, ret;

    trace_sheepdog_close(s->name);

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        /* The release request cannot be sent, but local resources still go */
        goto out;
    }

    memset(&hdr, 0, sizeof(hdr));

    hdr.opcode = SD_OP_RELEASE_VDI;
    hdr.type = LOCK_TYPE_NORMAL;
    hdr.base_vdi_id = s->inode.vdi_id;
    /* The request payload is the VDI name including its NUL terminator */
    wlen = strlen(s->name) + 1;
    hdr.data_length = wlen;
    hdr.flags = SD_FLAG_CMD_WRITE;

    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                 s->name, &wlen, &rlen);

    closesocket(fd);

    if (!ret && rsp->result != SD_RES_SUCCESS &&
        rsp->result != SD_RES_VDI_NOT_LOCKED) {
        error_report("%s, %s", sd_strerror(rsp->result), s->name);
    }

out:
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
                       false, NULL, NULL, NULL, NULL);
    closesocket(s->fd);
    qapi_free_SocketAddress(s->addr);
}
2279 
/* Return the VDI size recorded in the in-memory inode of @bs. */
static int64_t sd_getlength(BlockDriverState *bs)
{
    BDRVSheepdogState *state = bs->opaque;
    int64_t length = state->inode.vdi_size;

    return length;
}
2286 
sd_co_truncate(BlockDriverState * bs,int64_t offset,bool exact,PreallocMode prealloc,Error ** errp)2287 static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset,
2288                                        bool exact, PreallocMode prealloc,
2289                                        Error **errp)
2290 {
2291     BDRVSheepdogState *s = bs->opaque;
2292     int ret, fd;
2293     unsigned int datalen;
2294     uint64_t max_vdi_size;
2295     int64_t old_size = s->inode.vdi_size;
2296 
2297     if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_FULL) {
2298         error_setg(errp, "Unsupported preallocation mode '%s'",
2299                    PreallocMode_str(prealloc));
2300         return -ENOTSUP;
2301     }
2302 
2303     max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
2304     if (offset < old_size) {
2305         error_setg(errp, "shrinking is not supported");
2306         return -EINVAL;
2307     } else if (offset > max_vdi_size) {
2308         error_setg(errp, "too big image size");
2309         return -EINVAL;
2310     }
2311 
2312     fd = connect_to_sdog(s, errp);
2313     if (fd < 0) {
2314         return fd;
2315     }
2316 
2317     /* we don't need to update entire object */
2318     datalen = SD_INODE_HEADER_SIZE;
2319     s->inode.vdi_size = offset;
2320     ret = write_object(fd, s->bs, (char *)&s->inode,
2321                        vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2322                        datalen, 0, false, s->cache_flags);
2323     close(fd);
2324 
2325     if (ret < 0) {
2326         error_setg_errno(errp, -ret, "failed to update an inode");
2327         return ret;
2328     }
2329 
2330     if (prealloc == PREALLOC_MODE_FULL) {
2331         ret = sd_prealloc(bs, old_size, offset, errp);
2332         if (ret < 0) {
2333             return ret;
2334         }
2335     }
2336 
2337     return 0;
2338 }
2339 
2340 /*
2341  * This function is called after writing data objects.  If we need to
2342  * update metadata, this sends a write request to the vdi object.
2343  */
sd_write_done(SheepdogAIOCB * acb)2344 static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
2345 {
2346     BDRVSheepdogState *s = acb->s;
2347     struct iovec iov;
2348     AIOReq *aio_req;
2349     uint32_t offset, data_len, mn, mx;
2350 
2351     mn = acb->min_dirty_data_idx;
2352     mx = acb->max_dirty_data_idx;
2353     if (mn <= mx) {
2354         /* we need to update the vdi object. */
2355         ++acb->nr_pending;
2356         offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
2357             mn * sizeof(s->inode.data_vdi_id[0]);
2358         data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
2359 
2360         acb->min_dirty_data_idx = UINT32_MAX;
2361         acb->max_dirty_data_idx = 0;
2362 
2363         iov.iov_base = &s->inode;
2364         iov.iov_len = sizeof(s->inode);
2365         aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
2366                                 data_len, offset, 0, false, 0, offset);
2367         add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
2368         if (--acb->nr_pending) {
2369             qemu_coroutine_yield();
2370         }
2371     }
2372 }
2373 
2374 /* Delete current working VDI on the snapshot chain */
sd_delete(BDRVSheepdogState * s)2375 static bool sd_delete(BDRVSheepdogState *s)
2376 {
2377     Error *local_err = NULL;
2378     unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
2379     SheepdogVdiReq hdr = {
2380         .opcode = SD_OP_DEL_VDI,
2381         .base_vdi_id = s->inode.vdi_id,
2382         .data_length = wlen,
2383         .flags = SD_FLAG_CMD_WRITE,
2384     };
2385     SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2386     int fd, ret;
2387 
2388     fd = connect_to_sdog(s, &local_err);
2389     if (fd < 0) {
2390         error_report_err(local_err);
2391         return false;
2392     }
2393 
2394     ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
2395                  s->name, &wlen, &rlen);
2396     closesocket(fd);
2397     if (ret) {
2398         return false;
2399     }
2400     switch (rsp->result) {
2401     case SD_RES_NO_VDI:
2402         error_report("%s was already deleted", s->name);
2403         /* fall through */
2404     case SD_RES_SUCCESS:
2405         break;
2406     default:
2407         error_report("%s, %s", sd_strerror(rsp->result), s->name);
2408         return false;
2409     }
2410 
2411     return true;
2412 }
2413 
2414 /*
2415  * Create a writable VDI from a snapshot
2416  */
sd_create_branch(BDRVSheepdogState * s)2417 static int sd_create_branch(BDRVSheepdogState *s)
2418 {
2419     Error *local_err = NULL;
2420     int ret, fd;
2421     uint32_t vid;
2422     char *buf;
2423     bool deleted;
2424 
2425     trace_sheepdog_create_branch_snapshot(s->inode.vdi_id);
2426 
2427     buf = g_malloc(SD_INODE_SIZE);
2428 
2429     /*
2430      * Even If deletion fails, we will just create extra snapshot based on
2431      * the working VDI which was supposed to be deleted. So no need to
2432      * false bail out.
2433      */
2434     deleted = sd_delete(s);
2435     ret = do_sd_create(s, &vid, !deleted, &local_err);
2436     if (ret) {
2437         error_report_err(local_err);
2438         goto out;
2439     }
2440 
2441     trace_sheepdog_create_branch_created(vid);
2442 
2443     fd = connect_to_sdog(s, &local_err);
2444     if (fd < 0) {
2445         error_report_err(local_err);
2446         ret = fd;
2447         goto out;
2448     }
2449 
2450     ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
2451                       s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);
2452 
2453     closesocket(fd);
2454 
2455     if (ret < 0) {
2456         goto out;
2457     }
2458 
2459     memcpy(&s->inode, buf, sizeof(s->inode));
2460 
2461     s->is_snapshot = false;
2462     ret = 0;
2463     trace_sheepdog_create_branch_new(s->inode.vdi_id);
2464 
2465 out:
2466     g_free(buf);
2467 
2468     return ret;
2469 }
2470 
/*
 * Issue the per-object requests that make up one read, write or discard.
 *
 * The range described by @acb (sector_num, nb_sectors) is walked one data
 * object at a time.  For every object a request is created with
 * alloc_aio_req() and handed to add_aio_request(), except where no request
 * is needed: reads of unallocated objects are zero-filled locally, and
 * discards that do not cover a whole allocated object are skipped.
 *
 * If @acb is a write and the VDI is a snapshot, a writable branch is created
 * first; if that fails, acb->ret is set to -EIO and nothing is sent.
 *
 * The function returns nothing.  After issuing the requests it yields the
 * coroutine for as long as acb->nr_pending is non-zero.
 * NOTE(review): presumably the response handler decrements nr_pending,
 * stores the result in acb->ret and re-enters the coroutine -- confirm
 * against the code outside this function.
 */
static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
{
    int ret = 0;
    unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
    unsigned long idx;
    uint32_t object_size;
    uint64_t oid;
    uint64_t offset;
    BDRVSheepdogState *s = acb->s;
    SheepdogInode *inode = &s->inode;
    AIOReq *aio_req;

    if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
        /*
         * In the case we open the snapshot VDI, Sheepdog creates the
         * writable VDI when we do a write operation first.
         */
        ret = sd_create_branch(s);
        if (ret) {
            acb->ret = -EIO;
            return;
        }
    }

    /* Index of the first object touched and the byte offset inside it */
    object_size = (UINT32_C(1) << inode->block_size_shift);
    idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
    offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;

    /*
     * Make sure we don't free the aiocb before we are done with all requests.
     * This additional reference is dropped at the end of this function.
     */
    acb->nr_pending++;

    while (done != total) {
        uint8_t flags = 0;
        uint64_t old_oid = 0;
        bool create = false;

        oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);

        /* Never cross an object boundary with a single request */
        len = MIN(total - done, object_size - offset);

        switch (acb->aiocb_type) {
        case AIOCB_READ_UDATA:
            if (!inode->data_vdi_id[idx]) {
                /* Unallocated object: reads as zeroes, no request needed */
                qemu_iovec_memset(acb->qiov, done, 0, len);
                goto done;
            }
            break;
        case AIOCB_WRITE_UDATA:
            if (!inode->data_vdi_id[idx]) {
                /* First write to this index: the object must be created */
                create = true;
            } else if (!is_data_obj_writable(inode, idx)) {
                /* Copy-On-Write */
                create = true;
                old_oid = oid;
                flags = SD_FLAG_CMD_COW;
            }
            break;
        case AIOCB_DISCARD_OBJ:
            /*
             * We discard the object only when the whole object is
             * 1) allocated 2) trimmed. Otherwise, simply skip it.
             */
            if (len != object_size || inode->data_vdi_id[idx] == 0) {
                goto done;
            }
            break;
        default:
            break;
        }

        if (create) {
            trace_sheepdog_co_rw_vector_update(inode->vdi_id, oid,
                                  vid_to_data_oid(inode->data_vdi_id[idx], idx),
                                  idx);
            /* A newly created object belongs to this VDI */
            oid = vid_to_data_oid(inode->vdi_id, idx);
            trace_sheepdog_co_rw_vector_new(oid);
        }

        /* The last argument is 0 for discards and @done for reads/writes */
        aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
                                old_oid,
                                acb->aiocb_type == AIOCB_DISCARD_OBJ ?
                                0 : done);
        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
                        acb->aiocb_type);
    done:
        /* Only the first object can start at a non-zero offset */
        offset = 0;
        idx++;
        done += len;
    }
    /* Drop the reference taken above; wait if requests are still pending */
    if (--acb->nr_pending) {
        qemu_coroutine_yield();
    }
}
2579 
/*
 * Finish the bookkeeping for @acb: unlink it from its aiocb_siblings list
 * and restart every coroutine waiting in the overlapping queue, all under
 * s->queue_lock.  Nothing is done for flush requests.
 */
static void sd_aio_complete(SheepdogAIOCB *acb)
{
    BDRVSheepdogState *s = acb->s;

    if (acb->aiocb_type != AIOCB_FLUSH_CACHE) {
        qemu_co_mutex_lock(&s->queue_lock);
        QLIST_REMOVE(acb, aiocb_siblings);
        qemu_co_queue_restart_all(&s->overlapping_queue);
        qemu_co_mutex_unlock(&s->queue_lock);
    }
}
2593 
sd_co_writev(BlockDriverState * bs,int64_t sector_num,int nb_sectors,QEMUIOVector * qiov,int flags)2594 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
2595                                      int nb_sectors, QEMUIOVector *qiov,
2596                                      int flags)
2597 {
2598     SheepdogAIOCB acb;
2599     int ret;
2600     int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
2601     BDRVSheepdogState *s = bs->opaque;
2602 
2603     assert(!flags);
2604     if (offset > s->inode.vdi_size) {
2605         ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, NULL);
2606         if (ret < 0) {
2607             return ret;
2608         }
2609     }
2610 
2611     sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
2612     sd_co_rw_vector(&acb);
2613     sd_write_done(&acb);
2614     sd_aio_complete(&acb);
2615 
2616     return acb.ret;
2617 }
2618 
sd_co_readv(BlockDriverState * bs,int64_t sector_num,int nb_sectors,QEMUIOVector * qiov)2619 static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
2620                        int nb_sectors, QEMUIOVector *qiov)
2621 {
2622     SheepdogAIOCB acb;
2623     BDRVSheepdogState *s = bs->opaque;
2624 
2625     sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
2626     sd_co_rw_vector(&acb);
2627     sd_aio_complete(&acb);
2628 
2629     return acb.ret;
2630 }
2631 
sd_co_flush_to_disk(BlockDriverState * bs)2632 static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
2633 {
2634     BDRVSheepdogState *s = bs->opaque;
2635     SheepdogAIOCB acb;
2636     AIOReq *aio_req;
2637 
2638     if (s->cache_flags != SD_FLAG_CMD_CACHE) {
2639         return 0;
2640     }
2641 
2642     sd_aio_setup(&acb, s, NULL, 0, 0, AIOCB_FLUSH_CACHE);
2643 
2644     acb.nr_pending++;
2645     aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
2646                             0, 0, 0, false, 0, 0);
2647     add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
2648 
2649     if (--acb.nr_pending) {
2650         qemu_coroutine_yield();
2651     }
2652 
2653     sd_aio_complete(&acb);
2654     return acb.ret;
2655 }
2656 
/*
 * Take a snapshot of the VDI of @bs using the metadata in @sn_info.
 *
 * The VM state size, VM clock and snapshot name from @sn_info are stored in
 * the in-memory inode, whose header is then written to the server.  After
 * that a new VDI is created with do_sd_create() and the header of its inode
 * is read back into the in-memory inode.
 *
 * Snapshots of a snapshot VDI are rejected with -EINVAL.  Errors are
 * reported with error_report().
 *
 * Returns a negative error code on failure; on success the return value of
 * the final read_object() call is passed through.
 */
static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
{
    Error *local_err = NULL;
    BDRVSheepdogState *s = bs->opaque;
    int ret, fd;
    uint32_t new_vid;
    SheepdogInode *inode;
    unsigned int datalen;

    trace_sheepdog_snapshot_create_info(sn_info->name, sn_info->id_str, s->name,
                                        sn_info->vm_state_size, s->is_snapshot);

    if (s->is_snapshot) {
        error_report("You can't create a snapshot of a snapshot VDI, "
                     "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);

        return -EINVAL;
    }

    trace_sheepdog_snapshot_create(sn_info->name, sn_info->id_str);

    s->inode.vm_state_size = sn_info->vm_state_size;
    s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
    /* It appears that inode.tag does not require a NUL terminator,
     * which means this use of strncpy is ok.
     */
    strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
    /* we don't need to update entire object */
    datalen = SD_INODE_HEADER_SIZE;
    inode = g_malloc(datalen);

    /* refresh inode. */
    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        ret = fd;
        /* There is no socket to close on this path */
        goto out_free;
    }

    ret = write_object(fd, s->bs, (char *)&s->inode,
                       vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                       datalen, 0, false, s->cache_flags);
    if (ret < 0) {
        error_report("failed to write snapshot's inode.");
        goto out_close;
    }

    ret = do_sd_create(s, &new_vid, 1, &local_err);
    if (ret < 0) {
        error_reportf_err(local_err,
                          "failed to create inode for snapshot: ");
        goto out_close;
    }

    ret = read_object(fd, s->bs, (char *)inode,
                      vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
                      s->cache_flags);

    if (ret < 0) {
        /* Report the code that was returned rather than a stale errno */
        error_report("failed to read new inode info. %s", strerror(-ret));
        goto out_close;
    }

    memcpy(&s->inode, inode, datalen);
    trace_sheepdog_snapshot_create_inode(s->inode.name, s->inode.snap_id,
                                         s->inode.vdi_id);

out_close:
    closesocket(fd);
out_free:
    g_free(inode);
    return ret;
}
2729 
2730 /*
2731  * We implement rollback(loadvm) operation to the specified snapshot by
2732  * 1) switch to the snapshot
2733  * 2) rely on sd_create_branch to delete working VDI and
2734  * 3) create a new working VDI based on the specified snapshot
2735  */
sd_snapshot_goto(BlockDriverState * bs,const char * snapshot_id)2736 static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
2737 {
2738     BDRVSheepdogState *s = bs->opaque;
2739     BDRVSheepdogState *old_s;
2740     char tag[SD_MAX_VDI_TAG_LEN];
2741     uint32_t snapid = 0;
2742     int ret;
2743 
2744     if (!sd_parse_snapid_or_tag(snapshot_id, &snapid, tag)) {
2745         return -EINVAL;
2746     }
2747 
2748     old_s = g_new(BDRVSheepdogState, 1);
2749 
2750     memcpy(old_s, s, sizeof(BDRVSheepdogState));
2751 
2752     ret = reload_inode(s, snapid, tag);
2753     if (ret) {
2754         goto out;
2755     }
2756 
2757     ret = sd_create_branch(s);
2758     if (ret) {
2759         goto out;
2760     }
2761 
2762     g_free(old_s);
2763 
2764     return 0;
2765 out:
2766     /* recover bdrv_sd_state */
2767     memcpy(s, old_s, sizeof(BDRVSheepdogState));
2768     g_free(old_s);
2769 
2770     error_report("failed to open. recover old bdrv_sd_state.");
2771 
2772     return ret;
2773 }
2774 
2775 #define NR_BATCHED_DISCARD 128
2776 
remove_objects(BDRVSheepdogState * s,Error ** errp)2777 static int remove_objects(BDRVSheepdogState *s, Error **errp)
2778 {
2779     int fd, i = 0, nr_objs = 0;
2780     int ret;
2781     SheepdogInode *inode = &s->inode;
2782 
2783     fd = connect_to_sdog(s, errp);
2784     if (fd < 0) {
2785         return fd;
2786     }
2787 
2788     nr_objs = count_data_objs(inode);
2789     while (i < nr_objs) {
2790         int start_idx, nr_filled_idx;
2791 
2792         while (i < nr_objs && !inode->data_vdi_id[i]) {
2793             i++;
2794         }
2795         start_idx = i;
2796 
2797         nr_filled_idx = 0;
2798         while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) {
2799             if (inode->data_vdi_id[i]) {
2800                 inode->data_vdi_id[i] = 0;
2801                 nr_filled_idx++;
2802             }
2803 
2804             i++;
2805         }
2806 
2807         ret = write_object(fd, s->bs,
2808                            (char *)&inode->data_vdi_id[start_idx],
2809                            vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
2810                            (i - start_idx) * sizeof(uint32_t),
2811                            offsetof(struct SheepdogInode,
2812                                     data_vdi_id[start_idx]),
2813                            false, s->cache_flags);
2814         if (ret < 0) {
2815             error_setg(errp, "Failed to discard snapshot inode");
2816             goto out;
2817         }
2818     }
2819 
2820     ret = 0;
2821 out:
2822     closesocket(fd);
2823     return ret;
2824 }
2825 
/*
 * Delete the snapshot identified by @snapshot_id.
 *
 * @snapshot_id is parsed as a decimal number with qemu_strtoul().  A
 * non-zero value is used as the snapshot id; a value of zero makes the
 * function use the @snapshot_id string itself as the snapshot tag.  @name is
 * not used (see the FIXMEs below).
 *
 * Before anything else, the data object table of the in-memory inode is
 * cleared with remove_objects().  The snapshot is then looked up with
 * find_vdi_name() and a SD_OP_DEL_VDI request carrying the VDI name and the
 * tag is sent to the server.
 *
 * Returns 0 on success, -EINVAL for an unparsable or too large id, -ENOENT
 * if the server does not know the snapshot, -EIO for any other server
 * result, or the error code of a failing helper.  @errp is set on failure.
 */
static int sd_snapshot_delete(BlockDriverState *bs,
                              const char *snapshot_id,
                              const char *name,
                              Error **errp)
{
    /*
     * FIXME should delete the snapshot matching both @snapshot_id and
     * @name, but @name not used here
     */
    unsigned long snap_id = 0;
    char snap_tag[SD_MAX_VDI_TAG_LEN];
    int fd, ret;
    /* Request payload: the VDI name followed by the snapshot tag */
    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
    BDRVSheepdogState *s = bs->opaque;
    unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0;
    uint32_t vid;
    SheepdogVdiReq hdr = {
        .opcode = SD_OP_DEL_VDI,
        .data_length = wlen,
        .flags = SD_FLAG_CMD_WRITE,
    };
    /* The response is written over the request header */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;

    ret = remove_objects(s, errp);
    if (ret) {
        return ret;
    }

    memset(buf, 0, sizeof(buf));
    memset(snap_tag, 0, sizeof(snap_tag));
    pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
    /* TODO Use sd_parse_snapid() once this mess is cleaned up */
    ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
    if (ret || snap_id > UINT32_MAX) {
        /*
         * FIXME Since qemu_strtoul() returns -EINVAL when
         * @snapshot_id is null, @snapshot_id is mandatory.  Correct
         * would be to require at least one of @snapshot_id and @name.
         */
        error_setg(errp, "Invalid snapshot ID: %s",
                         snapshot_id ? snapshot_id : "<null>");
        return -EINVAL;
    }

    if (snap_id) {
        hdr.snapid = (uint32_t) snap_id;
    } else {
        /* FIXME I suspect we should use @name here */
        /* FIXME don't truncate silently */
        pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
        pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
    }

    /* Look the snapshot up first; any failure ends the deletion here */
    ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, errp);
    if (ret) {
        return ret;
    }

    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        return fd;
    }

    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                 buf, &wlen, &rlen);
    closesocket(fd);
    if (ret) {
        error_setg_errno(errp, -ret, "Couldn't send request to server");
        return ret;
    }

    /* Translate the server's result code into an errno value */
    switch (rsp->result) {
    case SD_RES_NO_VDI:
        error_setg(errp, "Can't find the snapshot");
        return -ENOENT;
    case SD_RES_SUCCESS:
        break;
    default:
        error_setg(errp, "%s", sd_strerror(rsp->result));
        return -EIO;
    }

    return 0;
}
2910 
/*
 * Build the list of snapshots that belong to the VDI behind @bs.
 *
 * The in-use VDI bitmap is fetched with SD_OP_READ_VDIS.  Starting at the
 * slot derived from the FNV-1a hash of the VDI name, consecutive in-use
 * slots are probed (wrapping at SD_NR_VDIS) until an unused slot is hit or
 * 1024 snapshots have been collected.  Inodes that cannot be read are
 * skipped.
 *
 * On success the number of snapshots found is returned and *@psn_tab
 * points to a g_new0()-allocated table of 1024 entries.  On failure a
 * negative value is returned and *@psn_tab is set to NULL.
 */
static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
{
    Error *local_err = NULL;
    BDRVSheepdogState *s = bs->opaque;
    SheepdogReq req;
    int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
    QEMUSnapshotInfo *sn_tab = NULL;
    unsigned wlen, rlen;
    int found = 0;
    SheepdogInode *inode;
    unsigned long *vdi_inuse;
    unsigned int start_nr;
    uint64_t hval;
    uint32_t vid;

    vdi_inuse = g_malloc(max);
    inode = g_malloc(SD_INODE_HEADER_SIZE);

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        ret = fd;
        goto out;
    }

    rlen = max;
    wlen = 0;

    memset(&req, 0, sizeof(req));

    req.opcode = SD_OP_READ_VDIS;
    req.data_length = max;

    ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);

    closesocket(fd);
    if (ret) {
        goto out;
    }

    sn_tab = g_new0(QEMUSnapshotInfo, nr);

    /* calculate a vdi id with hash function */
    hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
    start_nr = hval & (SD_NR_VDIS - 1);

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        ret = fd;
        goto out;
    }

    for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
        if (!test_bit(vid, vdi_inuse)) {
            break;
        }

        /*
         * we don't need to read entire object
         *
         * An unreadable inode is skipped on purpose, so its status must
         * not be stored in 'ret': otherwise a failure on the last probed
         * slot would turn the whole listing into an error.
         */
        if (read_object(fd, s->bs, (char *)inode,
                        vid_to_vdi_oid(vid),
                        0, SD_INODE_HEADER_SIZE, 0,
                        s->cache_flags)) {
            continue;
        }

        if (!strcmp(inode->name, s->name) && is_snapshot(inode)) {
            /* snap_ctime carries seconds in the high and nsec in the low half */
            sn_tab[found].date_sec = inode->snap_ctime >> 32;
            sn_tab[found].date_nsec = inode->snap_ctime & 0xffffffff;
            sn_tab[found].vm_state_size = inode->vm_state_size;
            sn_tab[found].vm_clock_nsec = inode->vm_clock_nsec;

            snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
                     "%" PRIu32, inode->snap_id);
            pstrcpy(sn_tab[found].name,
                    MIN(sizeof(sn_tab[found].name), sizeof(inode->tag)),
                    inode->tag);
            found++;
        }
    }

    closesocket(fd);
out:
    g_free(vdi_inuse);
    g_free(inode);

    if (ret < 0) {
        /* Do not hand a table to the caller together with an error code */
        g_free(sn_tab);
        *psn_tab = NULL;
        return ret;
    }

    *psn_tab = sn_tab;
    return found;
}
3007 
/*
 * Transfer @size bytes of VM state between @data and the vmstate objects,
 * starting at byte position @pos of the vmstate area.
 *
 * When @load is non-zero the data is read from the objects of
 * s->inode.parent_vdi_id into @data; otherwise @data is written to the
 * objects of s->inode.vdi_id.  The transfer is split so that no single
 * request crosses an object boundary.
 *
 * Returns @size on success.  On failure returns the negative value from
 * connect_to_sdog(), read_object() or write_object().
 */
static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
                                int64_t pos, int size, int load)
{
    Error *local_err = NULL;
    bool create;
    int fd, ret = 0, remaining = size;
    unsigned int data_len;
    uint64_t vmstate_oid;
    uint64_t offset;
    uint32_t vdi_index;
    uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
    uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        return fd;
    }

    while (remaining) {
        vdi_index = pos / object_size;
        offset = pos % object_size;

        /* Stop at the end of the current object */
        data_len = MIN(remaining, object_size - offset);

        vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);

        /* Passed as write_object()'s create flag: set at an object's start */
        create = (offset == 0);
        if (load) {
            ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
                              s->inode.nr_copies, data_len, offset,
                              s->cache_flags);
        } else {
            ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
                               s->inode.nr_copies, data_len, offset, create,
                               s->cache_flags);
        }

        if (ret < 0) {
            /*
             * Name the operation that actually failed, and describe the
             * helper's own result instead of the global errno, which may
             * be stale by now.
             * NOTE(review): assumes read_object()/write_object() return a
             * negated errno value -- confirm against their definitions.
             */
            error_report("failed to %s vmstate %s",
                         load ? "load" : "save", strerror(-ret));
            goto cleanup;
        }

        pos += data_len;
        data += data_len;
        remaining -= data_len;
    }
    ret = size;
cleanup:
    closesocket(fd);
    return ret;
}
3060 
/*
 * Write the VM state held in @qiov at vmstate position @pos.
 *
 * The vector is flattened into a temporary aligned buffer, which is then
 * passed to do_load_save_vmstate() in save mode.  Returns that function's
 * result.
 */
static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                           int64_t pos)
{
    BDRVSheepdogState *s = bs->opaque;
    void *bounce = qemu_blockalign(bs, qiov->size);
    int result;

    qemu_iovec_to_buf(qiov, 0, bounce, qiov->size);
    result = do_load_save_vmstate(s, (uint8_t *) bounce, pos, qiov->size, 0);

    qemu_vfree(bounce);
    return result;
}
3075 
/*
 * Read VM state from vmstate position @pos into @qiov.
 *
 * The data is loaded into a temporary aligned buffer by
 * do_load_save_vmstate() in load mode and copied into @qiov only when the
 * load succeeded.  Returns do_load_save_vmstate()'s result.
 */
static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                           int64_t pos)
{
    BDRVSheepdogState *s = bs->opaque;
    void *buf;
    int ret;

    buf = qemu_blockalign(bs, qiov->size);
    ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
    if (ret >= 0) {
        /*
         * On failure the buffer is (partly) uninitialized; do not expose
         * those bytes through the caller's vector.
         */
        qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
    }
    qemu_vfree(buf);

    return ret;
}
3090 
3091 
sd_co_pdiscard(BlockDriverState * bs,int64_t offset,int bytes)3092 static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
3093                                       int bytes)
3094 {
3095     SheepdogAIOCB acb;
3096     BDRVSheepdogState *s = bs->opaque;
3097     QEMUIOVector discard_iov;
3098     struct iovec iov;
3099     uint32_t zero = 0;
3100 
3101     if (!s->discard_supported) {
3102         return 0;
3103     }
3104 
3105     memset(&discard_iov, 0, sizeof(discard_iov));
3106     memset(&iov, 0, sizeof(iov));
3107     iov.iov_base = &zero;
3108     iov.iov_len = sizeof(zero);
3109     discard_iov.iov = &iov;
3110     discard_iov.niov = 1;
3111     if (!QEMU_IS_ALIGNED(offset | bytes, BDRV_SECTOR_SIZE)) {
3112         return -ENOTSUP;
3113     }
3114     sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
3115                  bytes >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
3116     sd_co_rw_vector(&acb);
3117     sd_aio_complete(&acb);
3118 
3119     return acb.ret;
3120 }
3121 
3122 static coroutine_fn int
sd_co_block_status(BlockDriverState * bs,bool want_zero,int64_t offset,int64_t bytes,int64_t * pnum,int64_t * map,BlockDriverState ** file)3123 sd_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
3124                    int64_t bytes, int64_t *pnum, int64_t *map,
3125                    BlockDriverState **file)
3126 {
3127     BDRVSheepdogState *s = bs->opaque;
3128     SheepdogInode *inode = &s->inode;
3129     uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
3130     unsigned long start = offset / object_size,
3131                   end = DIV_ROUND_UP(offset + bytes, object_size);
3132     unsigned long idx;
3133     *map = offset;
3134     int ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
3135 
3136     for (idx = start; idx < end; idx++) {
3137         if (inode->data_vdi_id[idx] == 0) {
3138             break;
3139         }
3140     }
3141     if (idx == start) {
3142         /* Get the longest length of unallocated sectors */
3143         ret = 0;
3144         for (idx = start + 1; idx < end; idx++) {
3145             if (inode->data_vdi_id[idx] != 0) {
3146                 break;
3147             }
3148         }
3149     }
3150 
3151     *pnum = (idx - start) * object_size;
3152     if (*pnum > bytes) {
3153         *pnum = bytes;
3154     }
3155     if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
3156         *file = bs;
3157     }
3158     return ret;
3159 }
3160 
/*
 * Return the number of bytes occupied by allocated data objects.
 *
 * Every object index below DIV_ROUND_UP(vdi_size, object size) whose
 * inode->data_vdi_id[] entry is non-zero contributes one full object size
 * to the total.
 */
static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogInode *inode = &s->inode;
    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
    unsigned long nr_objs = DIV_ROUND_UP(inode->vdi_size, object_size);
    unsigned long obj;
    uint64_t allocated = 0;

    for (obj = 0; obj < nr_objs; obj++) {
        if (inode->data_vdi_id[obj] != 0) {
            allocated += object_size;
        }
    }

    return allocated;
}
3177 
/*
 * Option list used when creating a sheepdog image.  It is installed as
 * .create_opts on each sheepdog BlockDriver definition in this file.
 */
static QemuOptsList sd_create_opts = {
    .name = "sheepdog-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, full)"
        },
        {
            .name = BLOCK_OPT_REDUNDANCY,
            .type = QEMU_OPT_STRING,
            .help = "Redundancy of the image"
        },
        {
            .name = BLOCK_OPT_OBJECT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Object size of the image"
        },
        { /* end of list */ }
    }
};
3210 
/*
 * NULL-terminated list of option names installed as .strong_runtime_opts
 * on each sheepdog BlockDriver definition in this file.
 * NOTE(review): the trailing dot in "server." presumably selects every
 * option below that prefix -- confirm against the block layer.
 */
static const char *const sd_strong_runtime_opts[] = {
    "vdi",
    "snap-id",
    "tag",
    "server.",

    NULL
};
3219 
/*
 * Driver definition registered under the protocol name "sheepdog".
 * The three sheepdog BlockDriver definitions in this file install the
 * same callbacks and differ only in .protocol_name.
 */
static BlockDriver bdrv_sheepdog = {
    .format_name                  = "sheepdog",
    .protocol_name                = "sheepdog",
    .instance_size                = sizeof(BDRVSheepdogState),
    .bdrv_parse_filename          = sd_parse_filename,
    .bdrv_file_open               = sd_open,
    .bdrv_reopen_prepare          = sd_reopen_prepare,
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
    .bdrv_co_create               = sd_co_create,
    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_has_zero_init_truncate  = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
    .bdrv_co_truncate             = sd_co_truncate,

    /* Data path */
    .bdrv_co_readv                = sd_co_readv,
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
    .bdrv_co_block_status         = sd_co_block_status,

    /* Snapshot management */
    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
    .bdrv_snapshot_delete         = sd_snapshot_delete,
    .bdrv_snapshot_list           = sd_snapshot_list,

    /* VM state storage */
    .bdrv_save_vmstate            = sd_save_vmstate,
    .bdrv_load_vmstate            = sd_load_vmstate,

    .bdrv_detach_aio_context      = sd_detach_aio_context,
    .bdrv_attach_aio_context      = sd_attach_aio_context,

    .create_opts                  = &sd_create_opts,
    .strong_runtime_opts          = sd_strong_runtime_opts,
};
3258 
/*
 * Driver definition registered under the protocol name "sheepdog+tcp".
 * The three sheepdog BlockDriver definitions in this file install the
 * same callbacks and differ only in .protocol_name.
 */
static BlockDriver bdrv_sheepdog_tcp = {
    .format_name                  = "sheepdog",
    .protocol_name                = "sheepdog+tcp",
    .instance_size                = sizeof(BDRVSheepdogState),
    .bdrv_parse_filename          = sd_parse_filename,
    .bdrv_file_open               = sd_open,
    .bdrv_reopen_prepare          = sd_reopen_prepare,
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
    .bdrv_co_create               = sd_co_create,
    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_has_zero_init_truncate  = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
    .bdrv_co_truncate             = sd_co_truncate,

    /* Data path */
    .bdrv_co_readv                = sd_co_readv,
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
    .bdrv_co_block_status         = sd_co_block_status,

    /* Snapshot management */
    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
    .bdrv_snapshot_delete         = sd_snapshot_delete,
    .bdrv_snapshot_list           = sd_snapshot_list,

    /* VM state storage */
    .bdrv_save_vmstate            = sd_save_vmstate,
    .bdrv_load_vmstate            = sd_load_vmstate,

    .bdrv_detach_aio_context      = sd_detach_aio_context,
    .bdrv_attach_aio_context      = sd_attach_aio_context,

    .create_opts                  = &sd_create_opts,
    .strong_runtime_opts          = sd_strong_runtime_opts,
};
3297 
/*
 * Driver definition registered under the protocol name "sheepdog+unix".
 * The three sheepdog BlockDriver definitions in this file install the
 * same callbacks and differ only in .protocol_name.
 */
static BlockDriver bdrv_sheepdog_unix = {
    .format_name                  = "sheepdog",
    .protocol_name                = "sheepdog+unix",
    .instance_size                = sizeof(BDRVSheepdogState),
    .bdrv_parse_filename          = sd_parse_filename,
    .bdrv_file_open               = sd_open,
    .bdrv_reopen_prepare          = sd_reopen_prepare,
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
    .bdrv_co_create               = sd_co_create,
    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_has_zero_init_truncate  = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
    .bdrv_co_truncate             = sd_co_truncate,

    /* Data path */
    .bdrv_co_readv                = sd_co_readv,
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
    .bdrv_co_block_status         = sd_co_block_status,

    /* Snapshot management */
    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
    .bdrv_snapshot_delete         = sd_snapshot_delete,
    .bdrv_snapshot_list           = sd_snapshot_list,

    /* VM state storage */
    .bdrv_save_vmstate            = sd_save_vmstate,
    .bdrv_load_vmstate            = sd_load_vmstate,

    .bdrv_detach_aio_context      = sd_detach_aio_context,
    .bdrv_attach_aio_context      = sd_attach_aio_context,

    .create_opts                  = &sd_create_opts,
    .strong_runtime_opts          = sd_strong_runtime_opts,
};
3336 
bdrv_sheepdog_init(void)3337 static void bdrv_sheepdog_init(void)
3338 {
3339     bdrv_register(&bdrv_sheepdog);
3340     bdrv_register(&bdrv_sheepdog_tcp);
3341     bdrv_register(&bdrv_sheepdog_unix);
3342 }
3343 block_init(bdrv_sheepdog_init);
3344