1 /*
2  * virtio-fs glue for FUSE
3  * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
4  *
5  * Authors:
6  *   Dave Gilbert  <dgilbert@redhat.com>
7  *
8  * Implements the glue between libfuse and libvhost-user
9  *
10  * This program can be distributed under the terms of the GNU LGPLv2.
11  * See the file COPYING.LIB
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qemu/iov.h"
16 #include "qapi/error.h"
17 #include "fuse_i.h"
18 #include "standard-headers/linux/fuse.h"
19 #include "fuse_misc.h"
20 #include "fuse_opt.h"
21 #include "fuse_virtio.h"
22 
23 #include <sys/eventfd.h>
24 #include <sys/socket.h>
25 #include <sys/un.h>
26 #include <grp.h>
27 
28 #include "libvhost-user.h"
29 
/* Defined further down; only a pointer to it is needed here. */
struct fv_VuDev;

/*
 * Per-virtqueue state.  An instance is allocated when a queue is started
 * and is passed to fv_queue_thread() as its argument.
 */
struct fv_QueueInfo {
    /* Thread running fv_queue_thread() for this queue */
    pthread_t thread;

    /*
     * Serialises access to the VuVirtq so that fv_queue_thread() and
     * fv_queue_worker() do not race on it.
     */
    pthread_mutex_t vq_lock;

    /* Back-pointer to the device this queue belongs to */
    struct fv_VuDev *virtio_dev;

    /* Queue index; identical to this entry's position in fv_VuDev.qi[] */
    int qidx;

    /* Eventfd the queue thread polls for kicks */
    int kick_fd;

    /* Eventfd written to ask the queue thread to exit */
    int kill_fd;
};
46 
47 /* A FUSE request */
48 typedef struct {
49     VuVirtqElement elem;
50     struct fuse_chan ch;
51 
52     /* Used to complete requests that involve no reply */
53     bool reply_sent;
54 } FVRequest;
55 
56 /*
57  * We pass the dev element into libvhost-user
58  * and then use it to get back to the outer
59  * container for other data.
60  */
61 struct fv_VuDev {
62     VuDev dev;
63     struct fuse_session *se;
64 
65     /*
66      * Either handle virtqueues or vhost-user protocol messages.  Don't do
67      * both at the same time since that could lead to race conditions if
68      * virtqueues or memory tables change while another thread is accessing
69      * them.
70      *
71      * The assumptions are:
72      * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev.
73      * 2. virtio_loop() reads/writes virtqueues and VuDev.
74      */
75     pthread_rwlock_t vu_dispatch_rwlock;
76 
77     /*
78      * The following pair of fields are only accessed in the main
79      * virtio_loop
80      */
81     size_t nqueues;
82     struct fv_QueueInfo **qi;
83 };
84 
/* Device configuration layout, as given by the spec */
struct virtio_fs_config {
    /* Fixed-size tag field (36 bytes) */
    char tag[36];
    /* Number of queues */
    uint32_t num_queues;
};
90 
91 /* Callback from libvhost-user */
fv_get_features(VuDev * dev)92 static uint64_t fv_get_features(VuDev *dev)
93 {
94     return 1ULL << VIRTIO_F_VERSION_1;
95 }
96 
97 /* Callback from libvhost-user */
fv_set_features(VuDev * dev,uint64_t features)98 static void fv_set_features(VuDev *dev, uint64_t features)
99 {
100 }
101 
102 /*
103  * Callback from libvhost-user if there's a new fd we're supposed to listen
104  * to, typically a queue kick?
105  */
fv_set_watch(VuDev * dev,int fd,int condition,vu_watch_cb cb,void * data)106 static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb,
107                          void *data)
108 {
109     fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
110 }
111 
112 /*
113  * Callback from libvhost-user if we're no longer supposed to listen on an fd
114  */
fv_remove_watch(VuDev * dev,int fd)115 static void fv_remove_watch(VuDev *dev, int fd)
116 {
117     fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
118 }
119 
120 /* Callback from libvhost-user to panic */
fv_panic(VuDev * dev,const char * err)121 static void fv_panic(VuDev *dev, const char *err)
122 {
123     fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err);
124     /* TODO: Allow reconnects?? */
125     exit(EXIT_FAILURE);
126 }
127 
128 /*
129  * Copy from an iovec into a fuse_buf (memory only)
130  * Caller must ensure there is space
131  */
copy_from_iov(struct fuse_buf * buf,size_t out_num,const struct iovec * out_sg,size_t max)132 static size_t copy_from_iov(struct fuse_buf *buf, size_t out_num,
133                             const struct iovec *out_sg,
134                             size_t max)
135 {
136     void *dest = buf->mem;
137     size_t copied = 0;
138 
139     while (out_num && max) {
140         size_t onelen = out_sg->iov_len;
141         onelen = MIN(onelen, max);
142         memcpy(dest, out_sg->iov_base, onelen);
143         dest += onelen;
144         copied += onelen;
145         out_sg++;
146         out_num--;
147         max -= onelen;
148     }
149 
150     return copied;
151 }
152 
/*
 * Locate the position that lies 'skip' bytes into a scatter list.
 *
 * sg, sg_size: the scatter list and its number of entries
 * skip:        number of bytes to step over
 * sg_1stindex: out - index of the first entry that still holds data
 * sg_1stskip:  out - number of bytes to step over inside that entry
 *
 * Entries whose length fits within the bytes still to skip are consumed
 * whole.  If the list is used up, *sg_1stindex is set to sg_size and
 * *sg_1stskip to 0.
 *
 * Returns true if the list holds at least 'skip' bytes.
 */
static bool skip_iov(const struct iovec *sg, size_t sg_size,
                     size_t skip,
                     size_t *sg_1stindex, size_t *sg_1stskip)
{
    size_t idx = 0;
    size_t remaining = skip;

    /* Swallow every entry that lies entirely inside the skipped region */
    while (idx < sg_size && sg[idx].iov_len <= remaining) {
        remaining -= sg[idx].iov_len;
        idx++;
    }

    if (idx < sg_size) {
        /* Landed inside (or at the very start of) entry 'idx' */
        *sg_1stskip = remaining;
        *sg_1stindex = idx;
        return true;
    }

    /* Ran off the end: fine only if nothing was left to skip */
    *sg_1stindex = idx;
    *sg_1stskip = 0;
    return remaining == 0;
}
182 
/*
 * Copy 'to_copy' bytes from one scatter list into another.
 *
 * src_iov, src_count: source list and its number of entries
 * dst_iov, dst_count: destination list and its number of entries
 * to_copy:            total number of bytes to transfer
 *
 * Entry boundaries of the two lists need not line up.  The caller must
 * have checked that both lists are large enough; running out of entries
 * trips an assertion.
 */
static void copy_iov(struct iovec *src_iov, int src_count,
                     struct iovec *dst_iov, int dst_count, size_t to_copy)
{
    /* Bytes already filled in the current destination entry */
    size_t dst_used = 0;

    /* One pass per source entry */
    for (; to_copy > 0; src_iov++, src_count--) {
        size_t src_pos = 0;
        size_t src_left;

        assert(src_count);
        src_left = src_iov->iov_len < to_copy ? src_iov->iov_len : to_copy;

        /* Spread this source entry over as many dst entries as needed */
        while (src_left > 0) {
            size_t room;
            size_t chunk;

            assert(dst_count);
            room = dst_iov->iov_len - dst_used;
            chunk = room < src_left ? room : src_left;

            memcpy((char *)dst_iov->iov_base + dst_used,
                   (char *)src_iov->iov_base + src_pos, chunk);
            src_left -= chunk;
            to_copy -= chunk;
            src_pos += chunk;
            dst_used += chunk;

            assert(dst_used <= dst_iov->iov_len);
            if (dst_used == dst_iov->iov_len) {
                /* Current destination entry is full; move to the next */
                dst_iov++;
                dst_count--;
                dst_used = 0;
            }
        }
    }
}
226 
227 /*
228  * pthread_rwlock_rdlock() and pthread_rwlock_wrlock can fail if
229  * a deadlock condition is detected or the current thread already
230  * owns the lock. They can also fail, like pthread_rwlock_unlock(),
231  * if the mutex wasn't properly initialized. None of these are ever
232  * expected to happen.
233  */
vu_dispatch_rdlock(struct fv_VuDev * vud)234 static void vu_dispatch_rdlock(struct fv_VuDev *vud)
235 {
236     int ret = pthread_rwlock_rdlock(&vud->vu_dispatch_rwlock);
237     assert(ret == 0);
238 }
239 
vu_dispatch_wrlock(struct fv_VuDev * vud)240 static void vu_dispatch_wrlock(struct fv_VuDev *vud)
241 {
242     int ret = pthread_rwlock_wrlock(&vud->vu_dispatch_rwlock);
243     assert(ret == 0);
244 }
245 
vu_dispatch_unlock(struct fv_VuDev * vud)246 static void vu_dispatch_unlock(struct fv_VuDev *vud)
247 {
248     int ret = pthread_rwlock_unlock(&vud->vu_dispatch_rwlock);
249     assert(ret == 0);
250 }
251 
252 /*
253  * Called back by ll whenever it wants to send a reply/message back
254  * The 1st element of the iov starts with the fuse_out_header
255  * 'unique'==0 means it's a notify message.
256  */
virtio_send_msg(struct fuse_session * se,struct fuse_chan * ch,struct iovec * iov,int count)257 int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
258                     struct iovec *iov, int count)
259 {
260     FVRequest *req = container_of(ch, FVRequest, ch);
261     struct fv_QueueInfo *qi = ch->qi;
262     VuDev *dev = &se->virtio_dev->dev;
263     VuVirtq *q = vu_get_queue(dev, qi->qidx);
264     VuVirtqElement *elem = &req->elem;
265     int ret = 0;
266 
267     assert(count >= 1);
268     assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
269 
270     struct fuse_out_header *out = iov[0].iov_base;
271     /* TODO: Endianness! */
272 
273     size_t tosend_len = iov_size(iov, count);
274 
275     /* unique == 0 is notification, which we don't support */
276     assert(out->unique);
277     assert(!req->reply_sent);
278 
279     /* The 'in' part of the elem is to qemu */
280     unsigned int in_num = elem->in_num;
281     struct iovec *in_sg = elem->in_sg;
282     size_t in_len = iov_size(in_sg, in_num);
283     fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
284              __func__, elem->index, in_num, in_len);
285 
286     /*
287      * The elem should have room for a 'fuse_out_header' (out from fuse)
288      * plus the data based on the len in the header.
289      */
290     if (in_len < sizeof(struct fuse_out_header)) {
291         fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
292                  __func__, elem->index);
293         ret = -E2BIG;
294         goto err;
295     }
296     if (in_len < tosend_len) {
297         fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
298                  __func__, elem->index, tosend_len);
299         ret = -E2BIG;
300         goto err;
301     }
302 
303     copy_iov(iov, count, in_sg, in_num, tosend_len);
304 
305     vu_dispatch_rdlock(qi->virtio_dev);
306     pthread_mutex_lock(&qi->vq_lock);
307     vu_queue_push(dev, q, elem, tosend_len);
308     vu_queue_notify(dev, q);
309     pthread_mutex_unlock(&qi->vq_lock);
310     vu_dispatch_unlock(qi->virtio_dev);
311 
312     req->reply_sent = true;
313 
314 err:
315     return ret;
316 }
317 
318 /*
319  * Callback from fuse_send_data_iov_* when it's virtio and the buffer
320  * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
321  * We need send the iov and then the buffer.
322  * Return 0 on success
323  */
virtio_send_data_iov(struct fuse_session * se,struct fuse_chan * ch,struct iovec * iov,int count,struct fuse_bufvec * buf,size_t len)324 int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
325                          struct iovec *iov, int count, struct fuse_bufvec *buf,
326                          size_t len)
327 {
328     FVRequest *req = container_of(ch, FVRequest, ch);
329     struct fv_QueueInfo *qi = ch->qi;
330     VuDev *dev = &se->virtio_dev->dev;
331     VuVirtq *q = vu_get_queue(dev, qi->qidx);
332     VuVirtqElement *elem = &req->elem;
333     int ret = 0;
334     g_autofree struct iovec *in_sg_cpy = NULL;
335 
336     assert(count >= 1);
337     assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
338 
339     struct fuse_out_header *out = iov[0].iov_base;
340     /* TODO: Endianness! */
341 
342     size_t iov_len = iov_size(iov, count);
343     size_t tosend_len = iov_len + len;
344 
345     out->len = tosend_len;
346 
347     fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
348              count, len, iov_len);
349 
350     /* unique == 0 is notification which we don't support */
351     assert(out->unique);
352 
353     assert(!req->reply_sent);
354 
355     /* The 'in' part of the elem is to qemu */
356     unsigned int in_num = elem->in_num;
357     struct iovec *in_sg = elem->in_sg;
358     size_t in_len = iov_size(in_sg, in_num);
359     fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
360              __func__, elem->index, in_num, in_len);
361 
362     /*
363      * The elem should have room for a 'fuse_out_header' (out from fuse)
364      * plus the data based on the len in the header.
365      */
366     if (in_len < sizeof(struct fuse_out_header)) {
367         fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
368                  __func__, elem->index);
369         return E2BIG;
370     }
371     if (in_len < tosend_len) {
372         fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
373                  __func__, elem->index, tosend_len);
374         return E2BIG;
375     }
376 
377     /* TODO: Limit to 'len' */
378 
379     /* First copy the header data from iov->in_sg */
380     copy_iov(iov, count, in_sg, in_num, iov_len);
381 
382     /*
383      * Build a copy of the the in_sg iov so we can skip bits in it,
384      * including changing the offsets
385      */
386     in_sg_cpy = g_new(struct iovec, in_num);
387     memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
388     /* These get updated as we skip */
389     struct iovec *in_sg_ptr = in_sg_cpy;
390     unsigned int in_sg_cpy_count = in_num;
391 
392     /* skip over parts of in_sg that contained the header iov */
393     iov_discard_front(&in_sg_ptr, &in_sg_cpy_count, iov_len);
394 
395     do {
396         fuse_log(FUSE_LOG_DEBUG, "%s: in_sg_cpy_count=%d len remaining=%zd\n",
397                  __func__, in_sg_cpy_count, len);
398 
399         ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
400                      buf->buf[0].pos);
401 
402         if (ret == -1) {
403             ret = errno;
404             if (ret == EINTR) {
405                 continue;
406             }
407             fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n",
408                      __func__, len);
409             return ret;
410         }
411 
412         if (!ret) {
413             /* EOF case? */
414             fuse_log(FUSE_LOG_DEBUG, "%s: !ret len remaining=%zd\n", __func__,
415                      len);
416             break;
417         }
418         fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__,
419                  ret, len);
420 
421         len -= ret;
422         /* Short read. Retry reading remaining bytes */
423         if (len) {
424             fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
425             /* Skip over this much next time around */
426             iov_discard_front(&in_sg_ptr, &in_sg_cpy_count, ret);
427             buf->buf[0].pos += ret;
428         }
429     } while (len);
430 
431     /* Need to fix out->len on EOF */
432     if (len) {
433         struct fuse_out_header *out_sg = in_sg[0].iov_base;
434 
435         tosend_len -= len;
436         out_sg->len = tosend_len;
437     }
438 
439     vu_dispatch_rdlock(qi->virtio_dev);
440     pthread_mutex_lock(&qi->vq_lock);
441     vu_queue_push(dev, q, elem, tosend_len);
442     vu_queue_notify(dev, q);
443     pthread_mutex_unlock(&qi->vq_lock);
444     vu_dispatch_unlock(qi->virtio_dev);
445     req->reply_sent = true;
446     return 0;
447 }
448 
449 static __thread bool clone_fs_called;
450 
451 /* Process one FVRequest in a thread pool */
fv_queue_worker(gpointer data,gpointer user_data)452 static void fv_queue_worker(gpointer data, gpointer user_data)
453 {
454     struct fv_QueueInfo *qi = user_data;
455     struct fuse_session *se = qi->virtio_dev->se;
456     struct VuDev *dev = &qi->virtio_dev->dev;
457     FVRequest *req = data;
458     VuVirtqElement *elem = &req->elem;
459     struct fuse_buf fbuf = {};
460     bool allocated_bufv = false;
461     struct fuse_bufvec bufv;
462     struct fuse_bufvec *pbufv;
463     struct fuse_in_header inh;
464 
465     assert(se->bufsize > sizeof(struct fuse_in_header));
466 
467     if (!clone_fs_called) {
468         int ret;
469 
470         /* unshare FS for xattr operation */
471         ret = unshare(CLONE_FS);
472         /* should not fail */
473         assert(ret == 0);
474 
475         clone_fs_called = true;
476     }
477 
478     /*
479      * An element contains one request and the space to send our response
480      * They're spread over multiple descriptors in a scatter/gather set
481      * and we can't trust the guest to keep them still; so copy in/out.
482      */
483     fbuf.mem = g_malloc(se->bufsize);
484 
485     fuse_mutex_init(&req->ch.lock);
486     req->ch.fd = -1;
487     req->ch.qi = qi;
488 
489     /* The 'out' part of the elem is from qemu */
490     unsigned int out_num = elem->out_num;
491     struct iovec *out_sg = elem->out_sg;
492     size_t out_len = iov_size(out_sg, out_num);
493     fuse_log(FUSE_LOG_DEBUG,
494              "%s: elem %d: with %d out desc of length %zd\n",
495              __func__, elem->index, out_num, out_len);
496 
497     /*
498      * The elem should contain a 'fuse_in_header' (in to fuse)
499      * plus the data based on the len in the header.
500      */
501     if (out_len < sizeof(struct fuse_in_header)) {
502         fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
503                  __func__, elem->index);
504         assert(0); /* TODO */
505     }
506     if (out_len > se->bufsize) {
507         fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__,
508                  elem->index);
509         assert(0); /* TODO */
510     }
511     /* Copy just the fuse_in_header and look at it */
512     copy_from_iov(&fbuf, out_num, out_sg,
513                   sizeof(struct fuse_in_header));
514     memcpy(&inh, fbuf.mem, sizeof(struct fuse_in_header));
515 
516     pbufv = NULL; /* Compiler thinks an unitialised path */
517     if (inh.opcode == FUSE_WRITE &&
518         out_len >= (sizeof(struct fuse_in_header) +
519                     sizeof(struct fuse_write_in))) {
520         /*
521          * For a write we don't actually need to copy the
522          * data, we can just do it straight out of guest memory
523          * but we must still copy the headers in case the guest
524          * was nasty and changed them while we were using them.
525          */
526         fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__);
527 
528         fbuf.size = copy_from_iov(&fbuf, out_num, out_sg,
529                                   sizeof(struct fuse_in_header) +
530                                   sizeof(struct fuse_write_in));
531         /* That copy reread the in_header, make sure we use the original */
532         memcpy(fbuf.mem, &inh, sizeof(struct fuse_in_header));
533 
534         /* Allocate the bufv, with space for the rest of the iov */
535         pbufv = g_try_malloc(sizeof(struct fuse_bufvec) +
536                              sizeof(struct fuse_buf) * out_num);
537         if (!pbufv) {
538             fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
539                     __func__);
540             goto out;
541         }
542 
543         allocated_bufv = true;
544         pbufv->count = 1;
545         pbufv->buf[0] = fbuf;
546 
547         size_t iovindex, pbufvindex, iov_bytes_skip;
548         pbufvindex = 1; /* 2 headers, 1 fusebuf */
549 
550         if (!skip_iov(out_sg, out_num,
551                       sizeof(struct fuse_in_header) +
552                       sizeof(struct fuse_write_in),
553                       &iovindex, &iov_bytes_skip)) {
554             fuse_log(FUSE_LOG_ERR, "%s: skip failed\n",
555                     __func__);
556             goto out;
557         }
558 
559         for (; iovindex < out_num; iovindex++, pbufvindex++) {
560             pbufv->count++;
561             pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
562             pbufv->buf[pbufvindex].flags = 0;
563             pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
564             pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
565 
566             if (iov_bytes_skip) {
567                 pbufv->buf[pbufvindex].mem += iov_bytes_skip;
568                 pbufv->buf[pbufvindex].size -= iov_bytes_skip;
569                 iov_bytes_skip = 0;
570             }
571         }
572     } else {
573         /* Normal (non fast write) path */
574 
575         copy_from_iov(&fbuf, out_num, out_sg, se->bufsize);
576         /* That copy reread the in_header, make sure we use the original */
577         memcpy(fbuf.mem, &inh, sizeof(struct fuse_in_header));
578         fbuf.size = out_len;
579 
580         /* TODO! Endianness of header */
581 
582         /* TODO: Add checks for fuse_session_exited */
583         bufv.buf[0] = fbuf;
584         bufv.count = 1;
585         pbufv = &bufv;
586     }
587     pbufv->idx = 0;
588     pbufv->off = 0;
589     fuse_session_process_buf_int(se, pbufv, &req->ch);
590 
591 out:
592     if (allocated_bufv) {
593         g_free(pbufv);
594     }
595 
596     /* If the request has no reply, still recycle the virtqueue element */
597     if (!req->reply_sent) {
598         struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
599 
600         fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
601                  elem->index);
602 
603         vu_dispatch_rdlock(qi->virtio_dev);
604         pthread_mutex_lock(&qi->vq_lock);
605         vu_queue_push(dev, q, elem, 0);
606         vu_queue_notify(dev, q);
607         pthread_mutex_unlock(&qi->vq_lock);
608         vu_dispatch_unlock(qi->virtio_dev);
609     }
610 
611     pthread_mutex_destroy(&req->ch.lock);
612     g_free(fbuf.mem);
613     free(req);
614 }
615 
616 /* Thread function for individual queues, created when a queue is 'started' */
fv_queue_thread(void * opaque)617 static void *fv_queue_thread(void *opaque)
618 {
619     struct fv_QueueInfo *qi = opaque;
620     struct VuDev *dev = &qi->virtio_dev->dev;
621     struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
622     struct fuse_session *se = qi->virtio_dev->se;
623     GThreadPool *pool = NULL;
624     GList *req_list = NULL;
625 
626     if (se->thread_pool_size) {
627         fuse_log(FUSE_LOG_DEBUG, "%s: Creating thread pool for Queue %d\n",
628                  __func__, qi->qidx);
629         pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size,
630                                  FALSE, NULL);
631         if (!pool) {
632             fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__);
633             return NULL;
634         }
635     }
636 
637     fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__,
638              qi->qidx, qi->kick_fd);
639     while (1) {
640         struct pollfd pf[2];
641 
642         pf[0].fd = qi->kick_fd;
643         pf[0].events = POLLIN;
644         pf[0].revents = 0;
645         pf[1].fd = qi->kill_fd;
646         pf[1].events = POLLIN;
647         pf[1].revents = 0;
648 
649         fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__,
650                  qi->qidx);
651         int poll_res = ppoll(pf, 2, NULL, NULL);
652 
653         if (poll_res == -1) {
654             if (errno == EINTR) {
655                 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
656                          __func__);
657                 continue;
658             }
659             fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n");
660             break;
661         }
662         assert(poll_res >= 1);
663         if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
664             fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n",
665                      __func__, pf[0].revents, qi->qidx);
666             break;
667         }
668         if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) {
669             fuse_log(FUSE_LOG_ERR,
670                      "%s: Unexpected poll revents %x Queue %d killfd\n",
671                      __func__, pf[1].revents, qi->qidx);
672             break;
673         }
674         if (pf[1].revents) {
675             fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n",
676                      __func__, qi->qidx);
677             break;
678         }
679         assert(pf[0].revents & POLLIN);
680         fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__,
681                  qi->qidx);
682 
683         eventfd_t evalue;
684         if (eventfd_read(qi->kick_fd, &evalue)) {
685             fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n");
686             break;
687         }
688         /* Mutual exclusion with virtio_loop() */
689         vu_dispatch_rdlock(qi->virtio_dev);
690         pthread_mutex_lock(&qi->vq_lock);
691         /* out is from guest, in is too guest */
692         unsigned int in_bytes, out_bytes;
693         vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0);
694 
695         fuse_log(FUSE_LOG_DEBUG,
696                  "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
697                  __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
698 
699         while (1) {
700             FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest));
701             if (!req) {
702                 break;
703             }
704 
705             req->reply_sent = false;
706 
707             if (!se->thread_pool_size) {
708                 req_list = g_list_prepend(req_list, req);
709             } else {
710                 g_thread_pool_push(pool, req, NULL);
711             }
712         }
713 
714         pthread_mutex_unlock(&qi->vq_lock);
715         vu_dispatch_unlock(qi->virtio_dev);
716 
717         /* Process all the requests. */
718         if (!se->thread_pool_size && req_list != NULL) {
719             req_list = g_list_reverse(req_list);
720             g_list_foreach(req_list, fv_queue_worker, qi);
721             g_list_free(req_list);
722             req_list = NULL;
723         }
724     }
725 
726     if (pool) {
727         g_thread_pool_free(pool, FALSE, TRUE);
728     }
729 
730     return NULL;
731 }
732 
fv_queue_cleanup_thread(struct fv_VuDev * vud,int qidx)733 static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx)
734 {
735     int ret;
736     struct fv_QueueInfo *ourqi;
737 
738     assert(qidx < vud->nqueues);
739     ourqi = vud->qi[qidx];
740 
741     /* Kill the thread */
742     if (eventfd_write(ourqi->kill_fd, 1)) {
743         fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n",
744                  qidx, strerror(errno));
745     }
746     ret = pthread_join(ourqi->thread, NULL);
747     if (ret) {
748         fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n",
749                  __func__, qidx, ret);
750     }
751     pthread_mutex_destroy(&ourqi->vq_lock);
752     close(ourqi->kill_fd);
753     ourqi->kick_fd = -1;
754     g_free(vud->qi[qidx]);
755     vud->qi[qidx] = NULL;
756 }
757 
758 /* Callback from libvhost-user on start or stop of a queue */
fv_queue_set_started(VuDev * dev,int qidx,bool started)759 static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
760 {
761     struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
762     struct fv_QueueInfo *ourqi;
763 
764     fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx,
765              started);
766     assert(qidx >= 0);
767 
768     /*
769      * Ignore additional request queues for now.  passthrough_ll.c must be
770      * audited for thread-safety issues first.  It was written with a
771      * well-behaved client in mind and may not protect against all types of
772      * races yet.
773      */
774     if (qidx > 1) {
775         fuse_log(FUSE_LOG_ERR,
776                  "%s: multiple request queues not yet implemented, please only "
777                  "configure 1 request queue\n",
778                  __func__);
779         exit(EXIT_FAILURE);
780     }
781 
782     if (started) {
783         /* Fire up a thread to watch this queue */
784         if (qidx >= vud->nqueues) {
785             vud->qi = g_realloc_n(vud->qi, qidx + 1, sizeof(vud->qi[0]));
786             memset(vud->qi + vud->nqueues, 0,
787                    sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues)));
788             vud->nqueues = qidx + 1;
789         }
790         if (!vud->qi[qidx]) {
791             vud->qi[qidx] = g_new0(struct fv_QueueInfo, 1);
792             vud->qi[qidx]->virtio_dev = vud;
793             vud->qi[qidx]->qidx = qidx;
794         } else {
795             /* Shouldn't have been started */
796             assert(vud->qi[qidx]->kick_fd == -1);
797         }
798         ourqi = vud->qi[qidx];
799         ourqi->kick_fd = dev->vq[qidx].kick_fd;
800 
801         ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE);
802         assert(ourqi->kill_fd != -1);
803         pthread_mutex_init(&ourqi->vq_lock, NULL);
804 
805         if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) {
806             fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n",
807                      __func__, qidx);
808             assert(0);
809         }
810     } else {
811         /*
812          * Temporarily drop write-lock taken in virtio_loop() so that
813          * the queue thread doesn't block in virtio_send_msg().
814          */
815         vu_dispatch_unlock(vud);
816         fv_queue_cleanup_thread(vud, qidx);
817         vu_dispatch_wrlock(vud);
818     }
819 }
820 
/*
 * libvhost-user callback for 'queue_is_processed_in_order'.
 * Always answers false, whatever the queue.
 */
static bool fv_queue_order(VuDev *dev, int qidx)
{
    (void)dev;
    (void)qidx;

    return false;
}
825 
826 static const VuDevIface fv_iface = {
827     .get_features = fv_get_features,
828     .set_features = fv_set_features,
829 
830     /* Don't need process message, we've not got any at vhost-user level */
831     .queue_set_started = fv_queue_set_started,
832 
833     .queue_is_processed_in_order = fv_queue_order,
834 };
835 
836 /*
837  * Main loop; this mostly deals with events on the vhost-user
838  * socket itself, and not actual fuse data.
839  */
virtio_loop(struct fuse_session * se)840 int virtio_loop(struct fuse_session *se)
841 {
842     fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__);
843 
844     while (!fuse_session_exited(se)) {
845         struct pollfd pf[1];
846         bool ok;
847         pf[0].fd = se->vu_socketfd;
848         pf[0].events = POLLIN;
849         pf[0].revents = 0;
850 
851         fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__);
852         int poll_res = ppoll(pf, 1, NULL, NULL);
853 
854         if (poll_res == -1) {
855             if (errno == EINTR) {
856                 fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
857                          __func__);
858                 continue;
859             }
860             fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n");
861             break;
862         }
863         assert(poll_res == 1);
864         if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
865             fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__,
866                      pf[0].revents);
867             break;
868         }
869         assert(pf[0].revents & POLLIN);
870         fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
871         /* Mutual exclusion with fv_queue_thread() */
872         vu_dispatch_wrlock(se->virtio_dev);
873 
874         ok = vu_dispatch(&se->virtio_dev->dev);
875 
876         vu_dispatch_unlock(se->virtio_dev);
877 
878         if (!ok) {
879             fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);
880             break;
881         }
882     }
883 
884     /*
885      * Make sure all fv_queue_thread()s quit on exit, as we're about to
886      * free virtio dev and fuse session, no one should access them anymore.
887      */
888     for (int i = 0; i < se->virtio_dev->nqueues; i++) {
889         if (!se->virtio_dev->qi[i]) {
890             continue;
891         }
892 
893         fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i);
894         fv_queue_cleanup_thread(se->virtio_dev, i);
895     }
896 
897     fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__);
898 
899     return 0;
900 }
901 
/*
 * Replace, in place, every occurrence of the character @old in the
 * NUL-terminated string @s with @new.  The terminator itself is never
 * examined as a candidate, so the string length does not change.
 */
static void strreplace(char *s, char old, char new)
{
    char *cursor = s;

    while (*cursor != '\0') {
        if (*cursor == old) {
            *cursor = new;
        }
        cursor++;
    }
}
910 
fv_socket_lock(struct fuse_session * se)911 static bool fv_socket_lock(struct fuse_session *se)
912 {
913     g_autofree gchar *sk_name = NULL;
914     g_autofree gchar *pidfile = NULL;
915     g_autofree gchar *dir = NULL;
916     Error *local_err = NULL;
917 
918     dir = qemu_get_local_state_pathname("run/virtiofsd");
919 
920     if (g_mkdir_with_parents(dir, S_IRWXU) < 0) {
921         fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s\n",
922                  __func__, dir, strerror(errno));
923         return false;
924     }
925 
926     sk_name = g_strdup(se->vu_socket_path);
927     strreplace(sk_name, '/', '.');
928     pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name);
929 
930     if (!qemu_write_pidfile(pidfile, &local_err)) {
931         error_report_err(local_err);
932         return false;
933     }
934 
935     return true;
936 }
937 
fv_create_listen_socket(struct fuse_session * se)938 static int fv_create_listen_socket(struct fuse_session *se)
939 {
940     struct sockaddr_un un;
941     mode_t old_umask;
942 
943     /* Nothing to do if fd is already initialized */
944     if (se->vu_listen_fd >= 0) {
945         return 0;
946     }
947 
948     if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) {
949         fuse_log(FUSE_LOG_ERR, "Socket path too long\n");
950         return -1;
951     }
952 
953     if (!strlen(se->vu_socket_path)) {
954         fuse_log(FUSE_LOG_ERR, "Socket path is empty\n");
955         return -1;
956     }
957 
958     /* Check the vu_socket_path is already used */
959     if (!fv_socket_lock(se)) {
960         return -1;
961     }
962 
963     /*
964      * Create the Unix socket to communicate with qemu
965      * based on QEMU's vhost-user-bridge
966      */
967     unlink(se->vu_socket_path);
968     strcpy(un.sun_path, se->vu_socket_path);
969     size_t addr_len = sizeof(un);
970 
971     int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
972     if (listen_sock == -1) {
973         fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n");
974         return -1;
975     }
976     un.sun_family = AF_UNIX;
977 
978     /*
979      * Unfortunately bind doesn't let you set the mask on the socket,
980      * so set umask appropriately and restore it later.
981      */
982     if (se->vu_socket_group) {
983         old_umask = umask(S_IROTH | S_IWOTH | S_IXOTH);
984     } else {
985         old_umask = umask(S_IRGRP | S_IWGRP | S_IXGRP |
986                           S_IROTH | S_IWOTH | S_IXOTH);
987     }
988     if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
989         fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
990         close(listen_sock);
991         umask(old_umask);
992         return -1;
993     }
994     if (se->vu_socket_group) {
995         struct group *g = getgrnam(se->vu_socket_group);
996         if (g) {
997             if (chown(se->vu_socket_path, -1, g->gr_gid) == -1) {
998                 fuse_log(FUSE_LOG_WARNING,
999                          "vhost socket failed to set group to %s (%d): %m\n",
1000                          se->vu_socket_group, g->gr_gid);
1001             }
1002         }
1003     }
1004     umask(old_umask);
1005 
1006     if (listen(listen_sock, 1) == -1) {
1007         fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
1008         close(listen_sock);
1009         return -1;
1010     }
1011 
1012     se->vu_listen_fd = listen_sock;
1013     return 0;
1014 }
1015 
virtio_session_mount(struct fuse_session * se)1016 int virtio_session_mount(struct fuse_session *se)
1017 {
1018     int ret;
1019 
1020     /*
1021      * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's
1022      * an unprivileged system call but some Docker/Moby versions are known to
1023      * reject it via seccomp when CAP_SYS_ADMIN is not given.
1024      *
1025      * Note that the program is single-threaded here so this syscall has no
1026      * visible effect and is safe to make.
1027      */
1028     ret = unshare(CLONE_FS);
1029     if (ret == -1 && errno == EPERM) {
1030         fuse_log(FUSE_LOG_ERR, "unshare(CLONE_FS) failed with EPERM. If "
1031                 "running in a container please check that the container "
1032                 "runtime seccomp policy allows unshare.\n");
1033         return -1;
1034     }
1035 
1036     ret = fv_create_listen_socket(se);
1037     if (ret < 0) {
1038         return ret;
1039     }
1040 
1041     se->fd = -1;
1042 
1043     fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n",
1044              __func__);
1045     int data_sock = accept(se->vu_listen_fd, NULL, NULL);
1046     if (data_sock == -1) {
1047         fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n");
1048         close(se->vu_listen_fd);
1049         return -1;
1050     }
1051     close(se->vu_listen_fd);
1052     se->vu_listen_fd = -1;
1053     fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n",
1054              __func__);
1055 
1056     /* TODO: Some cleanup/deallocation! */
1057     se->virtio_dev = g_new0(struct fv_VuDev, 1);
1058 
1059     se->vu_socketfd = data_sock;
1060     se->virtio_dev->se = se;
1061     pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
1062     if (!vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL,
1063                  fv_set_watch, fv_remove_watch, &fv_iface)) {
1064         fuse_log(FUSE_LOG_ERR, "%s: vu_init failed\n", __func__);
1065         return -1;
1066     }
1067 
1068     return 0;
1069 }
1070 
virtio_session_close(struct fuse_session * se)1071 void virtio_session_close(struct fuse_session *se)
1072 {
1073     close(se->vu_socketfd);
1074 
1075     if (!se->virtio_dev) {
1076         return;
1077     }
1078 
1079     g_free(se->virtio_dev->qi);
1080     pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock);
1081     g_free(se->virtio_dev);
1082     se->virtio_dev = NULL;
1083 }
1084