xref: /reactos/sdk/lib/drivers/virtio/VirtIORing.c (revision 69931a4a)
/*
 * Virtio ring manipulation routines
 *
 * Copyright 2017 Red Hat, Inc.
 *
 * Authors:
 *  Ladi Prosek <lprosek@redhat.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met :
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and / or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of their contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "osdep.h"
#include "virtio_pci.h"
#include "VirtIO.h"
#include "kdebugprint.h"
#include "virtio_ring.h"
#include "windows/virtio_ring_allocation.h"

#define DESC_INDEX(num, i) ((i) & ((num) - 1))
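
/* Example (illustrative only): with num = 256, DESC_INDEX simply masks off the high bits of a
 * free-running index, so DESC_INDEX(256, 0) == 0, DESC_INDEX(256, 255) == 255 and
 * DESC_INDEX(256, 257) == 1; the avail/used indices may therefore wrap past 65535 freely. */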

/* This marks a buffer as continuing via the next field. */
#define VIRTQ_DESC_F_NEXT	1
/* This marks a buffer as write-only (otherwise read-only). */
#define VIRTQ_DESC_F_WRITE	2
/* This means the buffer contains a list of buffer descriptors. */
#define VIRTQ_DESC_F_INDIRECT	4

/* The Host uses this in used->flags to advise the Guest: don't kick me when
* you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
* will still kick if it's out of buffers. */
#define VIRTQ_USED_F_NO_NOTIFY	1
/* The Guest uses this in avail->flags to advise the Host: don't interrupt me
* when you consume a buffer.  It's unreliable, so it's simply an
* optimization.  */
#define VIRTQ_AVAIL_F_NO_INTERRUPT	1

#pragma warning (push)
#pragma warning (disable:4200)

#include <pshpack1.h>

/* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
struct vring_desc {
    /* Address (guest-physical). */
    __virtio64 addr;
    /* Length. */
    __virtio32 len;
    /* The flags as indicated above. */
    __virtio16 flags;
    /* We chain unused descriptors via this, too */
    __virtio16 next;
};

struct vring_avail {
    __virtio16 flags;
    __virtio16 idx;
    __virtio16 ring[];
};

/* u32 is used here for ids for padding reasons. */
struct vring_used_elem {
    /* Index of start of used descriptor chain. */
    __virtio32 id;
    /* Total length of the descriptor chain which was used (written to) */
    __virtio32 len;
};

struct vring_used {
    __virtio16 flags;
    __virtio16 idx;
    struct vring_used_elem ring[];
};

#include <poppack.h>

/* Alignment requirements for vring elements.
* When using pre-virtio 1.0 layout, these fall out naturally.
*/
#define VRING_AVAIL_ALIGN_SIZE 2
#define VRING_USED_ALIGN_SIZE 4
#define VRING_DESC_ALIGN_SIZE 16

/* The standard layout for the ring is a continuous chunk of memory which looks
* like this.  We assume num is a power of 2.
*
* struct vring
* {
*    // The actual descriptors (16 bytes each)
*    struct vring_desc desc[num];
*
*    // A ring of available descriptor heads with free-running index.
*    __virtio16 avail_flags;
*    __virtio16 avail_idx;
*    __virtio16 available[num];
*    __virtio16 used_event_idx;
*
*    // Padding to the next align boundary.
*    char pad[];
*
*    // A ring of used descriptor heads with free-running index.
*    __virtio16 used_flags;
*    __virtio16 used_idx;
*    struct vring_used_elem used[num];
*    __virtio16 avail_event_idx;
* };
*/
/* We publish the used event index at the end of the available ring, and vice
* versa. They are at the end for backwards compatibility. */

struct vring {
    unsigned int num;

    struct vring_desc *desc;

    struct vring_avail *avail;

    struct vring_used *used;
};

#define vring_used_event(vr) ((vr)->avail->ring[(vr)->num])
#define vring_avail_event(vr) (*(__virtio16 *)&(vr)->used->ring[(vr)->num])

static inline void vring_init(struct vring *vr, unsigned int num, void *p,
    unsigned long align)
{
    vr->num = num;
    vr->desc = (struct vring_desc *)p;
    vr->avail = (struct vring_avail *)((__u8 *)p + num * sizeof(struct vring_desc));
    vr->used = (struct vring_used *)(((ULONG_PTR)&vr->avail->ring[num] + sizeof(__virtio16)
        + align - 1) & ~((ULONG_PTR)align - 1));
}

static inline unsigned vring_size_split(unsigned int num, unsigned long align)
{
#pragma warning (push)
#pragma warning (disable:4319)
    return ((sizeof(struct vring_desc) * num + sizeof(__virtio16) * (3 + num)
        + align - 1) & ~(align - 1))
        + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num;
#pragma warning(pop)
}
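
/* Worked example (illustrative only): for num = 16 and align = PAGE_SIZE = 4096,
 * the descriptor table takes 16 * 16 = 256 bytes, the available ring takes
 * 2 * (3 + 16) = 38 bytes (flags, idx, ring[16] and used_event), and their sum (294)
 * is rounded up to 4096; the used ring then adds 2 * 3 + 8 * 16 = 134 bytes
 * (flags, idx, avail_event and ring[16]), so vring_size_split(16, 4096) == 4230.
 * vring_init() above places vr->avail and vr->used at exactly these offsets. */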

/* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */
/* Assuming a given event_idx value from the other side, if
* we have just incremented index from old to new_idx,
* should we trigger an event? */
static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
{
    /* Note: Xen has similar logic for notification hold-off
    * in include/xen/interface/io/ring.h with req_event and req_prod
    * corresponding to event_idx + 1 and new_idx respectively.
    * Note also that req_event and req_prod in Xen start at 1,
    * event indexes in virtio start at 0. */
    return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
}
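
/* Worked example (illustrative only): suppose the other side published event_idx = 5 and we
 * advance our index from old = 3 to new_idx = 6. Then (__u16)(6 - 5 - 1) = 0 and
 * (__u16)(6 - 3) = 3, so 0 < 3 and we must notify, because event_idx lies in the window of
 * indices we just published. With event_idx = 9 instead, (__u16)(6 - 9 - 1) = 0xFFFC is not
 * below 3, so no notification is needed yet. The unsigned arithmetic keeps the comparison
 * correct across index wrap-around. */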

struct virtqueue_split {
    struct virtqueue vq;
    struct vring vring;
    struct {
        u16 flags;
        u16 idx;
    } master_vring_avail;
    unsigned int num_unused;
    unsigned int num_added_since_kick;
    u16 first_unused;
    u16 last_used;
    void *opaque[];
};

#define splitvq(vq) ((struct virtqueue_split *)vq)

#pragma warning (pop)

/* Returns the index of the first unused descriptor */
static inline u16 get_unused_desc(struct virtqueue_split *vq)
{
    u16 idx = vq->first_unused;
    ASSERT(vq->num_unused > 0);

    vq->first_unused = vq->vring.desc[idx].next;
    vq->num_unused--;
    return idx;
}

/* Marks the descriptor chain starting at index idx as unused */
static inline void put_unused_desc_chain(struct virtqueue_split *vq, u16 idx)
{
    u16 start = idx;

    vq->opaque[idx] = NULL;
    while (vq->vring.desc[idx].flags & VIRTQ_DESC_F_NEXT) {
        idx = vq->vring.desc[idx].next;
        vq->num_unused++;
    }

    vq->vring.desc[idx].flags = VIRTQ_DESC_F_NEXT;
    vq->vring.desc[idx].next = vq->first_unused;
    vq->num_unused++;

    vq->first_unused = start;
}

/* Adds a buffer to a virtqueue, returns 0 on success, negative number on error */
static int virtqueue_add_buf_split(
    struct virtqueue *_vq,    /* the queue */
    struct scatterlist sg[], /* sg array of length out + in */
    unsigned int out,        /* number of driver->device buffer descriptors in sg */
    unsigned int in,         /* number of device->driver buffer descriptors in sg */
    void *opaque,            /* later returned from virtqueue_get_buf */
    void *va_indirect,       /* VA of the indirect page or NULL */
    ULONGLONG phys_indirect) /* PA of the indirect page or 0 */
{
    struct virtqueue_split *vq = splitvq(_vq);
    struct vring *vring = &vq->vring;
    unsigned int i;
    u16 idx;

    if (va_indirect && (out + in) > 1 && vq->num_unused > 0) {
        /* Use one indirect descriptor */
        struct vring_desc *desc = (struct vring_desc *)va_indirect;

        for (i = 0; i < out + in; i++) {
            desc[i].flags = (i < out ? 0 : VIRTQ_DESC_F_WRITE);
            desc[i].flags |= VIRTQ_DESC_F_NEXT;
            desc[i].addr = sg[i].physAddr.QuadPart;
            desc[i].len = sg[i].length;
            desc[i].next = (u16)i + 1;
        }
        desc[i - 1].flags &= ~VIRTQ_DESC_F_NEXT;

        idx = get_unused_desc(vq);
        vq->vring.desc[idx].flags = VIRTQ_DESC_F_INDIRECT;
        vq->vring.desc[idx].addr = phys_indirect;
        vq->vring.desc[idx].len = i * sizeof(struct vring_desc);

        vq->opaque[idx] = opaque;
    } else {
        u16 last_idx;

        /* Use out + in regular descriptors */
        if (out + in > vq->num_unused) {
            return -ENOSPC;
        }

        /* First descriptor */
        idx = last_idx = get_unused_desc(vq);
        vq->opaque[idx] = opaque;

        vring->desc[idx].addr = sg[0].physAddr.QuadPart;
        vring->desc[idx].len = sg[0].length;
        vring->desc[idx].flags = VIRTQ_DESC_F_NEXT;
        if (out == 0) {
            vring->desc[idx].flags |= VIRTQ_DESC_F_WRITE;
        }
        vring->desc[idx].next = vq->first_unused;

        /* The rest of descriptors */
        for (i = 1; i < out + in; i++) {
            last_idx = get_unused_desc(vq);

            vring->desc[last_idx].addr = sg[i].physAddr.QuadPart;
            vring->desc[last_idx].len = sg[i].length;
            vring->desc[last_idx].flags = VIRTQ_DESC_F_NEXT;
            if (i >= out) {
                vring->desc[last_idx].flags |= VIRTQ_DESC_F_WRITE;
            }
            vring->desc[last_idx].next = vq->first_unused;
        }
        vring->desc[last_idx].flags &= ~VIRTQ_DESC_F_NEXT;
    }

    /* Write the first descriptor into the available ring */
    vring->avail->ring[DESC_INDEX(vring->num, vq->master_vring_avail.idx)] = idx;
    KeMemoryBarrier();
    vring->avail->idx = ++vq->master_vring_avail.idx;
    vq->num_added_since_kick++;

    return 0;
}
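
/* A minimal driver-side sketch (hypothetical request names; assumes the usual
 * virtqueue_add_buf/virtqueue_kick wrappers from VirtIO.h that dispatch to vq->add_buf and
 * the kick callbacks):
 *
 *     struct scatterlist sg[2];
 *     sg[0].physAddr = req_header_pa;  sg[0].length = sizeof(REQ_HEADER); // driver->device
 *     sg[1].physAddr = status_byte_pa; sg[1].length = 1;                  // device->driver
 *     if (virtqueue_add_buf(vq, sg, 1, 1, request_ctx, NULL, 0) == 0) {
 *         virtqueue_kick(vq); // notifies only if kick_prepare says the device needs it
 *     }
 *
 * Passing a va_indirect/phys_indirect page instead of NULL/0 consumes a single descriptor in
 * the main ring regardless of the number of sg elements. */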

/* Gets the opaque pointer associated with a returned buffer, or NULL if no buffer is available */
static void *virtqueue_get_buf_split(
    struct virtqueue *_vq, /* the queue */
    unsigned int *len)    /* number of bytes returned by the device */
{
    struct virtqueue_split *vq = splitvq(_vq);
    void *opaque;
    u16 idx;

    if (vq->last_used == (int)vq->vring.used->idx) {
        /* No descriptor index in the used ring */
        return NULL;
    }
    KeMemoryBarrier();

    idx = DESC_INDEX(vq->vring.num, vq->last_used);
    *len = vq->vring.used->ring[idx].len;

    /* Get the first used descriptor */
    idx = (u16)vq->vring.used->ring[idx].id;
    opaque = vq->opaque[idx];

    /* Put all descriptors back to the free list */
    put_unused_desc_chain(vq, idx);

    vq->last_used++;
    if (_vq->vdev->event_suppression_enabled && virtqueue_is_interrupt_enabled(_vq)) {
        vring_used_event(&vq->vring) = vq->last_used;
        KeMemoryBarrier();
    }

    ASSERT(opaque != NULL);
    return opaque;
}
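
/* A minimal completion-path sketch (hypothetical names; assumes the usual virtqueue_get_buf
 * wrapper from VirtIO.h that dispatches to vq->get_buf), e.g. run from the driver's DPC:
 *
 *     unsigned int len;
 *     void *ctx;
 *     while ((ctx = virtqueue_get_buf(vq, &len)) != NULL) {
 *         complete_request(ctx, len); // hypothetical driver helper
 *     }
 *
 * Each successful call returns the opaque pointer passed to add_buf and recycles that
 * request's whole descriptor chain back onto the free list. */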

/* Returns true if at least one returned buffer is available, false otherwise */
static BOOLEAN virtqueue_has_buf_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    return (vq->last_used != vq->vring.used->idx);
}

/* Returns true if the device should be notified, false otherwise */
static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    bool wrap_around;
    u16 old, new;
    KeMemoryBarrier();

    wrap_around = (vq->num_added_since_kick >= (1 << 16));

    old = (u16)(vq->master_vring_avail.idx - vq->num_added_since_kick);
    new = vq->master_vring_avail.idx;
    vq->num_added_since_kick = 0;

    if (_vq->vdev->event_suppression_enabled) {
        return wrap_around || (bool)vring_need_event(vring_avail_event(&vq->vring), new, old);
    } else {
        return !(vq->vring.used->flags & VIRTQ_USED_F_NO_NOTIFY);
    }
}

/* Notifies the device even if it's not necessary according to the event suppression logic */
static void virtqueue_kick_always_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    KeMemoryBarrier();
    vq->num_added_since_kick = 0;
    virtqueue_notify(_vq);
}

/* Enables interrupts on a virtqueue and returns false if the queue has at least one returned
 * buffer available to be fetched by virtqueue_get_buf, true otherwise */
static bool virtqueue_enable_cb_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    if (!virtqueue_is_interrupt_enabled(_vq)) {
        vq->master_vring_avail.flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
        if (!_vq->vdev->event_suppression_enabled)
        {
            vq->vring.avail->flags = vq->master_vring_avail.flags;
        }
    }

    vring_used_event(&vq->vring) = vq->last_used;
    KeMemoryBarrier();
    return (vq->last_used == vq->vring.used->idx);
}

/* Enables interrupts on a virtqueue after ~3/4 of the currently pushed buffers have been
 * returned, returns false if this condition already holds, true otherwise */
static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    u16 bufs;

    if (!virtqueue_is_interrupt_enabled(_vq)) {
        vq->master_vring_avail.flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
        if (!_vq->vdev->event_suppression_enabled)
        {
            vq->vring.avail->flags = vq->master_vring_avail.flags;
        }
    }

    /* Note that 3/4 is an arbitrary threshold */
    bufs = (u16)(vq->master_vring_avail.idx - vq->last_used) * 3 / 4;
    vring_used_event(&vq->vring) = vq->last_used + bufs;
    KeMemoryBarrier();
    return ((vq->vring.used->idx - vq->last_used) <= bufs);
}
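
/* Worked example (illustrative only): with 8 buffers outstanding
 * (master_vring_avail.idx - last_used == 8), bufs == 6 and used_event is set to
 * last_used + 6, so the device is asked to interrupt only after the 7th completion.
 * The return value is false when more than 6 completions have already arrived, telling
 * the caller to poll once more instead of waiting for an interrupt. */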

/* Disables interrupts on a virtqueue */
static void virtqueue_disable_cb_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    if (virtqueue_is_interrupt_enabled(_vq)) {
        vq->master_vring_avail.flags |= VIRTQ_AVAIL_F_NO_INTERRUPT;
        if (!_vq->vdev->event_suppression_enabled)
        {
            vq->vring.avail->flags = vq->master_vring_avail.flags;
        }
    }
}

/* Returns true if interrupts are enabled on a virtqueue, false otherwise */
static BOOLEAN virtqueue_is_interrupt_enabled_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    return !(vq->master_vring_avail.flags & VIRTQ_AVAIL_F_NO_INTERRUPT);
}

/* Re-initializes an already initialized virtqueue */
static void virtqueue_shutdown_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    unsigned int num = vq->vring.num;
    void *pages = vq->vring.desc;
    unsigned int vring_align = _vq->vdev->addr ? PAGE_SIZE : SMP_CACHE_BYTES;

    RtlZeroMemory(pages, vring_size_split(num, vring_align));
    (void)vring_new_virtqueue_split(
        _vq->index,
        vq->vring.num,
        vring_align,
        _vq->vdev,
        pages,
        _vq->notification_cb,
        vq);
}

/* Gets the opaque pointer associated with a not-yet-returned buffer, or NULL if no buffer is available
 * to aid drivers with cleaning up all data on virtqueue shutdown */
static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    u16 idx;
    void *opaque = NULL;

    for (idx = 0; idx < (u16)vq->vring.num; idx++) {
        opaque = vq->opaque[idx];
        if (opaque) {
            put_unused_desc_chain(vq, idx);
            vq->vring.avail->idx = --vq->master_vring_avail.idx;
            break;
        }
    }
    return opaque;
}

/* Returns the size of the virtqueue structure including
 * additional size for per-descriptor data */
unsigned int vring_control_block_size(u16 qsize, bool packed)
{
    unsigned int res;
    if (packed) {
        return vring_control_block_size_packed(qsize);
    }
    res = sizeof(struct virtqueue_split);
    res += sizeof(void *) * qsize;
    return res;
}

/* Initializes a new virtqueue using already allocated memory */
struct virtqueue *vring_new_virtqueue_split(
    unsigned int index,                 /* virtqueue index */
    unsigned int num,                   /* virtqueue size (always a power of 2) */
    unsigned int vring_align,           /* vring alignment requirement */
    VirtIODevice *vdev,                 /* the virtio device owning the queue */
    void *pages,                        /* vring memory */
    void(*notify)(struct virtqueue *), /* notification callback */
    void *control)                      /* virtqueue memory */
{
    struct virtqueue_split *vq = splitvq(control);
    u16 i;

    if (DESC_INDEX(num, num) != 0) {
        DPrintf(0, "Virtqueue length %u is not a power of 2\n", num);
        return NULL;
    }

    RtlZeroMemory(vq, sizeof(*vq) + num * sizeof(void *));

    vring_init(&vq->vring, num, pages, vring_align);
    vq->vq.vdev = vdev;
    vq->vq.notification_cb = notify;
    vq->vq.index = index;

    /* Build a linked list of unused descriptors */
    vq->num_unused = num;
    vq->first_unused = 0;
    for (i = 0; i < num - 1; i++) {
        vq->vring.desc[i].flags = VIRTQ_DESC_F_NEXT;
        vq->vring.desc[i].next = i + 1;
    }
    vq->vq.avail_va = vq->vring.avail;
    vq->vq.used_va = vq->vring.used;
    vq->vq.add_buf = virtqueue_add_buf_split;
    vq->vq.detach_unused_buf = virtqueue_detach_unused_buf_split;
    vq->vq.disable_cb = virtqueue_disable_cb_split;
    vq->vq.enable_cb = virtqueue_enable_cb_split;
    vq->vq.enable_cb_delayed = virtqueue_enable_cb_delayed_split;
    vq->vq.get_buf = virtqueue_get_buf_split;
    vq->vq.has_buf = virtqueue_has_buf_split;
    vq->vq.is_interrupt_enabled = virtqueue_is_interrupt_enabled_split;
    vq->vq.kick_always = virtqueue_kick_always_split;
    vq->vq.kick_prepare = virtqueue_kick_prepare_split;
    vq->vq.shutdown = virtqueue_shutdown_split;
    return &vq->vq;
}
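
/* A minimal setup sketch (hypothetical allocator names; the PCI transport code normally
 * performs these steps): the caller sizes the two memory blocks with vring_size() and
 * vring_control_block_size(), allocates them, and hands them to this function:
 *
 *     unsigned int ring_bytes = vring_size(num, PAGE_SIZE, false);
 *     unsigned int ctrl_bytes = vring_control_block_size((u16)num, false);
 *     void *pages   = alloc_contiguous_zeroed(ring_bytes);  // hypothetical, physically contiguous
 *     void *control = alloc_nonpaged_zeroed(ctrl_bytes);    // hypothetical, ordinary nonpaged pool
 *     struct virtqueue *vq =
 *         vring_new_virtqueue_split(0, num, PAGE_SIZE, vdev, pages, notify_cb, control);
 *
 * The vring memory must be physically contiguous because the device accesses it directly;
 * the control block lives only in driver memory. */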

/* Negotiates virtio transport features */
void vring_transport_features(
    VirtIODevice *vdev,
    u64 *features) /* points to device features on entry and driver accepted features on return */
{
    unsigned int i;

    for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
        if (i != VIRTIO_RING_F_INDIRECT_DESC &&
            i != VIRTIO_RING_F_EVENT_IDX &&
            i != VIRTIO_F_VERSION_1) {
            virtio_feature_disable(*features, i);
        }
    }
}

/* Returns the max number of scatter-gather elements that fit in an indirect page */
u32 virtio_get_indirect_page_capacity()
{
    return PAGE_SIZE / sizeof(struct vring_desc);
}
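
/* With a 4 KiB PAGE_SIZE this evaluates to 4096 / 16 = 256 scatter-gather entries per
 * indirect page, which is the most a caller should pass through the indirect path of
 * virtqueue_add_buf_split for a single request. */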

unsigned long vring_size(unsigned int num, unsigned long align, bool packed)
{
    if (packed) {
        return vring_size_packed(num, align);
    } else {
        return vring_size_split(num, align);
    }
}