/*
 * Virtio ring manipulation routines
 *
 * Copyright 2017 Red Hat, Inc.
 *
 * Authors:
 *  Ladi Prosek <lprosek@redhat.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of their contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "osdep.h"
#include "virtio_pci.h"
#include "VirtIO.h"
#include "kdebugprint.h"
#include "virtio_ring.h"
#include "windows/virtio_ring_allocation.h"

#define DESC_INDEX(num, i) ((i) & ((num) - 1))

/* This marks a buffer as continuing via the next field. */
#define VIRTQ_DESC_F_NEXT 1
/* This marks a buffer as write-only (otherwise read-only). */
#define VIRTQ_DESC_F_WRITE 2
/* This means the buffer contains a list of buffer descriptors. */
#define VIRTQ_DESC_F_INDIRECT 4

/* The Host uses this in used->flags to advise the Guest: don't kick me when
 * you add a buffer. It's unreliable, so it's simply an optimization. Guest
 * will still kick if it's out of buffers. */
#define VIRTQ_USED_F_NO_NOTIFY 1
/* The Guest uses this in avail->flags to advise the Host: don't interrupt me
 * when you consume a buffer. It's unreliable, so it's simply an
 * optimization. */
#define VIRTQ_AVAIL_F_NO_INTERRUPT 1

#pragma warning (push)
#pragma warning (disable:4200)

#include <pshpack1.h>

/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
struct vring_desc {
    /* Address (guest-physical). */
    __virtio64 addr;
    /* Length. */
    __virtio32 len;
    /* The flags as indicated above. */
    __virtio16 flags;
    /* We chain unused descriptors via this, too */
    __virtio16 next;
};

struct vring_avail {
    __virtio16 flags;
    __virtio16 idx;
    __virtio16 ring[];
};

/* u32 is used here for ids for padding reasons. */
struct vring_used_elem {
    /* Index of start of used descriptor chain. */
    __virtio32 id;
    /* Total length of the descriptor chain which was used (written to) */
    __virtio32 len;
};

struct vring_used {
    __virtio16 flags;
    __virtio16 idx;
    struct vring_used_elem ring[];
};

#include <poppack.h>

/* Alignment requirements for vring elements.
 * When using pre-virtio 1.0 layout, these fall out naturally.
 */
#define VRING_AVAIL_ALIGN_SIZE 2
#define VRING_USED_ALIGN_SIZE 4
#define VRING_DESC_ALIGN_SIZE 16

/* The standard layout for the ring is a continuous chunk of memory which looks
 * like this. We assume num is a power of 2.
 *
 * struct vring
 * {
 *     // The actual descriptors (16 bytes each)
 *     struct vring_desc desc[num];
 *
 *     // A ring of available descriptor heads with free-running index.
 *     __virtio16 avail_flags;
 *     __virtio16 avail_idx;
 *     __virtio16 available[num];
 *     __virtio16 used_event_idx;
 *
 *     // Padding to the next align boundary.
 *     char pad[];
 *
 *     // A ring of used descriptor heads with free-running index.
 *     __virtio16 used_flags;
 *     __virtio16 used_idx;
 *     struct vring_used_elem used[num];
 *     __virtio16 avail_event_idx;
 * };
 */
/* We publish the used event index at the end of the available ring, and vice
 * versa. They are at the end for backwards compatibility. */

struct vring {
    unsigned int num;

    struct vring_desc *desc;

    struct vring_avail *avail;

    struct vring_used *used;
};

#define vring_used_event(vr) ((vr)->avail->ring[(vr)->num])
#define vring_avail_event(vr) (*(__virtio16 *)&(vr)->used->ring[(vr)->num])

static inline void vring_init(struct vring *vr, unsigned int num, void *p,
                              unsigned long align)
{
    vr->num = num;
    vr->desc = (struct vring_desc *)p;
    vr->avail = (struct vring_avail *)((__u8 *)p + num * sizeof(struct vring_desc));
    vr->used = (struct vring_used *)(((ULONG_PTR)&vr->avail->ring[num] + sizeof(__virtio16)
        + align - 1) & ~((ULONG_PTR)align - 1));
}
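
/* Illustrative example (not part of the upstream comments): for a queue of
 * num = 256 entries using the legacy layout with align = PAGE_SIZE (4096),
 * vring_init() produces the following offsets within the ring memory p:
 *
 *   desc  = p + 0        (256 * 16 = 4096 bytes of descriptors)
 *   avail = p + 4096     (flags + idx + ring[256] + used_event = 518 bytes)
 *   used  = p + 8192     (4614 rounded up to the next 4096-byte boundary)
 */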

static inline unsigned vring_size_split(unsigned int num, unsigned long align)
{
#pragma warning (push)
#pragma warning (disable:4319)
    return ((sizeof(struct vring_desc) * num + sizeof(__virtio16) * (3 + num)
        + align - 1) & ~(align - 1))
        + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num;
#pragma warning(pop)
}
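
/* Worked example (illustrative): with num = 256 and align = 4096, the first
 * term is 16 * 256 + 2 * (3 + 256) = 4614, which rounds up to 8192; the used
 * part adds 2 * 3 + 8 * 256 = 2054, for a total of 10246 bytes. */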

/* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */
/* Assuming a given event_idx value from the other side, if
 * we have just incremented index from old to new_idx,
 * should we trigger an event? */
static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
{
    /* Note: Xen has similar logic for notification hold-off
     * in include/xen/interface/io/ring.h with req_event and req_prod
     * corresponding to event_idx + 1 and new_idx respectively.
     * Note also that req_event and req_prod in Xen start at 1,
     * event indexes in virtio start at 0. */
    return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
}
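
/* Worked example (illustrative): if the other side's event index is 5 and we
 * have just advanced our index from old = 4 to new_idx = 6, then
 * (6 - 5 - 1) = 0 < (6 - 4) = 2, so an event is needed because the published
 * index stepped over the requested value. The unsigned 16-bit arithmetic keeps
 * the comparison correct across index wrap-around. */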

struct virtqueue_split {
    struct virtqueue vq;
    struct vring vring;
    struct {
        u16 flags;
        u16 idx;
    } master_vring_avail;
    unsigned int num_unused;
    unsigned int num_added_since_kick;
    u16 first_unused;
    u16 last_used;
    void *opaque[];
};

#define splitvq(vq) ((struct virtqueue_split *)vq)

#pragma warning (pop)

/* Returns the index of the first unused descriptor */
static inline u16 get_unused_desc(struct virtqueue_split *vq)
{
    u16 idx = vq->first_unused;
    ASSERT(vq->num_unused > 0);

    vq->first_unused = vq->vring.desc[idx].next;
    vq->num_unused--;
    return idx;
}

/* Marks the descriptor chain starting at index idx as unused */
static inline void put_unused_desc_chain(struct virtqueue_split *vq, u16 idx)
{
    u16 start = idx;

    vq->opaque[idx] = NULL;
    while (vq->vring.desc[idx].flags & VIRTQ_DESC_F_NEXT) {
        idx = vq->vring.desc[idx].next;
        vq->num_unused++;
    }

    vq->vring.desc[idx].flags = VIRTQ_DESC_F_NEXT;
    vq->vring.desc[idx].next = vq->first_unused;
    vq->num_unused++;

    vq->first_unused = start;
}

/* Adds a buffer to a virtqueue, returns 0 on success, negative number on error */
static int virtqueue_add_buf_split(
    struct virtqueue *_vq,      /* the queue */
    struct scatterlist sg[],    /* sg array of length out + in */
    unsigned int out,           /* number of driver->device buffer descriptors in sg */
    unsigned int in,            /* number of device->driver buffer descriptors in sg */
    void *opaque,               /* later returned from virtqueue_get_buf */
    void *va_indirect,          /* VA of the indirect page or NULL */
    ULONGLONG phys_indirect)    /* PA of the indirect page or 0 */
{
    struct virtqueue_split *vq = splitvq(_vq);
    struct vring *vring = &vq->vring;
    unsigned int i;
    u16 idx;

    if (va_indirect && (out + in) > 1 && vq->num_unused > 0) {
        /* Use one indirect descriptor */
        struct vring_desc *desc = (struct vring_desc *)va_indirect;

        for (i = 0; i < out + in; i++) {
            desc[i].flags = (i < out ? 0 : VIRTQ_DESC_F_WRITE);
            desc[i].flags |= VIRTQ_DESC_F_NEXT;
            desc[i].addr = sg[i].physAddr.QuadPart;
            desc[i].len = sg[i].length;
            desc[i].next = (u16)i + 1;
        }
        desc[i - 1].flags &= ~VIRTQ_DESC_F_NEXT;

        idx = get_unused_desc(vq);
        vq->vring.desc[idx].flags = VIRTQ_DESC_F_INDIRECT;
        vq->vring.desc[idx].addr = phys_indirect;
        vq->vring.desc[idx].len = i * sizeof(struct vring_desc);

        vq->opaque[idx] = opaque;
    } else {
        u16 last_idx;

        /* Use out + in regular descriptors */
        if (out + in > vq->num_unused) {
            return -ENOSPC;
        }

        /* First descriptor */
        idx = last_idx = get_unused_desc(vq);
        vq->opaque[idx] = opaque;

        vring->desc[idx].addr = sg[0].physAddr.QuadPart;
        vring->desc[idx].len = sg[0].length;
        vring->desc[idx].flags = VIRTQ_DESC_F_NEXT;
        if (out == 0) {
            vring->desc[idx].flags |= VIRTQ_DESC_F_WRITE;
        }
        vring->desc[idx].next = vq->first_unused;

        /* The rest of descriptors */
        for (i = 1; i < out + in; i++) {
            last_idx = get_unused_desc(vq);

            vring->desc[last_idx].addr = sg[i].physAddr.QuadPart;
            vring->desc[last_idx].len = sg[i].length;
            vring->desc[last_idx].flags = VIRTQ_DESC_F_NEXT;
            if (i >= out) {
                vring->desc[last_idx].flags |= VIRTQ_DESC_F_WRITE;
            }
            vring->desc[last_idx].next = vq->first_unused;
        }
        vring->desc[last_idx].flags &= ~VIRTQ_DESC_F_NEXT;
    }

    /* Write the first descriptor into the available ring */
    vring->avail->ring[DESC_INDEX(vring->num, vq->master_vring_avail.idx)] = idx;
    KeMemoryBarrier();
    vring->avail->idx = ++vq->master_vring_avail.idx;
    vq->num_added_since_kick++;

    return 0;
}
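
/* Usage sketch (illustrative, not part of this file): a driver typically fills
 * a scatterlist and submits it through a wrapper such as virtqueue_add_buf(),
 * which dispatches to this implementation for split rings. The physical
 * addresses and the request context below are hypothetical names.
 *
 *   struct scatterlist sg[2];
 *   sg[0].physAddr.QuadPart = req_header_pa;  // driver->device (device reads)
 *   sg[0].length = sizeof(struct req_header);
 *   sg[1].physAddr.QuadPart = status_pa;      // device->driver (device writes)
 *   sg[1].length = sizeof(u8);
 *   if (virtqueue_add_buf(vq, sg, 1, 1, req_context, NULL, 0) < 0) {
 *       // out of descriptors; retry after some buffers are returned
 *   }
 */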

/* Gets the opaque pointer associated with a returned buffer, or NULL if no buffer is available */
static void *virtqueue_get_buf_split(
    struct virtqueue *_vq,  /* the queue */
    unsigned int *len)      /* number of bytes returned by the device */
{
    struct virtqueue_split *vq = splitvq(_vq);
    void *opaque;
    u16 idx;

    if (vq->last_used == (int)vq->vring.used->idx) {
        /* No descriptor index in the used ring */
        return NULL;
    }
    KeMemoryBarrier();

    idx = DESC_INDEX(vq->vring.num, vq->last_used);
    *len = vq->vring.used->ring[idx].len;

    /* Get the first used descriptor */
    idx = (u16)vq->vring.used->ring[idx].id;
    opaque = vq->opaque[idx];

    /* Put all descriptors back to the free list */
    put_unused_desc_chain(vq, idx);

    vq->last_used++;
    if (_vq->vdev->event_suppression_enabled && virtqueue_is_interrupt_enabled(_vq)) {
        vring_used_event(&vq->vring) = vq->last_used;
        KeMemoryBarrier();
    }

    ASSERT(opaque != NULL);
    return opaque;
}
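
/* Usage sketch (illustrative): completions are typically drained in the
 * interrupt/DPC path by looping until no more returned buffers are available.
 * process_completion() is a hypothetical driver routine.
 *
 *   unsigned int len;
 *   void *ctx;
 *   while ((ctx = virtqueue_get_buf(vq, &len)) != NULL) {
 *       process_completion(ctx, len);
 *   }
 */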

/* Returns true if at least one returned buffer is available, false otherwise */
static BOOLEAN virtqueue_has_buf_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    return (vq->last_used != vq->vring.used->idx);
}

/* Returns true if the device should be notified, false otherwise */
static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    bool wrap_around;
    u16 old, new;
    KeMemoryBarrier();

    wrap_around = (vq->num_added_since_kick >= (1 << 16));

    old = (u16)(vq->master_vring_avail.idx - vq->num_added_since_kick);
    new = vq->master_vring_avail.idx;
    vq->num_added_since_kick = 0;

    if (_vq->vdev->event_suppression_enabled) {
        return wrap_around || (bool)vring_need_event(vring_avail_event(&vq->vring), new, old);
    } else {
        return !(vq->vring.used->flags & VIRTQ_USED_F_NO_NOTIFY);
    }
}
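
/* Usage sketch (illustrative): after queuing one or more buffers, the device
 * is kicked only when the event-suppression logic says a notification is
 * needed; a virtqueue_kick()-style helper typically combines the two steps.
 *
 *   if (virtqueue_kick_prepare(vq)) {
 *       virtqueue_notify(vq);
 *   }
 */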

/* Notifies the device even if it's not necessary according to the event suppression logic */
static void virtqueue_kick_always_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    KeMemoryBarrier();
    vq->num_added_since_kick = 0;
    virtqueue_notify(_vq);
}

/* Enables interrupts on a virtqueue and returns false if the queue has at least one returned
 * buffer available to be fetched by virtqueue_get_buf, true otherwise */
static bool virtqueue_enable_cb_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    if (!virtqueue_is_interrupt_enabled(_vq)) {
        vq->master_vring_avail.flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
        if (!_vq->vdev->event_suppression_enabled)
        {
            vq->vring.avail->flags = vq->master_vring_avail.flags;
        }
    }

    vring_used_event(&vq->vring) = vq->last_used;
    KeMemoryBarrier();
    return (vq->last_used == vq->vring.used->idx);
}
/* Enables interrupts on a virtqueue after ~3/4 of the currently pushed buffers have been
 * returned, returns false if this condition already holds, true otherwise */
static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    u16 bufs;

    if (!virtqueue_is_interrupt_enabled(_vq)) {
        vq->master_vring_avail.flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
        if (!_vq->vdev->event_suppression_enabled)
        {
            vq->vring.avail->flags = vq->master_vring_avail.flags;
        }
    }

    /* Note that 3/4 is an arbitrary threshold */
    bufs = (u16)(vq->master_vring_avail.idx - vq->last_used) * 3 / 4;
    vring_used_event(&vq->vring) = vq->last_used + bufs;
    KeMemoryBarrier();
    return ((vq->vring.used->idx - vq->last_used) <= bufs);
}
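
/* Worked example (illustrative): with 40 buffers outstanding
 * (master_vring_avail.idx - last_used = 40), bufs = 30, so the used event
 * index is set to last_used + 30 and the device interrupts only after the
 * 31st of those buffers has been returned; if more than 30 are already in
 * the used ring, the function returns false so the driver polls instead. */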

/* Disables interrupts on a virtqueue */
static void virtqueue_disable_cb_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    if (virtqueue_is_interrupt_enabled(_vq)) {
        vq->master_vring_avail.flags |= VIRTQ_AVAIL_F_NO_INTERRUPT;
        if (!_vq->vdev->event_suppression_enabled)
        {
            vq->vring.avail->flags = vq->master_vring_avail.flags;
        }
    }
}

/* Returns true if interrupts are enabled on a virtqueue, false otherwise */
static BOOLEAN virtqueue_is_interrupt_enabled_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    return !(vq->master_vring_avail.flags & VIRTQ_AVAIL_F_NO_INTERRUPT);
}
/* Re-initializes an already initialized virtqueue */
static void virtqueue_shutdown_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    unsigned int num = vq->vring.num;
    void *pages = vq->vring.desc;
    unsigned int vring_align = _vq->vdev->addr ? PAGE_SIZE : SMP_CACHE_BYTES;

    RtlZeroMemory(pages, vring_size_split(num, vring_align));
    (void)vring_new_virtqueue_split(
        _vq->index,
        vq->vring.num,
        vring_align,
        _vq->vdev,
        pages,
        _vq->notification_cb,
        vq);
}

/* Gets the opaque pointer associated with a not-yet-returned buffer, or NULL if no buffer is
 * available; this helps drivers clean up all outstanding data on virtqueue shutdown */
static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
{
    struct virtqueue_split *vq = splitvq(_vq);
    u16 idx;
    void *opaque = NULL;

    for (idx = 0; idx < (u16)vq->vring.num; idx++) {
        opaque = vq->opaque[idx];
        if (opaque) {
            put_unused_desc_chain(vq, idx);
            vq->vring.avail->idx = --vq->master_vring_avail.idx;
            break;
        }
    }
    return opaque;
}

/* Returns the size of the virtqueue structure including
 * additional size for per-descriptor data */
unsigned int vring_control_block_size(u16 qsize, bool packed)
{
    unsigned int res;
    if (packed) {
        return vring_control_block_size_packed(qsize);
    }
    res = sizeof(struct virtqueue_split);
    res += sizeof(void *) * qsize;
    return res;
}

/* Initializes a new virtqueue using already allocated memory */
struct virtqueue *vring_new_virtqueue_split(
    unsigned int index,                  /* virtqueue index */
    unsigned int num,                    /* virtqueue size (always a power of 2) */
    unsigned int vring_align,            /* vring alignment requirement */
    VirtIODevice *vdev,                  /* the virtio device owning the queue */
    void *pages,                         /* vring memory */
    void (*notify)(struct virtqueue *),  /* notification callback */
    void *control)                       /* virtqueue memory */
{
    struct virtqueue_split *vq = splitvq(control);
    u16 i;

    if (DESC_INDEX(num, num) != 0) {
        DPrintf(0, "Virtqueue length %u is not a power of 2\n", num);
        return NULL;
    }

    RtlZeroMemory(vq, sizeof(*vq) + num * sizeof(void *));

    vring_init(&vq->vring, num, pages, vring_align);
    vq->vq.vdev = vdev;
    vq->vq.notification_cb = notify;
    vq->vq.index = index;

    /* Build a linked list of unused descriptors */
    vq->num_unused = num;
    vq->first_unused = 0;
    for (i = 0; i < num - 1; i++) {
        vq->vring.desc[i].flags = VIRTQ_DESC_F_NEXT;
        vq->vring.desc[i].next = i + 1;
    }
    vq->vq.avail_va = vq->vring.avail;
    vq->vq.used_va = vq->vring.used;
    vq->vq.add_buf = virtqueue_add_buf_split;
    vq->vq.detach_unused_buf = virtqueue_detach_unused_buf_split;
    vq->vq.disable_cb = virtqueue_disable_cb_split;
    vq->vq.enable_cb = virtqueue_enable_cb_split;
    vq->vq.enable_cb_delayed = virtqueue_enable_cb_delayed_split;
    vq->vq.get_buf = virtqueue_get_buf_split;
    vq->vq.has_buf = virtqueue_has_buf_split;
    vq->vq.is_interrupt_enabled = virtqueue_is_interrupt_enabled_split;
    vq->vq.kick_always = virtqueue_kick_always_split;
    vq->vq.kick_prepare = virtqueue_kick_prepare_split;
    vq->vq.shutdown = virtqueue_shutdown_split;
    return &vq->vq;
}
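
/* Setup sketch (illustrative): the transport layer typically sizes and
 * allocates the two memory blocks and then calls this function. The allocator
 * names below are hypothetical placeholders.
 *
 *   unsigned int ring_bytes = vring_size(num, PAGE_SIZE, false);
 *   unsigned int ctrl_bytes = vring_control_block_size((u16)num, false);
 *   void *pages = alloc_ring_pages(ring_bytes);   // physically contiguous, zeroed
 *   void *control = alloc_nonpaged(ctrl_bytes);
 *   struct virtqueue *vq = vring_new_virtqueue_split(index, num, PAGE_SIZE,
 *                                                    vdev, pages, notify_cb, control);
 */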

/* Negotiates virtio transport features */
void vring_transport_features(
    VirtIODevice *vdev,
    u64 *features)  /* points to device features on entry and driver accepted features on return */
{
    unsigned int i;

    for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
        if (i != VIRTIO_RING_F_INDIRECT_DESC &&
            i != VIRTIO_RING_F_EVENT_IDX &&
            i != VIRTIO_F_VERSION_1) {
            virtio_feature_disable(*features, i);
        }
    }
}

/* Returns the max number of scatter-gather elements that fit in an indirect page */
u32 virtio_get_indirect_page_capacity()
{
    return PAGE_SIZE / sizeof(struct vring_desc);
}
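
/* Worked example (illustrative): with a 4096-byte page and 16-byte
 * descriptors, one indirect page holds 4096 / 16 = 256 scatter-gather
 * elements. */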

unsigned long vring_size(unsigned int num, unsigned long align, bool packed)
{
    if (packed) {
        return vring_size_packed(num, align);
    } else {
        return vring_size_split(num, align);
    }
}