1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2021 Oxide Computer Company
39  */
40 
41 
42 #include <sys/disp.h>
43 
44 #include "viona_impl.h"
45 
/* Maximum queue size permitted for a virtqueue (power-of-two enforced later) */
#define	VRING_MAX_LEN		32768

/* Layout and sizing as defined in the spec for a legacy-style virtqueue */

#define	LEGACY_VQ_ALIGN		PAGESIZE

#define	LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))
/*
 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail
 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define	LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))
/*
 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
 */
#define	LEGACY_USED_SZ(qsz)	\
	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))

/* Byte offsets (from ring base) of the avail ring fields */
#define	LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
#define	LEGACY_AVAIL_IDX_OFF(qsz)	\
	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
#define	LEGACY_AVAIL_ENT_OFF(qsz, idx)	\
	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))

/*
 * Byte offsets of the used ring fields.  Per the legacy spec, the used ring
 * begins at the next LEGACY_VQ_ALIGN boundary after the avail ring.
 */
#define	LEGACY_USED_FLAGS_OFF(qsz)	\
	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define	LEGACY_USED_IDX_OFF(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define	LEGACY_USED_ENT_OFF(qsz, idx)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
	(idx) * sizeof (struct virtio_used))

/* Total (page-aligned) size and page count of a legacy virtqueue */
#define	LEGACY_VQ_SIZE(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + \
	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define	LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / PAGESIZE)

/*
 * Tracks a run of held guest pages along with the iovec entries which point
 * into their mappings.  Used while translating descriptor chains into iovecs.
 */
struct vq_held_region {
	struct iovec	*vhr_iov;
	/* Head/tail of the chain of vmm_page_t holds backing vhr_iov */
	vmm_page_t	*vhr_head;
	vmm_page_t	*vhr_tail;
	/* Length of iovec array supplied in `vhr_iov` */
	uint_t		vhr_niov;
	/*
	 * Index into vhr_iov, indicating the next "free" entry (following the
	 * last entry which has valid contents).
	 */
	uint_t		vhr_idx;
};
typedef struct vq_held_region vq_held_region_t;

static boolean_t viona_ring_map(viona_vring_t *);
static void viona_ring_unmap(viona_vring_t *);
static kthread_t *viona_create_worker(viona_vring_t *);
101 
102 static vmm_page_t *
103 vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
104 {
105 	ASSERT3P(ring->vr_lease, !=, NULL);
106 
107 	int prot = PROT_READ;
108 	if (writable) {
109 		prot |= PROT_WRITE;
110 	}
111 
112 	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
113 }
114 
115 /*
116  * Establish a hold on the page(s) which back the region of guest memory covered
117  * by [gpa, gpa + len).  The host-kernel-virtual pointers to those pages are
118  * stored in the iovec array supplied in `region`, along with the chain of
119  * vmm_page_t entries representing the held pages.  Since guest memory
120  * carries no guarantees of being physically contiguous (on the host), it is
121  * assumed that an iovec entry will be required for each PAGESIZE section
122  * covered by the specified `gpa` and `len` range.  For each iovec entry
123  * successfully populated by holding a page, `vhr_idx` will be incremented so it
124  * references the next available iovec entry (or `vhr_niov`, if the iovec array
125  * is full).  The responsibility for releasing the `vmm_page_t` chain (stored in
126  * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result.
127  */
static int
vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
    bool writable, vq_held_region_t *region)
{
	/* Portion of the first page covered by [gpa, gpa + len) */
	const uint32_t front_offset = gpa & PAGEOFFSET;
	const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
	uint_t pages = 1;
	vmm_page_t *vmp;
	caddr_t buf;

	ASSERT3U(region->vhr_idx, <, region->vhr_niov);

	/* Count the whole pages needed beyond the (possibly partial) first */
	if (front_len < len) {
		pages += P2ROUNDUP((uint64_t)(len - front_len),
		    PAGESIZE) / PAGESIZE;
	}
	/* Bail before holding anything if the iovec array cannot fit it all */
	if (pages > (region->vhr_niov - region->vhr_idx)) {
		return (E2BIG);
	}

	/* Hold and record the first (potentially unaligned) page */
	vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
	if (vmp == NULL) {
		return (EFAULT);
	}
	buf = (caddr_t)vmm_drv_page_readable(vmp);

	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
	region->vhr_iov[region->vhr_idx].iov_len = front_len;
	region->vhr_idx++;
	gpa += front_len;
	len -= front_len;
	/* Append the hold to the caller's chain (or start it) */
	if (region->vhr_head == NULL) {
		region->vhr_head = vmp;
		region->vhr_tail = vmp;
	} else {
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	/* Remaining pages are all page-aligned at the front */
	for (uint_t i = 1; i < pages; i++) {
		ASSERT3U(gpa & PAGEOFFSET, ==, 0);

		vmp = vq_page_hold(ring, gpa, writable);
		if (vmp == NULL) {
			/* Caller releases any holds already chained */
			return (EFAULT);
		}
		buf = (caddr_t)vmm_drv_page_readable(vmp);

		const uint32_t chunk_len = MIN(len, PAGESIZE);
		region->vhr_iov[region->vhr_idx].iov_base = buf;
		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
		region->vhr_idx++;
		gpa += chunk_len;
		len -= chunk_len;
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	return (0);
}
188 
189 static boolean_t
190 viona_ring_lease_expire_cb(void *arg)
191 {
192 	viona_vring_t *ring = arg;
193 
194 	mutex_enter(&ring->vr_lock);
195 	cv_broadcast(&ring->vr_cv);
196 	mutex_exit(&ring->vr_lock);
197 
198 	/* The lease will be broken asynchronously. */
199 	return (B_FALSE);
200 }
201 
202 static void
203 viona_ring_lease_drop(viona_vring_t *ring)
204 {
205 	ASSERT(MUTEX_HELD(&ring->vr_lock));
206 
207 	if (ring->vr_lease != NULL) {
208 		vmm_hold_t *hold = ring->vr_link->l_vm_hold;
209 
210 		ASSERT(hold != NULL);
211 
212 		/*
213 		 * Without an active lease, the ring mappings cannot be
214 		 * considered valid.
215 		 */
216 		viona_ring_unmap(ring);
217 
218 		vmm_drv_lease_break(hold, ring->vr_lease);
219 		ring->vr_lease = NULL;
220 	}
221 }
222 
223 boolean_t
224 viona_ring_lease_renew(viona_vring_t *ring)
225 {
226 	vmm_hold_t *hold = ring->vr_link->l_vm_hold;
227 
228 	ASSERT(hold != NULL);
229 	ASSERT(MUTEX_HELD(&ring->vr_lock));
230 
231 	viona_ring_lease_drop(ring);
232 
233 	/*
234 	 * Lease renewal will fail if the VM has requested that all holds be
235 	 * cleaned up.
236 	 */
237 	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
238 	    ring);
239 	if (ring->vr_lease != NULL) {
240 		/* A ring undergoing renewal will need valid guest mappings */
241 		if (ring->vr_pa != 0 && ring->vr_size != 0) {
242 			/*
243 			 * If new mappings cannot be established, consider the
244 			 * lease renewal a failure.
245 			 */
246 			if (!viona_ring_map(ring)) {
247 				viona_ring_lease_drop(ring);
248 				return (B_FALSE);
249 			}
250 		}
251 	}
252 	return (ring->vr_lease != NULL);
253 }
254 
255 void
256 viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
257 {
258 	ring->vr_link = link;
259 	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
260 	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
261 	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
262 	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
263 }
264 
265 static void
266 viona_ring_misc_free(viona_vring_t *ring)
267 {
268 	const uint_t qsz = ring->vr_size;
269 
270 	viona_tx_ring_free(ring, qsz);
271 }
272 
273 void
274 viona_ring_free(viona_vring_t *ring)
275 {
276 	mutex_destroy(&ring->vr_lock);
277 	cv_destroy(&ring->vr_cv);
278 	mutex_destroy(&ring->vr_a_mutex);
279 	mutex_destroy(&ring->vr_u_mutex);
280 	ring->vr_link = NULL;
281 }
282 
/*
 * Initialize ring `idx` on `link` with queue size `qsz` and guest-physical
 * base address `pa`: acquire a lease, map the ring, and spawn its worker
 * thread.  Returns 0 on success or EINVAL/EBUSY/ENOMEM on failure.
 */
int
viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa)
{
	viona_vring_t *ring;
	kthread_t *t;
	int err = 0;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	/* Queue size must be a non-zero power of two within spec limits */
	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
		return (EINVAL);
	}
	/* Legacy virtqueues must be page-aligned */
	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state != VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (EBUSY);
	}
	VERIFY(ring->vr_state_flags == 0);

	ring->vr_lease = NULL;
	if (!viona_ring_lease_renew(ring)) {
		err = EBUSY;
		goto fail;
	}

	ring->vr_size = qsz;
	ring->vr_mask = (ring->vr_size - 1);
	ring->vr_pa = pa;
	if (!viona_ring_map(ring)) {
		err = EINVAL;
		goto fail;
	}

	/* Initialize queue indexes */
	ring->vr_cur_aidx = 0;
	ring->vr_cur_uidx = 0;

	/* Only the TX ring needs descriptor-tracking resources */
	if (idx == VIONA_VQ_TX) {
		viona_tx_ring_alloc(ring, qsz);
	}

	/* Zero out MSI-X configuration */
	ring->vr_msi_addr = 0;
	ring->vr_msi_msg = 0;

	/* Clear the stats */
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));

	t = viona_create_worker(ring);
	if (t == NULL) {
		err = ENOMEM;
		goto fail;
	}
	ring->vr_worker_thread = t;
	ring->vr_state = VRS_SETUP;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);
	return (0);

fail:
	/* Unwind any partially-established state back to RESET conditions */
	viona_ring_lease_drop(ring);
	viona_ring_misc_free(ring);
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	mutex_exit(&ring->vr_lock);
	return (err);
}
357 
/*
 * Request that a ring stop and wait for it to reach the RESET state.  When
 * `heed_signals` is set, a pending signal interrupts the wait with EINTR
 * (unless the ring happened to reach RESET anyway).
 */
int
viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
{
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (0);
	}

	/* Ask the worker to stop, if not already requested */
	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
		ring->vr_state_flags |= VRSF_REQ_STOP;
		cv_broadcast(&ring->vr_cv);
	}
	while (ring->vr_state != VRS_RESET) {
		if (!heed_signals) {
			cv_wait(&ring->vr_cv, &ring->vr_lock);
		} else {
			int rs;

			/* rs == 0 indicates interruption by a signal */
			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
			if (rs <= 0 && ring->vr_state != VRS_RESET) {
				mutex_exit(&ring->vr_lock);
				return (EINTR);
			}
		}
	}
	mutex_exit(&ring->vr_lock);
	return (0);
}
387 
/*
 * Hold and map every guest page backing the legacy virtqueue described by
 * vr_pa/vr_size, recording the kernel-virtual addresses in vr_map_pages and
 * chaining the page holds from vr_map_hold.  On any failure, all progress is
 * undone and B_FALSE is returned.
 */
static boolean_t
viona_ring_map(viona_vring_t *ring)
{
	const uint16_t qsz = ring->vr_size;
	uintptr_t pa = ring->vr_pa;

	ASSERT3U(qsz, !=, 0);
	ASSERT3U(qsz, <=, VRING_MAX_LEN);
	ASSERT3U(pa, !=, 0);
	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
	ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3P(ring->vr_map_pages, ==, NULL);

	const uint_t npages = LEGACY_VQ_PAGES(qsz);
	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);

	vmm_page_t *prev = NULL;

	for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
		vmm_page_t *vmp;

		vmp = vq_page_hold(ring, pa, true);
		if (vmp == NULL) {
			/* Releases everything held so far */
			viona_ring_unmap(ring);
			return (B_FALSE);
		}

		/*
		 * Keep the first page as the head of the chain, appending all
		 * subsequent pages to the tail.
		 */
		if (prev == NULL) {
			ring->vr_map_hold = vmp;
		} else {
			vmm_drv_page_chain(prev, vmp);
		}
		prev = vmp;
		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
	}

	return (B_TRUE);
}
431 
432 static void
433 viona_ring_unmap(viona_vring_t *ring)
434 {
435 	ASSERT(MUTEX_HELD(&ring->vr_lock));
436 
437 	void **map = ring->vr_map_pages;
438 	if (map != NULL) {
439 		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
440 		kmem_free(map, npages * sizeof (void *));
441 		ring->vr_map_pages = NULL;
442 
443 		vmm_drv_page_release_chain(ring->vr_map_hold);
444 		ring->vr_map_hold = NULL;
445 	} else {
446 		ASSERT3P(ring->vr_map_hold, ==, NULL);
447 	}
448 }
449 
450 static inline void *
451 viona_ring_addr(viona_vring_t *ring, uint_t off)
452 {
453 	ASSERT3P(ring->vr_map_pages, !=, NULL);
454 	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);
455 
456 	const uint_t page_num = off / PAGESIZE;
457 	const uint_t page_off = off % PAGESIZE;
458 	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
459 }
460 
/*
 * Deliver a "used ring updated" notification to the guest for this ring,
 * either directly via MSI or by waking userspace pollers.  When
 * `skip_flags_check` is clear, the guest's VRING_AVAIL_F_NO_INTERRUPT flag
 * suppresses the notification.
 */
void
viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
{
	if (!skip_flags_check) {
		volatile uint16_t *avail_flags = viona_ring_addr(ring,
		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));

		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
			return;
		}
	}

	/* Snapshot MSI config under the lock */
	mutex_enter(&ring->vr_lock);
	uint64_t addr = ring->vr_msi_addr;
	uint64_t msg = ring->vr_msi_msg;
	mutex_exit(&ring->vr_lock);
	if (addr != 0) {
		/* Deliver the interrupt directly, if so configured... */
		/*
		 * NOTE(review): vr_lease is read here outside vr_lock —
		 * presumably callers guarantee the lease remains valid for
		 * the duration; confirm against the worker lifecycle.
		 */
		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
	} else {
		/* ... otherwise, leave it to userspace */
		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
		}
	}
}
487 
/*
 * Per-ring worker thread entry point.  Drives the ring through its state
 * machine: SETUP -> INIT (waiting for start) -> RUN (RX or TX processing)
 * -> STOP, then tears the ring down to RESET and exits the LWP.
 */
static void
viona_worker(void *arg)
{
	viona_vring_t *ring = (viona_vring_t *)arg;
	viona_link_t *link = ring->vr_link;
	proc_t *p = ttoproc(curthread);

	mutex_enter(&ring->vr_lock);
	VERIFY3U(ring->vr_state, ==, VRS_SETUP);

	/* Bail immediately if ring shutdown or process exit was requested */
	if (VRING_NEED_BAIL(ring, p)) {
		goto cleanup;
	}

	/* Report worker thread as alive and notify creator */
	ring->vr_state = VRS_INIT;
	cv_broadcast(&ring->vr_cv);

	/* Wait (interruptibly) until a state-change request arrives */
	while (ring->vr_state_flags == 0) {
		/*
		 * Keeping lease renewals timely while waiting for the ring to
		 * be started is important for avoiding deadlocks.
		 */
		if (vmm_drv_lease_expired(ring->vr_lease)) {
			if (!viona_ring_lease_renew(ring)) {
				goto cleanup;
			}
		}

		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (VRING_NEED_BAIL(ring, p)) {
			goto cleanup;
		}
	}

	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
	ring->vr_state = VRS_RUN;
	ring->vr_state_flags &= ~VRSF_REQ_START;

	/* Ensure ring lease is valid first */
	if (vmm_drv_lease_expired(ring->vr_lease)) {
		if (!viona_ring_lease_renew(ring)) {
			goto cleanup;
		}
	}

	/* Process actual work */
	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
		viona_worker_rx(ring, link);
	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
		viona_worker_tx(ring, link);
	} else {
		panic("unexpected ring: %p", (void *)ring);
	}

	VERIFY3U(ring->vr_state, ==, VRS_STOP);

cleanup:
	if (ring->vr_txdesb != NULL) {
		/*
		 * Transmit activity must be entirely concluded before the
		 * associated descriptors can be cleaned up.
		 */
		VERIFY(ring->vr_xfer_outstanding == 0);
	}
	viona_ring_misc_free(ring);

	/* Return the ring to a pristine RESET state and wake any waiters */
	viona_ring_lease_drop(ring);
	ring->vr_cur_aidx = 0;
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_state = VRS_RESET;
	ring->vr_state_flags = 0;
	ring->vr_worker_thread = NULL;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	/* Worker LWPs exit here; this call does not return */
	mutex_enter(&ttoproc(curthread)->p_lock);
	lwp_exit();
}
571 
/*
 * Create the worker LWP for a ring in the current process.  The thread is
 * created stopped, marked as a kernel-managed thread (all signals held), and
 * then released.  Returns NULL if LWP creation fails.
 */
static kthread_t *
viona_create_worker(viona_vring_t *ring)
{
	k_sigset_t hold_set;
	proc_t *p = curproc;
	kthread_t *t;
	klwp_t *lwp;

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_state == VRS_RESET);

	/* Hold all signals so the worker is not disturbed by them */
	sigfillset(&hold_set);
	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
	if (lwp == NULL) {
		return (NULL);
	}

	t = lwptot(lwp);
	mutex_enter(&p->p_lock);
	/* Exempt from lwp holds; flag as a kernel-internal thread */
	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
	lwp_create_done(t);
	mutex_exit(&p->p_lock);

	return (t);
}
598 
599 void
600 vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
601 {
602 	const uint_t entry_off = idx * sizeof (struct virtio_desc);
603 
604 	ASSERT3U(idx, <, ring->vr_size);
605 
606 	bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
607 }
608 
609 static uint16_t
610 vq_read_avail(viona_vring_t *ring, uint16_t idx)
611 {
612 	ASSERT3U(idx, <, ring->vr_size);
613 
614 	volatile uint16_t *avail_ent =
615 	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
616 	return (*avail_ent);
617 }
618 
619 /*
620  * Given a buffer descriptor `desc`, attempt to map the pages backing that
621  * region of guest physical memory, taking into account that there are no
622  * guarantees about guest-contiguous pages being host-contiguous.
623  */
624 static int
625 vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
626     vq_held_region_t *region)
627 {
628 	int err;
629 
630 	if (desc->vd_len == 0) {
631 		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
632 		    uint32_t, desc->vd_len);
633 		VIONA_RING_STAT_INCR(ring, desc_bad_len);
634 		return (EINVAL);
635 	}
636 
637 	err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
638 	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
639 	switch (err) {
640 	case E2BIG:
641 		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
642 		VIONA_RING_STAT_INCR(ring, too_many_desc);
643 		break;
644 	case EFAULT:
645 		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
646 		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
647 		break;
648 	default:
649 		break;
650 	}
651 
652 	return (err);
653 }
654 
655 /*
656  * Walk an indirect buffer descriptor `desc`, attempting to map the pages
657  * backing the regions of guest memory covered by its contituent descriptors.
658  */
static int
vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);

	/*
	 * Reject indirect tables whose length is not a multiple of the
	 * descriptor size, which are empty or oversized, or whose address
	 * range wraps the 64-bit space.
	 */
	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
	    indir_count > ring->vr_size ||
	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, indir_bad_len);
		return (EINVAL);
	}

	uint16_t indir_next = 0;
	const uint8_t *buf = NULL;
	/* GPA of the page currently held/mapped in `vmp`/`buf` */
	uint64_t buf_gpa = UINT64_MAX;
	vmm_page_t *vmp = NULL;
	int err = 0;

	for (;;) {
		uint64_t indir_gpa =
		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
		uint64_t indir_page = indir_gpa & PAGEMASK;
		struct virtio_desc vp;

		/*
		 * Get a mapping for the page that the next indirect descriptor
		 * resides in, if it has not already been done.
		 */
		if (indir_page != buf_gpa) {
			if (vmp != NULL) {
				vmm_drv_page_release(vmp);
			}
			vmp = vq_page_hold(ring, indir_page, false);
			if (vmp == NULL) {
				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
				err = EFAULT;
				break;
			}
			buf_gpa = indir_page;
			buf = vmm_drv_page_readable(vmp);
		}

		/*
		 * A copy of the indirect descriptor is made here, rather than
		 * simply using a reference pointer.  This prevents malicious or
		 * erroneous guest writes to the descriptor from fooling the
		 * flags/bounds verification through a race.
		 */
		bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));

		/* Nested indirection is forbidden by the spec */
		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
			err = EINVAL;
			break;
		} else if (vp.vd_len == 0) {
			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
			    uint32_t, vp.vd_len);
			VIONA_RING_STAT_INCR(ring, desc_bad_len);
			err = EINVAL;
			break;
		}

		err = vq_map_desc_bufs(ring, &vp, region);
		if (err != 0) {
			break;
		}

		/* Successfully reach the end of the indir chain */
		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
			break;
		}
		/* No room left in the caller's iovec array */
		if (region->vhr_idx >= region->vhr_niov) {
			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, too_many_desc);
			err = E2BIG;
			break;
		}

		indir_next = vp.vd_next;
		/* The next index must lie within the indirect table */
		if (indir_next >= indir_count) {
			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
			    uint16_t, indir_next, uint16_t, indir_count);
			VIONA_RING_STAT_INCR(ring, indir_bad_next);
			err = EINVAL;
			break;
		}
	}

	/* Drop the hold on the last indirect-table page, if any */
	if (vmp != NULL) {
		vmm_drv_page_release(vmp);
	}
	return (err);
}
757 
/*
 * Pop the next available descriptor chain from the ring, mapping its buffers
 * into `iov` (up to `niov` entries).  On success, returns the number of iovec
 * entries populated, stores the chain's head index in `cookie`, and hands the
 * chain of page holds to the caller via `chain`.  Returns 0 when no
 * descriptors are available, or -1 on error (all holds released internally).
 */
int
vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
    uint16_t *cookie, vmm_page_t **chain)
{
	uint16_t ndesc, idx, head, next;
	struct virtio_desc vdir;
	vq_held_region_t region = {
		.vhr_niov = niov,
		.vhr_iov = iov,
	};

	ASSERT(iov != NULL);
	ASSERT(niov > 0 && niov < INT_MAX);
	ASSERT(*chain == NULL);

	mutex_enter(&ring->vr_a_mutex);
	idx = ring->vr_cur_aidx;
	ndesc = viona_ring_num_avail(ring);

	if (ndesc == 0) {
		mutex_exit(&ring->vr_a_mutex);
		return (0);
	}
	if (ndesc > ring->vr_size) {
		/*
		 * Despite the fact that the guest has provided an 'avail_idx'
		 * which indicates that an impossible number of descriptors are
		 * available, continue on and attempt to process the next one.
		 *
		 * The transgression will not escape the probe or stats though.
		 */
		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
		    uint16_t, ndesc);
		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
	}

	head = vq_read_avail(ring, idx & ring->vr_mask);
	next = head;

	/* Walk the descriptor chain starting at `head` */
	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
		if (next >= ring->vr_size) {
			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
			    uint16_t, next);
			VIONA_RING_STAT_INCR(ring, bad_idx);
			break;
		}

		vq_read_desc(ring, next, &vdir);
		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		} else {
			/*
			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
			 *   and VIRTQ_DESC_F_NEXT in `flags`.
			 */
			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
				VIONA_PROBE3(indir_bad_next,
				    viona_vring_t *, ring,
				    uint16_t, next, uint16_t, 0);
				VIONA_RING_STAT_INCR(ring, indir_bad_next);
				break;
			}

			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		}

		/* End of the chain: commit the consumption and succeed */
		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
			ring->vr_cur_aidx++;
			mutex_exit(&ring->vr_a_mutex);

			*cookie = head;
			*chain = region.vhr_head;
			return (region.vhr_idx);
		}
	}

	mutex_exit(&ring->vr_a_mutex);
	if (region.vhr_head != NULL) {
		/*
		 * If any pages were held prior to encountering an error, we
		 * must release them now.
		 */
		vmm_drv_page_release_chain(region.vhr_head);
	}
	return (-1);
}
849 
850 
851 static void
852 vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
853     uint32_t len)
854 {
855 	/*
856 	 * In a larger ring, entry could be split across pages, so be sure to
857 	 * account for that when configuring the transfer by looking up the ID
858 	 * and length addresses separately, rather than an address for a
859 	 * combined `struct virtio_used`.
860 	 */
861 	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
862 	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
863 	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
864 	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);
865 
866 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
867 
868 	*idp = cookie;
869 	*lenp = len;
870 }
871 
872 static void
873 vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
874 {
875 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
876 
877 	volatile uint16_t *used_idx =
878 	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
879 	*used_idx = idx;
880 }
881 
882 void
883 vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
884 {
885 	uint16_t uidx;
886 
887 	mutex_enter(&ring->vr_u_mutex);
888 
889 	uidx = ring->vr_cur_uidx;
890 	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
891 	uidx++;
892 	membar_producer();
893 
894 	vq_write_used_idx(ring, uidx);
895 	ring->vr_cur_uidx = uidx;
896 
897 	mutex_exit(&ring->vr_u_mutex);
898 }
899 
900 void
901 vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
902 {
903 	uint16_t uidx;
904 
905 	mutex_enter(&ring->vr_u_mutex);
906 
907 	uidx = ring->vr_cur_uidx;
908 
909 	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
910 		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
911 		    elem[i].len);
912 	}
913 
914 	membar_producer();
915 	vq_write_used_idx(ring, uidx);
916 	ring->vr_cur_uidx = uidx;
917 
918 	mutex_exit(&ring->vr_u_mutex);
919 }
920 
921 /*
922  * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
923  */
924 void
925 viona_ring_disable_notify(viona_vring_t *ring)
926 {
927 	volatile uint16_t *used_flags =
928 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
929 
930 	*used_flags |= VRING_USED_F_NO_NOTIFY;
931 }
932 
933 /*
934  * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
935  */
936 void
937 viona_ring_enable_notify(viona_vring_t *ring)
938 {
939 	volatile uint16_t *used_flags =
940 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
941 
942 	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
943 }
944 
945 /*
946  * Return the number of available descriptors in the vring taking care of the
947  * 16-bit index wraparound.
948  *
949  * Note: If the number of apparently available descriptors is larger than the
950  * ring size (due to guest misbehavior), this check will still report the
951  * positive count of descriptors.
952  */
953 uint16_t
954 viona_ring_num_avail(viona_vring_t *ring)
955 {
956 	volatile uint16_t *avail_idx =
957 	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));
958 
959 	return (*avail_idx - ring->vr_cur_aidx);
960 }
961