1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2022 Oxide Computer Company
39  */
40 
41 
42 #include <sys/disp.h>
43 
44 #include "viona_impl.h"
45 
/* Largest ring size (in descriptors) accepted by viona_ring_init() */
#define	VRING_MAX_LEN		32768

/* Layout and sizing as defined in the spec for a legacy-style virtqueue */

/*
 * Alignment demanded of the ring's base address, and of the start of the used
 * ring within the virtqueue.
 */
#define	LEGACY_VQ_ALIGN		PAGESIZE

/* Size (in bytes) of a descriptor table holding `qsz` entries */
#define	LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))
/*
 * Available ring consists of flags (uint16_t), avail_idx (uint16_t), qsz avail
 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define	LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))
/*
 * Used ring consists of flags (uint16_t), used_idx (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
 */
#define	LEGACY_USED_SZ(qsz)	\
	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))

/*
 * Byte offsets, relative to the start of the virtqueue, of the fields in the
 * available ring.  It begins directly after the descriptor table, with the
 * flags field first, then the index, then the array of entries.
 */
#define	LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
#define	LEGACY_AVAIL_IDX_OFF(qsz)	\
	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
#define	LEGACY_AVAIL_ENT_OFF(qsz, idx)	\
	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))

/*
 * Byte offsets, relative to the start of the virtqueue, of the fields in the
 * used ring.  It begins at the first LEGACY_VQ_ALIGN boundary at or beyond the
 * end of the available ring, again with flags first, then the index, then the
 * array of entries.
 */
#define	LEGACY_USED_FLAGS_OFF(qsz)	\
	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define	LEGACY_USED_IDX_OFF(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define	LEGACY_USED_ENT_OFF(qsz, idx)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
	(idx) * sizeof (struct virtio_used))

/*
 * Total footprint of the virtqueue: everything up to the used ring, plus the
 * used ring itself rounded up to LEGACY_VQ_ALIGN.  Expressed both in bytes and
 * in pages.
 */
#define	LEGACY_VQ_SIZE(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + \
	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define	LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / PAGESIZE)
83 
/*
 * Bookkeeping for a set of held guest pages and the iovec entries which
 * describe them.  It is filled in incrementally by vq_region_hold() as
 * descriptors are processed.
 */
struct vq_held_region {
	/* Caller-supplied iovec array which receives the mapped addresses */
	struct iovec	*vhr_iov;
	/* First page in the chain of held pages (NULL if none held yet) */
	vmm_page_t	*vhr_head;
	/* Last page in the chain, to which newly held pages are appended */
	vmm_page_t	*vhr_tail;
	/* Length of iovec array supplied in `vhr_iov` */
	uint_t		vhr_niov;
	/*
	 * Index into vhr_iov, indicating the next "free" entry (following the
	 * last entry which has valid contents).
	 */
	uint_t		vhr_idx;
};
typedef struct vq_held_region vq_held_region_t;
97 
/*
 * Forward declarations for file-local routines which are referenced before
 * their definitions appear below.
 */
static boolean_t viona_ring_map(viona_vring_t *);
static void viona_ring_unmap(viona_vring_t *);
static kthread_t *viona_create_worker(viona_vring_t *);
101 
102 static vmm_page_t *
103 vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
104 {
105 	ASSERT3P(ring->vr_lease, !=, NULL);
106 
107 	int prot = PROT_READ;
108 	if (writable) {
109 		prot |= PROT_WRITE;
110 	}
111 
112 	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
113 }
114 
115 /*
116  * Establish a hold on the page(s) which back the region of guest memory covered
117  * by [gpa, gpa + len).  The host-kernel-virtual pointers to those pages are
118  * stored in the iovec array supplied in `region`, along with the chain of
119  * vmm_page_t entries representing the held pages.  Since guest memory
120  * carries no guarantees of being physically contiguous (on the host), it is
121  * assumed that an iovec entry will be required for each PAGESIZE section
122  * covered by the specified `gpa` and `len` range.  For each iovec entry
123  * successfully populated by holding a page, `vhr_idx` will be incremented so it
124  * references the next available iovec entry (or `vhr_niov`, if the iovec array
125  * is full).  The responsibility for releasing the `vmm_page_t` chain (stored in
126  * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result.
127  */
128 static int
129 vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
130     bool writable, vq_held_region_t *region)
131 {
132 	const uint32_t front_offset = gpa & PAGEOFFSET;
133 	const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
134 	uint_t pages = 1;
135 	vmm_page_t *vmp;
136 	caddr_t buf;
137 
138 	ASSERT3U(region->vhr_idx, <, region->vhr_niov);
139 
140 	if (front_len < len) {
141 		pages += P2ROUNDUP((uint64_t)(len - front_len),
142 		    PAGESIZE) / PAGESIZE;
143 	}
144 	if (pages > (region->vhr_niov - region->vhr_idx)) {
145 		return (E2BIG);
146 	}
147 
148 	vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
149 	if (vmp == NULL) {
150 		return (EFAULT);
151 	}
152 	buf = (caddr_t)vmm_drv_page_readable(vmp);
153 
154 	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
155 	region->vhr_iov[region->vhr_idx].iov_len = front_len;
156 	region->vhr_idx++;
157 	gpa += front_len;
158 	len -= front_len;
159 	if (region->vhr_head == NULL) {
160 		region->vhr_head = vmp;
161 		region->vhr_tail = vmp;
162 	} else {
163 		vmm_drv_page_chain(region->vhr_tail, vmp);
164 		region->vhr_tail = vmp;
165 	}
166 
167 	for (uint_t i = 1; i < pages; i++) {
168 		ASSERT3U(gpa & PAGEOFFSET, ==, 0);
169 
170 		vmp = vq_page_hold(ring, gpa, writable);
171 		if (vmp == NULL) {
172 			return (EFAULT);
173 		}
174 		buf = (caddr_t)vmm_drv_page_readable(vmp);
175 
176 		const uint32_t chunk_len = MIN(len, PAGESIZE);
177 		region->vhr_iov[region->vhr_idx].iov_base = buf;
178 		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
179 		region->vhr_idx++;
180 		gpa += chunk_len;
181 		len -= chunk_len;
182 		vmm_drv_page_chain(region->vhr_tail, vmp);
183 		region->vhr_tail = vmp;
184 	}
185 
186 	return (0);
187 }
188 
189 static boolean_t
190 viona_ring_lease_expire_cb(void *arg)
191 {
192 	viona_vring_t *ring = arg;
193 
194 	mutex_enter(&ring->vr_lock);
195 	cv_broadcast(&ring->vr_cv);
196 	mutex_exit(&ring->vr_lock);
197 
198 	/* The lease will be broken asynchronously. */
199 	return (B_FALSE);
200 }
201 
202 static void
203 viona_ring_lease_drop(viona_vring_t *ring)
204 {
205 	ASSERT(MUTEX_HELD(&ring->vr_lock));
206 
207 	if (ring->vr_lease != NULL) {
208 		vmm_hold_t *hold = ring->vr_link->l_vm_hold;
209 
210 		ASSERT(hold != NULL);
211 
212 		/*
213 		 * Without an active lease, the ring mappings cannot be
214 		 * considered valid.
215 		 */
216 		viona_ring_unmap(ring);
217 
218 		vmm_drv_lease_break(hold, ring->vr_lease);
219 		ring->vr_lease = NULL;
220 	}
221 }
222 
223 boolean_t
224 viona_ring_lease_renew(viona_vring_t *ring)
225 {
226 	vmm_hold_t *hold = ring->vr_link->l_vm_hold;
227 
228 	ASSERT(hold != NULL);
229 	ASSERT(MUTEX_HELD(&ring->vr_lock));
230 
231 	viona_ring_lease_drop(ring);
232 
233 	/*
234 	 * Lease renewal will fail if the VM has requested that all holds be
235 	 * cleaned up.
236 	 */
237 	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
238 	    ring);
239 	if (ring->vr_lease != NULL) {
240 		/* A ring undergoing renewal will need valid guest mappings */
241 		if (ring->vr_pa != 0 && ring->vr_size != 0) {
242 			/*
243 			 * If new mappings cannot be established, consider the
244 			 * lease renewal a failure.
245 			 */
246 			if (!viona_ring_map(ring)) {
247 				viona_ring_lease_drop(ring);
248 				return (B_FALSE);
249 			}
250 		}
251 	}
252 	return (ring->vr_lease != NULL);
253 }
254 
255 void
256 viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
257 {
258 	ring->vr_link = link;
259 	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
260 	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
261 	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
262 	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
263 }
264 
265 static void
266 viona_ring_misc_free(viona_vring_t *ring)
267 {
268 	const uint_t qsz = ring->vr_size;
269 
270 	viona_tx_ring_free(ring, qsz);
271 }
272 
273 void
274 viona_ring_free(viona_vring_t *ring)
275 {
276 	mutex_destroy(&ring->vr_lock);
277 	cv_destroy(&ring->vr_cv);
278 	mutex_destroy(&ring->vr_a_mutex);
279 	mutex_destroy(&ring->vr_u_mutex);
280 	ring->vr_link = NULL;
281 }
282 
283 int
284 viona_ring_init(viona_link_t *link, uint16_t idx,
285     const struct viona_ring_params *params)
286 {
287 	viona_vring_t *ring;
288 	kthread_t *t;
289 	int err = 0;
290 	const uint16_t qsz = params->vrp_size;
291 	const uint64_t pa = params->vrp_pa;
292 
293 	if (idx >= VIONA_VQ_MAX) {
294 		return (EINVAL);
295 	}
296 
297 	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
298 		return (EINVAL);
299 	}
300 	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
301 		return (EINVAL);
302 	}
303 
304 	ring = &link->l_vrings[idx];
305 	mutex_enter(&ring->vr_lock);
306 	if (ring->vr_state != VRS_RESET) {
307 		mutex_exit(&ring->vr_lock);
308 		return (EBUSY);
309 	}
310 	VERIFY(ring->vr_state_flags == 0);
311 
312 	ring->vr_lease = NULL;
313 	if (!viona_ring_lease_renew(ring)) {
314 		err = EBUSY;
315 		goto fail;
316 	}
317 
318 	ring->vr_size = qsz;
319 	ring->vr_mask = (ring->vr_size - 1);
320 	ring->vr_pa = pa;
321 	if (!viona_ring_map(ring)) {
322 		err = EINVAL;
323 		goto fail;
324 	}
325 
326 	/* Initialize queue indexes */
327 	ring->vr_cur_aidx = params->vrp_avail_idx;
328 	ring->vr_cur_uidx = params->vrp_used_idx;
329 
330 	if (idx == VIONA_VQ_TX) {
331 		viona_tx_ring_alloc(ring, qsz);
332 	}
333 
334 	/* Zero out MSI-X configuration */
335 	ring->vr_msi_addr = 0;
336 	ring->vr_msi_msg = 0;
337 
338 	/* Clear the stats */
339 	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
340 
341 	t = viona_create_worker(ring);
342 	if (t == NULL) {
343 		err = ENOMEM;
344 		goto fail;
345 	}
346 	ring->vr_worker_thread = t;
347 	ring->vr_state = VRS_SETUP;
348 	cv_broadcast(&ring->vr_cv);
349 	mutex_exit(&ring->vr_lock);
350 	return (0);
351 
352 fail:
353 	viona_ring_lease_drop(ring);
354 	viona_ring_misc_free(ring);
355 	ring->vr_size = 0;
356 	ring->vr_mask = 0;
357 	ring->vr_pa = 0;
358 	ring->vr_cur_aidx = 0;
359 	ring->vr_cur_uidx = 0;
360 	mutex_exit(&ring->vr_lock);
361 	return (err);
362 }
363 
364 int
365 viona_ring_get_state(viona_link_t *link, uint16_t idx,
366     struct viona_ring_params *params)
367 {
368 	viona_vring_t *ring;
369 
370 	if (idx >= VIONA_VQ_MAX) {
371 		return (EINVAL);
372 	}
373 
374 	ring = &link->l_vrings[idx];
375 	mutex_enter(&ring->vr_lock);
376 
377 	params->vrp_size = ring->vr_size;
378 	params->vrp_pa = ring->vr_pa;
379 
380 	if (ring->vr_state == VRS_RUN) {
381 		/* On a running ring, we must heed the avail/used locks */
382 		mutex_enter(&ring->vr_a_mutex);
383 		params->vrp_avail_idx = ring->vr_cur_aidx;
384 		mutex_exit(&ring->vr_a_mutex);
385 		mutex_enter(&ring->vr_u_mutex);
386 		params->vrp_used_idx = ring->vr_cur_uidx;
387 		mutex_exit(&ring->vr_u_mutex);
388 	} else {
389 		/* Otherwise vr_lock is adequate protection */
390 		params->vrp_avail_idx = ring->vr_cur_aidx;
391 		params->vrp_used_idx = ring->vr_cur_uidx;
392 	}
393 
394 	mutex_exit(&ring->vr_lock);
395 
396 	return (0);
397 }
398 
399 int
400 viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
401 {
402 	mutex_enter(&ring->vr_lock);
403 	if (ring->vr_state == VRS_RESET) {
404 		mutex_exit(&ring->vr_lock);
405 		return (0);
406 	}
407 
408 	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
409 		ring->vr_state_flags |= VRSF_REQ_STOP;
410 		cv_broadcast(&ring->vr_cv);
411 	}
412 	while (ring->vr_state != VRS_RESET) {
413 		if (!heed_signals) {
414 			cv_wait(&ring->vr_cv, &ring->vr_lock);
415 		} else {
416 			int rs;
417 
418 			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
419 			if (rs <= 0 && ring->vr_state != VRS_RESET) {
420 				mutex_exit(&ring->vr_lock);
421 				return (EINTR);
422 			}
423 		}
424 	}
425 	mutex_exit(&ring->vr_lock);
426 	return (0);
427 }
428 
429 static boolean_t
430 viona_ring_map(viona_vring_t *ring)
431 {
432 	const uint16_t qsz = ring->vr_size;
433 	uintptr_t pa = ring->vr_pa;
434 
435 	ASSERT3U(qsz, !=, 0);
436 	ASSERT3U(qsz, <=, VRING_MAX_LEN);
437 	ASSERT3U(pa, !=, 0);
438 	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
439 	ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
440 	ASSERT(MUTEX_HELD(&ring->vr_lock));
441 	ASSERT3P(ring->vr_map_pages, ==, NULL);
442 
443 	const uint_t npages = LEGACY_VQ_PAGES(qsz);
444 	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);
445 
446 	vmm_page_t *prev = NULL;
447 
448 	for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
449 		vmm_page_t *vmp;
450 
451 		vmp = vq_page_hold(ring, pa, true);
452 		if (vmp == NULL) {
453 			viona_ring_unmap(ring);
454 			return (B_FALSE);
455 		}
456 
457 		/*
458 		 * Keep the first page has the head of the chain, appending all
459 		 * subsequent pages to the tail.
460 		 */
461 		if (prev == NULL) {
462 			ring->vr_map_hold = vmp;
463 		} else {
464 			vmm_drv_page_chain(prev, vmp);
465 		}
466 		prev = vmp;
467 		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
468 	}
469 
470 	return (B_TRUE);
471 }
472 
473 static void
474 viona_ring_unmap(viona_vring_t *ring)
475 {
476 	ASSERT(MUTEX_HELD(&ring->vr_lock));
477 
478 	void **map = ring->vr_map_pages;
479 	if (map != NULL) {
480 		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
481 		kmem_free(map, npages * sizeof (void *));
482 		ring->vr_map_pages = NULL;
483 
484 		vmm_drv_page_release_chain(ring->vr_map_hold);
485 		ring->vr_map_hold = NULL;
486 	} else {
487 		ASSERT3P(ring->vr_map_hold, ==, NULL);
488 	}
489 }
490 
491 static inline void *
492 viona_ring_addr(viona_vring_t *ring, uint_t off)
493 {
494 	ASSERT3P(ring->vr_map_pages, !=, NULL);
495 	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);
496 
497 	const uint_t page_num = off / PAGESIZE;
498 	const uint_t page_off = off % PAGESIZE;
499 	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
500 }
501 
502 void
503 viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
504 {
505 	if (!skip_flags_check) {
506 		volatile uint16_t *avail_flags = viona_ring_addr(ring,
507 		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));
508 
509 		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
510 			return;
511 		}
512 	}
513 
514 	mutex_enter(&ring->vr_lock);
515 	uint64_t addr = ring->vr_msi_addr;
516 	uint64_t msg = ring->vr_msi_msg;
517 	mutex_exit(&ring->vr_lock);
518 	if (addr != 0) {
519 		/* Deliver the interrupt directly, if so configured... */
520 		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
521 	} else {
522 		/* ... otherwise, leave it to userspace */
523 		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
524 			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
525 		}
526 	}
527 }
528 
529 static inline bool
530 vring_stop_req(const viona_vring_t *ring)
531 {
532 	return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
533 }
534 
535 static inline bool
536 vring_pause_req(const viona_vring_t *ring)
537 {
538 	return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
539 }
540 
541 static inline bool
542 vring_start_req(const viona_vring_t *ring)
543 {
544 	return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
545 }
546 
547 /*
548  * Check if vring worker thread should bail out.  This will heed indications
549  * that the containing process is exiting, as well as requests to stop or pause
550  * the ring.  The `stop_only` parameter controls if pause requests are ignored
551  * (true) or checked (false).
552  *
553  * Caller should hold vr_lock.
554  */
555 static bool
556 vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
557 {
558 	ASSERT(MUTEX_HELD(&ring->vr_lock));
559 
560 	if (vring_stop_req(ring) ||
561 	    (!stop_only && vring_pause_req(ring))) {
562 		return (true);
563 	}
564 
565 	kthread_t *t = ring->vr_worker_thread;
566 	if (t != NULL) {
567 		proc_t *p = ttoproc(t);
568 
569 		ASSERT(p != NULL);
570 		if ((p->p_flag & SEXITING) != 0) {
571 			return (true);
572 		}
573 	}
574 	return (false);
575 }
576 
577 bool
578 vring_need_bail(const viona_vring_t *ring)
579 {
580 	return (vring_need_bail_ext(ring, false));
581 }
582 
583 int
584 viona_ring_pause(viona_vring_t *ring)
585 {
586 	mutex_enter(&ring->vr_lock);
587 	switch (ring->vr_state) {
588 	case VRS_RESET:
589 	case VRS_SETUP:
590 	case VRS_INIT:
591 		/*
592 		 * For rings which have not yet started (even those in the
593 		 * VRS_SETUP and VRS_INIT phases, where there a running worker
594 		 * thread (waiting to be released to do its intended task), it
595 		 * is adequate to simply clear any start request, to keep them
596 		 * from proceeding into the actual work processing function.
597 		 */
598 		ring->vr_state_flags &= ~VRSF_REQ_START;
599 		mutex_exit(&ring->vr_lock);
600 		return (0);
601 
602 	case VRS_STOP:
603 		if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
604 			/* A ring on its way to RESET cannot be paused. */
605 			mutex_exit(&ring->vr_lock);
606 			return (EBUSY);
607 		}
608 		/* FALLTHROUGH */
609 	case VRS_RUN:
610 		ring->vr_state_flags |= VRSF_REQ_PAUSE;
611 		cv_broadcast(&ring->vr_cv);
612 		break;
613 
614 	default:
615 		panic("invalid ring state %d", ring->vr_state);
616 		break;
617 	}
618 
619 	for (;;) {
620 		int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
621 
622 		if (ring->vr_state == VRS_INIT ||
623 		    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
624 			/* Ring made it to (or through) paused state */
625 			mutex_exit(&ring->vr_lock);
626 			return (0);
627 		}
628 		if (res == 0) {
629 			/* interrupted by signal */
630 			mutex_exit(&ring->vr_lock);
631 			return (EINTR);
632 		}
633 	}
634 	/* NOTREACHED */
635 }
636 
637 static void
638 viona_worker(void *arg)
639 {
640 	viona_vring_t *ring = (viona_vring_t *)arg;
641 	viona_link_t *link = ring->vr_link;
642 
643 	mutex_enter(&ring->vr_lock);
644 	VERIFY3U(ring->vr_state, ==, VRS_SETUP);
645 
646 	/* Bail immediately if ring shutdown or process exit was requested */
647 	if (vring_need_bail_ext(ring, true)) {
648 		goto ring_reset;
649 	}
650 
651 	/* Report worker thread as alive and notify creator */
652 ring_init:
653 	ring->vr_state = VRS_INIT;
654 	cv_broadcast(&ring->vr_cv);
655 
656 	while (!vring_start_req(ring)) {
657 		/*
658 		 * Keeping lease renewals timely while waiting for the ring to
659 		 * be started is important for avoiding deadlocks.
660 		 */
661 		if (vmm_drv_lease_expired(ring->vr_lease)) {
662 			if (!viona_ring_lease_renew(ring)) {
663 				goto ring_reset;
664 			}
665 		}
666 
667 		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
668 
669 		if (vring_pause_req(ring)) {
670 			/* We are already paused in the INIT state. */
671 			ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
672 		}
673 		if (vring_need_bail_ext(ring, true)) {
674 			goto ring_reset;
675 		}
676 	}
677 
678 	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
679 	ring->vr_state = VRS_RUN;
680 	ring->vr_state_flags &= ~VRSF_REQ_START;
681 
682 	/* Ensure ring lease is valid first */
683 	if (vmm_drv_lease_expired(ring->vr_lease)) {
684 		if (!viona_ring_lease_renew(ring)) {
685 			goto ring_reset;
686 		}
687 	}
688 
689 	/* Process actual work */
690 	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
691 		viona_worker_rx(ring, link);
692 	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
693 		viona_worker_tx(ring, link);
694 	} else {
695 		panic("unexpected ring: %p", (void *)ring);
696 	}
697 
698 	VERIFY3U(ring->vr_state, ==, VRS_STOP);
699 	VERIFY3U(ring->vr_xfer_outstanding, ==, 0);
700 
701 	/* Respond to a pause request if the ring is not required to stop */
702 	if (vring_pause_req(ring)) {
703 		ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
704 
705 		if (!vring_need_bail_ext(ring, true)) {
706 			goto ring_init;
707 		}
708 	}
709 
710 ring_reset:
711 	viona_ring_misc_free(ring);
712 
713 	viona_ring_lease_drop(ring);
714 	ring->vr_cur_aidx = 0;
715 	ring->vr_size = 0;
716 	ring->vr_mask = 0;
717 	ring->vr_pa = 0;
718 	ring->vr_state = VRS_RESET;
719 	ring->vr_state_flags = 0;
720 	ring->vr_worker_thread = NULL;
721 	cv_broadcast(&ring->vr_cv);
722 	mutex_exit(&ring->vr_lock);
723 
724 	mutex_enter(&ttoproc(curthread)->p_lock);
725 	lwp_exit();
726 }
727 
728 static kthread_t *
729 viona_create_worker(viona_vring_t *ring)
730 {
731 	k_sigset_t hold_set;
732 	proc_t *p = curproc;
733 	kthread_t *t;
734 	klwp_t *lwp;
735 
736 	ASSERT(MUTEX_HELD(&ring->vr_lock));
737 	ASSERT(ring->vr_state == VRS_RESET);
738 
739 	sigfillset(&hold_set);
740 	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
741 	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
742 	if (lwp == NULL) {
743 		return (NULL);
744 	}
745 
746 	t = lwptot(lwp);
747 	mutex_enter(&p->p_lock);
748 	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
749 	lwp_create_done(t);
750 	mutex_exit(&p->p_lock);
751 
752 	return (t);
753 }
754 
755 void
756 vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
757 {
758 	const uint_t entry_off = idx * sizeof (struct virtio_desc);
759 
760 	ASSERT3U(idx, <, ring->vr_size);
761 
762 	bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
763 }
764 
765 static uint16_t
766 vq_read_avail(viona_vring_t *ring, uint16_t idx)
767 {
768 	ASSERT3U(idx, <, ring->vr_size);
769 
770 	volatile uint16_t *avail_ent =
771 	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
772 	return (*avail_ent);
773 }
774 
775 /*
776  * Given a buffer descriptor `desc`, attempt to map the pages backing that
777  * region of guest physical memory, taking into account that there are no
778  * guarantees about guest-contiguous pages being host-contiguous.
779  */
780 static int
781 vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
782     vq_held_region_t *region)
783 {
784 	int err;
785 
786 	if (desc->vd_len == 0) {
787 		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
788 		    uint32_t, desc->vd_len);
789 		VIONA_RING_STAT_INCR(ring, desc_bad_len);
790 		return (EINVAL);
791 	}
792 
793 	err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
794 	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
795 	switch (err) {
796 	case E2BIG:
797 		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
798 		VIONA_RING_STAT_INCR(ring, too_many_desc);
799 		break;
800 	case EFAULT:
801 		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
802 		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
803 		break;
804 	default:
805 		break;
806 	}
807 
808 	return (err);
809 }
810 
811 /*
812  * Walk an indirect buffer descriptor `desc`, attempting to map the pages
813  * backing the regions of guest memory covered by its contituent descriptors.
814  */
815 static int
816 vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
817     vq_held_region_t *region)
818 {
819 	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);
820 
821 	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
822 	    indir_count > ring->vr_size ||
823 	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
824 		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
825 		    uint32_t, desc->vd_len);
826 		VIONA_RING_STAT_INCR(ring, indir_bad_len);
827 		return (EINVAL);
828 	}
829 
830 	uint16_t indir_next = 0;
831 	const uint8_t *buf = NULL;
832 	uint64_t buf_gpa = UINT64_MAX;
833 	vmm_page_t *vmp = NULL;
834 	int err = 0;
835 
836 	for (;;) {
837 		uint64_t indir_gpa =
838 		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
839 		uint64_t indir_page = indir_gpa & PAGEMASK;
840 		struct virtio_desc vp;
841 
842 		/*
843 		 * Get a mapping for the page that the next indirect descriptor
844 		 * resides in, if has not already been done.
845 		 */
846 		if (indir_page != buf_gpa) {
847 			if (vmp != NULL) {
848 				vmm_drv_page_release(vmp);
849 			}
850 			vmp = vq_page_hold(ring, indir_page, false);
851 			if (vmp == NULL) {
852 				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
853 				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
854 				err = EFAULT;
855 				break;
856 			}
857 			buf_gpa = indir_page;
858 			buf = vmm_drv_page_readable(vmp);
859 		}
860 
861 		/*
862 		 * A copy of the indirect descriptor is made here, rather than
863 		 * simply using a reference pointer.  This prevents malicious or
864 		 * erroneous guest writes to the descriptor from fooling the
865 		 * flags/bounds verification through a race.
866 		 */
867 		bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));
868 
869 		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
870 			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
871 			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
872 			err = EINVAL;
873 			break;
874 		} else if (vp.vd_len == 0) {
875 			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
876 			    uint32_t, vp.vd_len);
877 			VIONA_RING_STAT_INCR(ring, desc_bad_len);
878 			err = EINVAL;
879 			break;
880 		}
881 
882 		err = vq_map_desc_bufs(ring, &vp, region);
883 		if (err != 0) {
884 			break;
885 		}
886 
887 		/* Successfully reach the end of the indir chain */
888 		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
889 			break;
890 		}
891 		if (region->vhr_idx >= region->vhr_niov) {
892 			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
893 			VIONA_RING_STAT_INCR(ring, too_many_desc);
894 			err = E2BIG;
895 			break;
896 		}
897 
898 		indir_next = vp.vd_next;
899 		if (indir_next >= indir_count) {
900 			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
901 			    uint16_t, indir_next, uint16_t, indir_count);
902 			VIONA_RING_STAT_INCR(ring, indir_bad_next);
903 			err = EINVAL;
904 			break;
905 		}
906 	}
907 
908 	if (vmp != NULL) {
909 		vmm_drv_page_release(vmp);
910 	}
911 	return (err);
912 }
913 
914 int
915 vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
916     uint16_t *cookie, vmm_page_t **chain)
917 {
918 	uint16_t ndesc, idx, head, next;
919 	struct virtio_desc vdir;
920 	vq_held_region_t region = {
921 		.vhr_niov = niov,
922 		.vhr_iov = iov,
923 	};
924 
925 	ASSERT(iov != NULL);
926 	ASSERT(niov > 0 && niov < INT_MAX);
927 	ASSERT(*chain == NULL);
928 
929 	mutex_enter(&ring->vr_a_mutex);
930 	idx = ring->vr_cur_aidx;
931 	ndesc = viona_ring_num_avail(ring);
932 
933 	if (ndesc == 0) {
934 		mutex_exit(&ring->vr_a_mutex);
935 		return (0);
936 	}
937 	if (ndesc > ring->vr_size) {
938 		/*
939 		 * Despite the fact that the guest has provided an 'avail_idx'
940 		 * which indicates that an impossible number of descriptors are
941 		 * available, continue on and attempt to process the next one.
942 		 *
943 		 * The transgression will not escape the probe or stats though.
944 		 */
945 		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
946 		    uint16_t, ndesc);
947 		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
948 	}
949 
950 	head = vq_read_avail(ring, idx & ring->vr_mask);
951 	next = head;
952 
953 	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
954 		if (next >= ring->vr_size) {
955 			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
956 			    uint16_t, next);
957 			VIONA_RING_STAT_INCR(ring, bad_idx);
958 			break;
959 		}
960 
961 		vq_read_desc(ring, next, &vdir);
962 		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
963 			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
964 				break;
965 			}
966 		} else {
967 			/*
968 			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
969 			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
970 			 *   and VIRTQ_DESC_F_NEXT in `flags`.
971 			 */
972 			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
973 				VIONA_PROBE3(indir_bad_next,
974 				    viona_vring_t *, ring,
975 				    uint16_t, next, uint16_t, 0);
976 				VIONA_RING_STAT_INCR(ring, indir_bad_next);
977 				break;
978 			}
979 
980 			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
981 				break;
982 			}
983 		}
984 
985 		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
986 			ring->vr_cur_aidx++;
987 			mutex_exit(&ring->vr_a_mutex);
988 
989 			*cookie = head;
990 			*chain = region.vhr_head;
991 			return (region.vhr_idx);
992 		}
993 	}
994 
995 	mutex_exit(&ring->vr_a_mutex);
996 	if (region.vhr_head != NULL) {
997 		/*
998 		 * If any pages were held prior to encountering an error, we
999 		 * must release them now.
1000 		 */
1001 		vmm_drv_page_release_chain(region.vhr_head);
1002 	}
1003 	return (-1);
1004 }
1005 
1006 
1007 static void
1008 vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
1009     uint32_t len)
1010 {
1011 	/*
1012 	 * In a larger ring, entry could be split across pages, so be sure to
1013 	 * account for that when configuring the transfer by looking up the ID
1014 	 * and length addresses separately, rather than an address for a
1015 	 * combined `struct virtio_used`.
1016 	 */
1017 	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
1018 	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
1019 	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
1020 	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);
1021 
1022 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1023 
1024 	*idp = cookie;
1025 	*lenp = len;
1026 }
1027 
1028 static void
1029 vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
1030 {
1031 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1032 
1033 	volatile uint16_t *used_idx =
1034 	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
1035 	*used_idx = idx;
1036 }
1037 
1038 void
1039 vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
1040 {
1041 	uint16_t uidx;
1042 
1043 	mutex_enter(&ring->vr_u_mutex);
1044 
1045 	uidx = ring->vr_cur_uidx;
1046 	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
1047 	uidx++;
1048 	membar_producer();
1049 
1050 	vq_write_used_idx(ring, uidx);
1051 	ring->vr_cur_uidx = uidx;
1052 
1053 	mutex_exit(&ring->vr_u_mutex);
1054 }
1055 
1056 void
1057 vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
1058 {
1059 	uint16_t uidx;
1060 
1061 	mutex_enter(&ring->vr_u_mutex);
1062 
1063 	uidx = ring->vr_cur_uidx;
1064 
1065 	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
1066 		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
1067 		    elem[i].len);
1068 	}
1069 
1070 	membar_producer();
1071 	vq_write_used_idx(ring, uidx);
1072 	ring->vr_cur_uidx = uidx;
1073 
1074 	mutex_exit(&ring->vr_u_mutex);
1075 }
1076 
1077 /*
1078  * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
1079  */
1080 void
1081 viona_ring_disable_notify(viona_vring_t *ring)
1082 {
1083 	volatile uint16_t *used_flags =
1084 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1085 
1086 	*used_flags |= VRING_USED_F_NO_NOTIFY;
1087 }
1088 
1089 /*
1090  * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
1091  */
1092 void
1093 viona_ring_enable_notify(viona_vring_t *ring)
1094 {
1095 	volatile uint16_t *used_flags =
1096 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1097 
1098 	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
1099 }
1100 
1101 /*
1102  * Return the number of available descriptors in the vring taking care of the
1103  * 16-bit index wraparound.
1104  *
1105  * Note: If the number of apparently available descriptors is larger than the
1106  * ring size (due to guest misbehavior), this check will still report the
1107  * positive count of descriptors.
1108  */
1109 uint16_t
1110 viona_ring_num_avail(viona_vring_t *ring)
1111 {
1112 	volatile uint16_t *avail_idx =
1113 	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));
1114 
1115 	return (*avail_idx - ring->vr_cur_aidx);
1116 }
1117