/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2013 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
#ifdef __FreeBSD__
#ifndef NETMAP_WITH_LIBS
#define NETMAP_WITH_LIBS
#endif
#include <net/netmap_user.h>
#endif

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#ifndef __FreeBSD__
#include <poll.h>
#include <libdlpi.h>
#endif

#include "bhyverun.h"
#include "pci_emul.h"
#ifdef __FreeBSD__
#include "mevent.h"
#endif
#include "virtio.h"
#include "net_utils.h"

#define VTNET_RINGSZ	1024

#define VTNET_MAXSEGS	256

/*
 * Host capabilities.  Note that we only offer a few of these.
 */
#define	VIRTIO_NET_F_CSUM	(1 <<  0) /* host handles partial cksum */
#define	VIRTIO_NET_F_GUEST_CSUM	(1 <<  1) /* guest handles partial cksum */
#define	VIRTIO_NET_F_MAC	(1 <<  5) /* host supplies MAC */
#define	VIRTIO_NET_F_GSO_DEPREC	(1 <<  6) /* deprecated: host handles GSO */
#define	VIRTIO_NET_F_GUEST_TSO4	(1 <<  7) /* guest can rcv TSOv4 */
#define	VIRTIO_NET_F_GUEST_TSO6	(1 <<  8) /* guest can rcv TSOv6 */
#define	VIRTIO_NET_F_GUEST_ECN	(1 <<  9) /* guest can rcv TSO with ECN */
#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
				(1 << 21) /* guest can send gratuitous pkts */

#define VTNET_S_HOSTCAPS      \
  ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
    VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
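
/*
 * Only MAC, MRG_RXBUF and STATUS from the list above are offered to the
 * guest; NOTIFY_ON_EMPTY and INDIRECT_DESC are generic virtio features
 * defined in virtio.h.  Assuming the usual legacy definitions there
 * (bits 24 and 28 respectively), the advertised mask works out to:
 *
 *	(1 << 5) | (1 << 15) | (1 << 16) | (1 << 24) | (1 << 28)
 *	    == 0x11018020
 */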

/*
 * PCI config-space "registers"
 */
struct virtio_net_config {
	uint8_t  mac[6];
	uint16_t status;
} __packed;

/*
 * Queue definitions.
 */
#define VTNET_RXQ	0
#define VTNET_TXQ	1
#define VTNET_CTLQ	2	/* NB: not yet supported */

#define VTNET_MAXQ	3

/*
 * Fixed network header size
 */
struct virtio_net_rxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;
} __packed;
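
/*
 * Note that, per the virtio spec, the trailing vrh_bufs field is only
 * present when VIRTIO_NET_F_MRG_RXBUF has been negotiated: the header
 * is 12 bytes with merged RX buffers and 10 bytes without.
 * pci_vtnet_neg_features() below shrinks rx_vhdrlen by 2 accordingly.
 */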

/*
 * Debug printf
 */
static int pci_vtnet_debug;
#define DPRINTF(params) do {						\
	if (pci_vtnet_debug)						\
		printf params;						\
} while (0)
#define WPRINTF(params) printf params

/*
 * Per-device softc
 */
struct pci_vtnet_softc {
	struct virtio_softc vsc_vs;
	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
	pthread_mutex_t vsc_mtx;
	struct mevent	*vsc_mevp;

#ifdef	__FreeBSD__
	int		vsc_tapfd;
#else
	dlpi_handle_t	vsc_dhp;
	int		vsc_dlpifd;
#endif
	struct nm_desc	*vsc_nmd;

	int		vsc_rx_ready;
	int		resetting;	/* protected by tx_mtx */

	uint64_t	vsc_features;	/* negotiated features */

	struct virtio_net_config vsc_config;

	pthread_mutex_t	rx_mtx;
	int		rx_vhdrlen;
	int		rx_merge;	/* merged rx bufs in use */

	pthread_t 	tx_tid;
	pthread_mutex_t	tx_mtx;
	pthread_cond_t	tx_cond;
	int		tx_in_progress;

	void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
	void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
			     int iovcnt, int len);
};

static void pci_vtnet_reset(void *);
/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);

static struct virtio_consts vtnet_vi_consts = {
	"vtnet",		/* our name */
	VTNET_MAXQ - 1,		/* we currently support 2 virtqueues */
	sizeof(struct virtio_net_config), /* config reg size */
	pci_vtnet_reset,	/* reset */
	NULL,			/* device-wide qnotify -- not used */
	pci_vtnet_cfgread,	/* read PCI config */
	pci_vtnet_cfgwrite,	/* write PCI config */
	pci_vtnet_neg_features,	/* apply negotiated features */
	VTNET_S_HOSTCAPS,	/* our capabilities */
};
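
/*
 * Device reset.  Note the lock ordering used here and in the RX/TX
 * paths: rx_mtx is taken before tx_mtx, and both are held across
 * vi_reset_dev() so that neither the RX path nor the TX thread can
 * touch the rings mid-reset.
 */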
static void
pci_vtnet_reset(void *vsc)
{
	struct pci_vtnet_softc *sc = vsc;

	DPRINTF(("vtnet: device reset requested!\n"));

	/* Acquire the RX lock to block RX processing. */
	pthread_mutex_lock(&sc->rx_mtx);

	/* Set sc->resetting and give the TX thread a chance to stop. */
	pthread_mutex_lock(&sc->tx_mtx);
	sc->resetting = 1;
	while (sc->tx_in_progress) {
		pthread_mutex_unlock(&sc->tx_mtx);
		usleep(10000);
		pthread_mutex_lock(&sc->tx_mtx);
	}

	sc->vsc_rx_ready = 0;
	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);

	/*
	 * Now reset rings, MSI-X vectors, and negotiated capabilities.
	 * Do that with the TX lock held, since we need to reset
	 * sc->resetting.
	 */
	vi_reset_dev(&sc->vsc_vs);

	sc->resetting = 0;
	pthread_mutex_unlock(&sc->tx_mtx);
	pthread_mutex_unlock(&sc->rx_mtx);
}

/*
 * Called to send a buffer chain out to the tap device
 */
#ifdef __FreeBSD__
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_tapfd == -1)
		return;

	/*
	 * If the length is < 60, pad out to the 60-byte minimum
	 * Ethernet frame size (excluding the FCS) by appending an
	 * extra zeroed segment to the iov.  The caller guarantees
	 * that one spare iov slot is always available.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
#else
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		(void) dlpi_send(sc->vsc_dhp, NULL, 0,
		    iov[i].iov_base, iov[i].iov_len, NULL);
	}
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire Ethernet frame + rx header.
 *
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];
#endif /* __FreeBSD__ */
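
/*
 * Strip the virtio header area off the front of a descriptor chain.
 * For example, with tlen == 10 and iov[0] == { base, 1500 }, the
 * returned riov points at { base + 10, 1490 }; if the first segment
 * held exactly the header, riov instead starts at iov[1] and *niov
 * drops by one.
 */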
static __inline struct iovec *
rx_iov_trim(struct iovec *iov, int *niov, int tlen)
{
	struct iovec *riov;

	/* XXX short-cut: assume first segment is >= tlen */
	assert(iov[0].iov_len >= tlen);

	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		assert(*niov > 1);
		*niov -= 1;
		riov = &iov[1];
	} else {
		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
		riov = &iov[0];
	}

	return (riov);
}

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int n;
#ifdef	__FreeBSD__
	int len;
#else
	size_t len;
	int ret;
#endif
	uint16_t idx;

	/*
	 * Should never be called without a valid tap fd
	 */
#ifdef	__FreeBSD__
	assert(sc->vsc_tapfd != -1);
#else
	assert(sc->vsc_dlpifd != -1);
#endif

	/*
	 * However, this may be called before the rx ring has
	 * been set up.
	 */
	if (!sc->vsc_rx_ready) {
#ifdef	__FreeBSD__
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
#ifdef	__FreeBSD__
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
#ifdef	__FreeBSD__
		len = readv(sc->vsc_tapfd, riov, n);
#else
		len = riov[0].iov_len;
		ret = dlpi_recv(sc->vsc_dhp, NULL, NULL,
		    (uint8_t *)riov[0].iov_base, &len, 0, NULL);
		if (ret != DLPI_SUCCESS) {
			errno = EWOULDBLOCK;
			len = 0;
		}
#endif
		if (len <= 0 && errno == EWOULDBLOCK) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchain(vq);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}
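
/*
 * The netmap helpers below walk the TX (or RX) rings round-robin,
 * starting at cur_tx_ring/cur_rx_ring, wrapping from the last ring back
 * to the first, and give up once they arrive back at the starting ring
 * without finding a usable slot.  Note that the TX path copies at most
 * 2048 bytes per packet, which assumes the default netmap buffer size;
 * anything longer is silently truncated.
 */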

#ifdef __FreeBSD__
static __inline int
pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int r, i;
	int len = 0;

	for (r = nmd->cur_tx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;

		if (nm_ring_empty(ring)) {
			r++;
			if (r > nmd->last_tx_ring)
				r = nmd->first_tx_ring;
			if (r == nmd->cur_tx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);

		for (i = 0; i < iovcnt; i++) {
			if (len + iov[i].iov_len > 2048)
				break;
			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
			len += iov[i].iov_len;
		}
		ring->slot[cur].len = len;
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_tx_ring = r;
		ioctl(nmd->fd, NIOCTXSYNC, NULL);
		break;
	}

	return (len);
}

static __inline int
pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int len = 0;
	int i = 0;
	int r;

	for (r = nmd->cur_rx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;
		size_t left;

		if (nm_ring_empty(ring)) {
			r++;
			if (r > nmd->last_rx_ring)
				r = nmd->first_rx_ring;
			if (r == nmd->cur_rx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);
		left = ring->slot[cur].len;

		for (i = 0; i < iovcnt && left > 0; i++) {
			if (iov[i].iov_len > left)
				iov[i].iov_len = left;
			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
			len += iov[i].iov_len;
			left -= iov[i].iov_len;
		}
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_rx_ring = r;
		ioctl(nmd->fd, NIOCRXSYNC, NULL);
		break;
	}
	for (; i < iovcnt; i++)
		iov[i].iov_len = 0;

	return (len);
}

/*
 * Called to send a buffer chain out to the vale port
 */
static void
pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		    int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_nmd == NULL)
		return;

	/*
	 * If the length is < 60, pad out to the 60-byte minimum
	 * Ethernet frame size (excluding the FCS) by appending an
	 * extra zeroed segment to the iov.  The caller guarantees
	 * that one spare iov slot is always available.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
}

static void
pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int len, n;
	uint16_t idx;

	/*
	 * Should never be called without a valid netmap descriptor
	 */
	assert(sc->vsc_nmd != NULL);

	/*
	 * However, this may be called before the rx ring has
	 * been set up.
	 */
	if (!sc->vsc_rx_ready) {
		/*
		 * Drop the packet and try later.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain.
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);

		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);

		if (len == 0) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchain(vq);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc = param;

	pthread_mutex_lock(&sc->rx_mtx);
	sc->pci_vtnet_rx(sc);
	pthread_mutex_unlock(&sc->rx_mtx);
}
#else
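
/*
 * There is no mevent read callback on non-FreeBSD hosts; instead a
 * dedicated thread polls the DLPI fd and feeds each received frame
 * through pci_vtnet_tap_rx() under the softc lock.
 */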
static void *
pci_vtnet_poll_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	pollfd_t pollset;

	pollset.fd = sc->vsc_dlpifd;
	pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;

	for (;;) {
		if (poll(&pollset, 1, -1) < 0) {
			if (errno == EINTR)
				continue;
			fprintf(stderr,
			    "pci_vtnet_poll_thread poll() error %d\n", errno);
			continue;
		}
		pthread_mutex_lock(&sc->vsc_mtx);
		pci_vtnet_tap_rx(sc);
		pthread_mutex_unlock(&sc->vsc_mtx);
	}

	return (NULL);
}
#endif /* __FreeBSD__ */

static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * A qnotify means that the rx process can now begin.
	 */
	if (sc->vsc_rx_ready == 0) {
		sc->vsc_rx_ready = 1;
		vq_kick_disable(vq);
	}
}
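
/*
 * Process one TX descriptor chain.  The guest places the virtio header
 * in iov[0] and the frame data in iov[1..n-1]; e.g. a 10-byte header
 * plus a 1514-byte frame gives plen == 1514 (what is handed to the
 * backend) and tlen == 1524 (what is reported back in the used ring).
 */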
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	int i, n;
	int plen, tlen;
	uint16_t idx;

	/*
	 * Obtain chain of descriptors.  The first one is
	 * really the header descriptor, so we need to sum
	 * up two lengths: packet length and transfer length.
	 */
	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
	assert(n >= 1 && n <= VTNET_MAXSEGS);
	plen = 0;
	tlen = iov[0].iov_len;
	for (i = 1; i < n; i++) {
		plen += iov[i].iov_len;
		tlen += iov[i].iov_len;
	}

	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
	sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);

	/* chain is processed, release it and set tlen */
	vq_relchain(vq, idx, tlen);
}

static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * Any ring entries to process?
	 */
	if (!vq_has_descs(vq))
		return;

	/* Signal the tx thread for processing */
	pthread_mutex_lock(&sc->tx_mtx);
	vq_kick_disable(vq);
	if (sc->tx_in_progress == 0)
		pthread_cond_signal(&sc->tx_cond);
	pthread_mutex_unlock(&sc->tx_mtx);
}
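
/*
 * tx_in_progress is the handshake between the qnotify above and the TX
 * thread below: the thread clears it (under tx_mtx) just before
 * sleeping on tx_cond and sets it while draining the ring, so qnotifies
 * only signal when the thread is actually asleep and the reset path can
 * poll it to wait for the thread to quiesce.
 */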

/*
 * Thread which will handle processing of TX desc
 */
static void *
pci_vtnet_tx_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	struct vqueue_info *vq;
	int error;

	vq = &sc->vsc_queues[VTNET_TXQ];

	/*
	 * Wait until the TX queue pointers have been initialised and
	 * the first TX has been signaled.
	 */
	pthread_mutex_lock(&sc->tx_mtx);
	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
	assert(error == 0);

	for (;;) {
		/* note - tx mutex is locked here */
		while (sc->resetting || !vq_has_descs(vq)) {
			vq_kick_enable(vq);
			if (!sc->resetting && vq_has_descs(vq))
				break;

			sc->tx_in_progress = 0;
			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
			assert(error == 0);
		}
		vq_kick_disable(vq);
		sc->tx_in_progress = 1;
		pthread_mutex_unlock(&sc->tx_mtx);

		do {
			/*
			 * Run through entries, placing them into
			 * iovecs and sending when an end-of-packet
			 * is found
			 */
			pci_vtnet_proctx(sc, vq);
		} while (vq_has_descs(vq));

		/*
		 * Generate an interrupt if needed.
		 */
		vq_endchains(vq, 1);

		pthread_mutex_lock(&sc->tx_mtx);
	}
	return (NULL);
}

#ifdef __FreeBSD__
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{

	DPRINTF(("vtnet: control qnotify!\n\r"));
}
#endif /* __FreeBSD__ */

static void
pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
{
	char tbuf[80];
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif
#ifndef	__FreeBSD__
	uchar_t physaddr[DLPI_PHYSADDR_MAX];
	size_t physaddrlen = DLPI_PHYSADDR_MAX;
	int error;
#endif

	strlcpy(tbuf, "/dev/", sizeof(tbuf));
	strlcat(tbuf, devname, sizeof(tbuf));

	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
#ifdef	__FreeBSD__
	sc->vsc_tapfd = open(tbuf, O_RDWR);
	if (sc->vsc_tapfd == -1) {
		WPRINTF(("open of tap device %s failed\n", tbuf));
		return;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	int opt = 1;
	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device FIONBIO failed\n"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event\n"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}
#else
	if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
		WPRINTF(("open of vnic device %s failed\n", devname));
	}

	if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr,
	    &physaddrlen) != DLPI_SUCCESS) {
		WPRINTF(("read MAC address of vnic device %s failed\n",
		    devname));
	}
	if (physaddrlen != ETHERADDRL) {
		WPRINTF(("bad MAC address len %zu on vnic device %s\n",
		    physaddrlen, devname));
	}
	memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL);

	if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) {
		WPRINTF(("bind of vnic device %s failed\n", devname));
	}

	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) {
		WPRINTF(("enable promiscuous mode (physical) of vnic device "
		    "%s failed\n", devname));
	}
	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) {
		WPRINTF(("enable promiscuous mode (SAP) of vnic device %s "
		    "failed\n", devname));
	}

	sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp);

	if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) {
		WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n",
		    devname));
		dlpi_close(sc->vsc_dhp);
		sc->vsc_dlpifd = -1;
	}

	error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc);
	assert(error == 0);
#endif
}

#ifdef __FreeBSD__
static void
pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
{
	sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
	sc->pci_vtnet_tx = pci_vtnet_netmap_tx;

	sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
	if (sc->vsc_nmd == NULL) {
		WPRINTF(("open of netmap device %s failed\n", ifname));
		return;
	}

	sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event\n"));
		nm_close(sc->vsc_nmd);
		sc->vsc_nmd = NULL;
	}
}
#endif /* __FreeBSD__ */

static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	char tname[MAXCOMLEN + 1];
	struct pci_vtnet_softc *sc;
	const char *env_msi;
	char *devname;
	char *vtopts;
#ifdef __FreeBSD__
	int mac_provided;
#endif
	int use_msix;

	sc = calloc(1, sizeof(struct pci_vtnet_softc));

	pthread_mutex_init(&sc->vsc_mtx, NULL);

	vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;

	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
#ifdef __FreeBSD__
	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
#endif

	/*
	 * Use legacy MSI rather than MSI-X if requested by the user
	 * via the BHYVE_USE_MSI environment variable.
	 */
	use_msix = 1;
	if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
		if (strcasecmp(env_msi, "yes") == 0)
			use_msix = 0;
	}

	/*
	 * Attempt to open the tap device and read the MAC address
	 * if specified
	 */
#ifdef	__FreeBSD__
	mac_provided = 0;
	sc->vsc_tapfd = -1;
#endif
	sc->vsc_nmd = NULL;
	if (opts != NULL) {
#ifdef	__FreeBSD__
		int err;
#endif

		devname = vtopts = strdup(opts);
		(void) strsep(&vtopts, ",");

#ifdef	__FreeBSD__
		if (vtopts != NULL) {
			err = net_parsemac(vtopts, sc->vsc_config.mac);
			if (err != 0) {
				free(devname);
				return (err);
			}
			mac_provided = 1;
		}
#endif

#ifdef __FreeBSD__
		if (strncmp(devname, "vale", 4) == 0)
			pci_vtnet_netmap_setup(sc, devname);
#endif
		if (strncmp(devname, "tap", 3) == 0 ||
		    strncmp(devname, "vmnet", 5) == 0)
			pci_vtnet_tap_setup(sc, devname);

		free(devname);
	}

#ifdef	__FreeBSD__
	if (!mac_provided) {
		net_genmac(pi, sc->vsc_config.mac);
	}
#endif

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);

	/* Link is up if we managed to open tap device or vale port. */
#ifdef	__FreeBSD__
	sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
#else
	sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 ||
#endif
	    sc->vsc_nmd != NULL);

	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
	if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
		return (1);

	/* use BAR 0 to map config regs in IO space */
	vi_set_io_bar(&sc->vsc_vs, 0);

	sc->resetting = 0;

	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
	pthread_mutex_init(&sc->rx_mtx, NULL);

	/*
	 * Initialize tx semaphore & spawn TX processing thread.
	 * As of now, only one thread for TX desc processing is
	 * spawned.
	 */
	sc->tx_in_progress = 0;
	pthread_mutex_init(&sc->tx_mtx, NULL);
	pthread_cond_init(&sc->tx_cond, NULL);
	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
	snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
	    pi->pi_func);
	pthread_set_name_np(sc->tx_tid, tname);

	return (0);
}

static int
pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	if (offset < 6) {
		assert(offset + size <= 6);
		/*
		 * The driver is allowed to change the MAC address
		 */
		ptr = &sc->vsc_config.mac[offset];
		memcpy(ptr, &value, size);
	} else {
		/* silently ignore other writes */
		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
	}

	return (0);
}

static int
pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	ptr = (uint8_t *)&sc->vsc_config + offset;
	memcpy(retval, ptr, size);
	return (0);
}

static void
pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
{
	struct pci_vtnet_softc *sc = vsc;

	sc->vsc_features = negotiated_features;

	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
		sc->rx_merge = 0;
		/* non-merge rx header is 2 bytes shorter */
		sc->rx_vhdrlen -= 2;
	}
}

struct pci_devemu pci_de_vnet = {
	.pe_emu = 	"virtio-net",
	.pe_init =	pci_vtnet_init,
	.pe_barwrite =	vi_pci_write,
	.pe_barread =	vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);