xref: /freebsd/usr.sbin/bhyve/net_backends.c (revision 1d386b48)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * This file implements multiple network backends (tap, netmap, ...),
30  * to be used by network frontends such as virtio-net and e1000.
31  * The API to access the backend (e.g. send/receive packets, negotiate
32  * features) is exported by net_backends.h.
33  */
34 
35 #include <sys/cdefs.h>
36 #include <sys/types.h>		/* u_short etc */
37 #ifndef WITHOUT_CAPSICUM
38 #include <sys/capsicum.h>
39 #endif
40 #include <sys/ioctl.h>
41 #include <sys/mman.h>
42 #include <sys/uio.h>
43 
44 #include <net/if.h>
45 #if defined(INET6) || defined(INET)
46 #include <net/if_tap.h>
47 #endif
48 #include <net/netmap.h>
49 #include <net/netmap_virt.h>
50 #define NETMAP_WITH_LIBS
51 #include <net/netmap_user.h>
52 
53 #ifndef WITHOUT_CAPSICUM
54 #include <capsicum_helpers.h>
55 #endif
56 #include <err.h>
57 #include <errno.h>
58 #include <fcntl.h>
59 #include <stdio.h>
60 #include <stdlib.h>
61 #include <stdint.h>
62 #include <string.h>
63 #include <unistd.h>
64 #include <sysexits.h>
65 #include <assert.h>
66 #include <pthread.h>
67 #include <pthread_np.h>
68 #include <poll.h>
69 #include <assert.h>
70 
71 #ifdef NETGRAPH
72 #include <sys/param.h>
73 #include <sys/sysctl.h>
74 #include <netgraph.h>
75 #endif
76 
77 #include "config.h"
78 #include "debug.h"
79 #include "iov.h"
80 #include "mevent.h"
81 #include "net_backends.h"
82 #include "pci_emul.h"
83 
84 #include <sys/linker_set.h>
85 
86 /*
87  * Each network backend registers a set of function pointers that are
88  * used to implement the net backends API.
89  * This might need to be exposed if we implement backends in separate files.
90  */
/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet with such length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;	/* frontend pointer (netbe_init's 'param') */
	int fd;				/* backend descriptor, or -1 when closed */

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};

/* Private area allocated contiguously, right after the descriptor. */
#define	NET_BE_PRIV(be)		((void *)((be) + 1))
/* Total allocation size for a backend: descriptor plus private area. */
#define	NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size)

/* Linker set collecting every compiled-in backend template below. */
SET_DECLARE(net_backend_set, struct net_backend);

/* Full virtio-net header length (12; see netbe_set_cap()). */
#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define WPRINTF(params) PRINTLN params
178 
179 /*
180  * The tap backend
181  */
182 
#if defined(INET6) || defined(INET)
/*
 * Protocol families to try, in order, when a generic datagram socket
 * is needed for the SIOC* interface ioctls (see tap_init()).
 */
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif
193 
/* Per-instance state shared by the tap, vmnet and netgraph backends. */
struct tap_priv {
	struct mevent *mevp;	/* RX read event registered with mevent */
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;	/* length of buffered packet; 0 means empty */
};
204 
205 static void
206 tap_cleanup(struct net_backend *be)
207 {
208 	struct tap_priv *priv = NET_BE_PRIV(be);
209 
210 	if (priv->mevp) {
211 		mevent_delete(priv->mevp);
212 	}
213 	if (be->fd != -1) {
214 		close(be->fd);
215 		be->fd = -1;
216 	}
217 }
218 
219 static int
220 tap_init(struct net_backend *be, const char *devname,
221     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
222 {
223 	struct tap_priv *priv = NET_BE_PRIV(be);
224 	char tbuf[80];
225 	int opt = 1;
226 #if defined(INET6) || defined(INET)
227 	struct ifreq ifrq;
228 	int s;
229 #endif
230 #ifndef WITHOUT_CAPSICUM
231 	cap_rights_t rights;
232 #endif
233 
234 	if (cb == NULL) {
235 		WPRINTF(("TAP backend requires non-NULL callback"));
236 		return (-1);
237 	}
238 
239 	strcpy(tbuf, "/dev/");
240 	strlcat(tbuf, devname, sizeof(tbuf));
241 
242 	be->fd = open(tbuf, O_RDWR);
243 	if (be->fd == -1) {
244 		WPRINTF(("open of tap device %s failed", tbuf));
245 		goto error;
246 	}
247 
248 	/*
249 	 * Set non-blocking and register for read
250 	 * notifications with the event loop
251 	 */
252 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
253 		WPRINTF(("tap device O_NONBLOCK failed"));
254 		goto error;
255 	}
256 
257 #if defined(INET6) || defined(INET)
258 	/*
259 	 * Try to UP the interface rather than relying on
260 	 * net.link.tap.up_on_open.
261 	  */
262 	bzero(&ifrq, sizeof(ifrq));
263 	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
264 		WPRINTF(("Could not get interface name"));
265 		goto error;
266 	}
267 
268 	s = -1;
269 	for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
270 		s = socket(pf_list[i], SOCK_DGRAM, 0);
271 	if (s == -1) {
272 		WPRINTF(("Could open socket"));
273 		goto error;
274 	}
275 
276 	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
277 		(void)close(s);
278 		WPRINTF(("Could not get interface flags"));
279 		goto error;
280 	}
281 	ifrq.ifr_flags |= IFF_UP;
282 	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
283 		(void)close(s);
284 		WPRINTF(("Could not set interface flags"));
285 		goto error;
286 	}
287 	(void)close(s);
288 #endif
289 
290 #ifndef WITHOUT_CAPSICUM
291 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
292 	if (caph_rights_limit(be->fd, &rights) == -1)
293 		errx(EX_OSERR, "Unable to apply rights for sandbox");
294 #endif
295 
296 	memset(priv->bbuf, 0, sizeof(priv->bbuf));
297 	priv->bbuflen = 0;
298 
299 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
300 	if (priv->mevp == NULL) {
301 		WPRINTF(("Could not register event"));
302 		goto error;
303 	}
304 
305 	return (0);
306 
307 error:
308 	tap_cleanup(be);
309 	return (-1);
310 }
311 
312 /*
313  * Called to send a buffer chain out to the tap device
314  */
315 static ssize_t
316 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
317 {
318 	return (writev(be->fd, iov, iovcnt));
319 }
320 
321 static ssize_t
322 tap_peek_recvlen(struct net_backend *be)
323 {
324 	struct tap_priv *priv = NET_BE_PRIV(be);
325 	ssize_t ret;
326 
327 	if (priv->bbuflen > 0) {
328 		/*
329 		 * We already have a packet in the bounce buffer.
330 		 * Just return its length.
331 		 */
332 		return priv->bbuflen;
333 	}
334 
335 	/*
336 	 * Read the next packet (if any) into the bounce buffer, so
337 	 * that we get to know its length and we can return that
338 	 * to the caller.
339 	 */
340 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
341 	if (ret < 0 && errno == EWOULDBLOCK) {
342 		return (0);
343 	}
344 
345 	if (ret > 0)
346 		priv->bbuflen = ret;
347 
348 	return (ret);
349 }
350 
351 static ssize_t
352 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
353 {
354 	struct tap_priv *priv = NET_BE_PRIV(be);
355 	ssize_t ret;
356 
357 	if (priv->bbuflen > 0) {
358 		/*
359 		 * A packet is available in the bounce buffer, so
360 		 * we read it from there.
361 		 */
362 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
363 		    iov, iovcnt, 0);
364 
365 		/* Mark the bounce buffer as empty. */
366 		priv->bbuflen = 0;
367 
368 		return (ret);
369 	}
370 
371 	ret = readv(be->fd, iov, iovcnt);
372 	if (ret < 0 && errno == EWOULDBLOCK) {
373 		return (0);
374 	}
375 
376 	return (ret);
377 }
378 
379 static void
380 tap_recv_enable(struct net_backend *be)
381 {
382 	struct tap_priv *priv = NET_BE_PRIV(be);
383 
384 	mevent_enable(priv->mevp);
385 }
386 
387 static void
388 tap_recv_disable(struct net_backend *be)
389 {
390 	struct tap_priv *priv = NET_BE_PRIV(be);
391 
392 	mevent_disable(priv->mevp);
393 }
394 
395 static uint64_t
396 tap_get_cap(struct net_backend *be __unused)
397 {
398 
399 	return (0); /* no capabilities for now */
400 }
401 
402 static int
403 tap_set_cap(struct net_backend *be __unused, uint64_t features,
404     unsigned vnet_hdr_len)
405 {
406 
407 	return ((features || vnet_hdr_len) ? -1 : 0);
408 }
409 
/* Backend template matching "tap" device names. */
static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* Register both templates in the net_backend_set linker set. */
DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
441 
442 #ifdef NETGRAPH
443 
444 /*
445  * Netgraph backend
446  */
447 
/* Upper bound we request for the ng_socket(4) buffers (4 MiB). */
#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

/*
 * Create a ng_socket(4) node, connect our data hook to the configured
 * peer hook, make the data socket non-blocking, enlarge its buffers
 * and register a (disabled) read event with the event loop.
 * The data path is the same fd-based one used by the tap backend, so
 * struct tap_priv and tap_cleanup() are reused here.
 */
static int
ng_init(struct net_backend *be, const char *devname __unused,
	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = NET_BE_PRIV(be);
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	/* "path" names the netgraph node to attach to; it is mandatory. */
	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	/* Our hook name defaults to "vmlink". */
	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	/* Create the socket node; be->fd becomes the data socket. */
	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
		&ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	/* Wire our hook to the peer hook via the control socket. */
	if (NgSendMsg(ctrl_sock, ".",
		NGM_GENERIC_COOKIE,
		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	/* The control socket is only needed for the connect message. */
	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer's size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
		NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
	 * as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	/* Start disabled; the frontend enables RX when it is ready. */
	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	/* tap_cleanup() closes be->fd and deletes the event, if any. */
	tap_cleanup(be);
	return (-1);
}
576 
/*
 * Netgraph backend template: only init differs from the tap backend,
 * since the data path is the same fd-based one.
 */
static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);
592 
593 #endif /* NETGRAPH */
594 
595 /*
596  * The netmap backend
597  */
598 
/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

/* Per-instance state of the netmap (and vale) backends. */
struct netmap_priv {
	char ifname[IFNAMSIZ];	/* port name passed to nm_open() */
	struct nm_desc *nmd;	/* descriptor returned by nm_open() */
	uint16_t memid;		/* netmap memory id (req.nr_arg2) */
	struct netmap_ring *rx;	/* first RX ring of the port */
	struct netmap_ring *tx;	/* first TX ring of the port */
	struct mevent *mevp;	/* RX read event registered with mevent */
	net_be_rxeof_t cb;	/* frontend RX callback */
	void *cb_param;		/* opaque argument passed to 'cb' */
};
615 
616 static void
617 nmreq_init(struct nmreq *req, char *ifname)
618 {
619 
620 	memset(req, 0, sizeof(*req));
621 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
622 	req->nr_version = NETMAP_API;
623 }
624 
625 static int
626 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
627 {
628 	int err;
629 	struct nmreq req;
630 	struct netmap_priv *priv = NET_BE_PRIV(be);
631 
632 	nmreq_init(&req, priv->ifname);
633 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
634 	req.nr_arg1 = vnet_hdr_len;
635 	err = ioctl(be->fd, NIOCREGIF, &req);
636 	if (err) {
637 		WPRINTF(("Unable to set vnet header length %d",
638 				vnet_hdr_len));
639 		return (err);
640 	}
641 
642 	be->be_vnet_hdr_len = vnet_hdr_len;
643 
644 	return (0);
645 }
646 
647 static int
648 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
649 {
650 	unsigned prev_hdr_len = be->be_vnet_hdr_len;
651 	int ret;
652 
653 	if (vnet_hdr_len == prev_hdr_len) {
654 		return (1);
655 	}
656 
657 	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
658 	if (ret) {
659 		return (0);
660 	}
661 
662 	netmap_set_vnet_hdr_len(be, prev_hdr_len);
663 
664 	return (1);
665 }
666 
667 static uint64_t
668 netmap_get_cap(struct net_backend *be)
669 {
670 
671 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
672 	    NETMAP_FEATURES : 0);
673 }
674 
675 static int
676 netmap_set_cap(struct net_backend *be, uint64_t features __unused,
677     unsigned vnet_hdr_len)
678 {
679 
680 	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
681 }
682 
683 static int
684 netmap_init(struct net_backend *be, const char *devname,
685     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
686 {
687 	struct netmap_priv *priv = NET_BE_PRIV(be);
688 
689 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
690 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
691 
692 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
693 	if (priv->nmd == NULL) {
694 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
695 			devname, strerror(errno)));
696 		return (-1);
697 	}
698 
699 	priv->memid = priv->nmd->req.nr_arg2;
700 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
701 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
702 	priv->cb = cb;
703 	priv->cb_param = param;
704 	be->fd = priv->nmd->fd;
705 
706 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
707 	if (priv->mevp == NULL) {
708 		WPRINTF(("Could not register event"));
709 		return (-1);
710 	}
711 
712 	return (0);
713 }
714 
715 static void
716 netmap_cleanup(struct net_backend *be)
717 {
718 	struct netmap_priv *priv = NET_BE_PRIV(be);
719 
720 	if (priv->mevp) {
721 		mevent_delete(priv->mevp);
722 	}
723 	if (priv->nmd) {
724 		nm_close(priv->nmd);
725 	}
726 	be->fd = -1;
727 }
728 
/*
 * Copy the packet described by 'iov' into one or more TX slots,
 * chaining slots with NS_MOREFRAG when a fragment spans more than
 * one netmap buffer, then kick the ring with NIOCTXSYNC.
 * If the ring runs out of slots the packet is dropped; the length
 * accumulated so far is still returned (no error is reported, as on
 * a lossy link).
 */
static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
	    int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	uint8_t *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			/* Current slot is full: close it and chain a new one. */
			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				   count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
808 
809 static ssize_t
810 netmap_peek_recvlen(struct net_backend *be)
811 {
812 	struct netmap_priv *priv = NET_BE_PRIV(be);
813 	struct netmap_ring *ring = priv->rx;
814 	uint32_t head = ring->head;
815 	ssize_t totlen = 0;
816 
817 	while (head != ring->tail) {
818 		struct netmap_slot *slot = ring->slot + head;
819 
820 		totlen += slot->len;
821 		if ((slot->flags & NS_MOREFRAG) == 0)
822 			break;
823 		head = nm_ring_next(ring, head);
824 	}
825 
826 	return (totlen);
827 }
828 
/*
 * Reassemble the packet at the head of the RX ring (a chain of slots
 * linked by NS_MOREFRAG) into the caller's scatter-gather list.
 * Returns the packet length on success, 0 if the ring does not hold a
 * complete packet, or -ENOSPC if the iovec is too short to hold it.
 * Slots are released back to netmap only on full success.
 */
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		uint8_t *nm_buf;
		int nm_buf_len;

		/* Ring drained before the fragment chain completed. */
		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		/* Scatter this slot's payload across the iovec. */
		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}
895 
896 static void
897 netmap_recv_enable(struct net_backend *be)
898 {
899 	struct netmap_priv *priv = NET_BE_PRIV(be);
900 
901 	mevent_enable(priv->mevp);
902 }
903 
904 static void
905 netmap_recv_disable(struct net_backend *be)
906 {
907 	struct netmap_priv *priv = NET_BE_PRIV(be);
908 
909 	mevent_disable(priv->mevp);
910 }
911 
/* Backend template matching "netmap" port names. */
static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* Register both templates in the net_backend_set linker set. */
DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
943 
944 int
945 netbe_legacy_config(nvlist_t *nvl, const char *opts)
946 {
947 	char *backend, *cp;
948 
949 	if (opts == NULL)
950 		return (0);
951 
952 	cp = strchr(opts, ',');
953 	if (cp == NULL) {
954 		set_config_value_node(nvl, "backend", opts);
955 		return (0);
956 	}
957 	backend = strndup(opts, cp - opts);
958 	set_config_value_node(nvl, "backend", backend);
959 	free(backend);
960 	return (pci_parse_legacy_config(nvl, cp + 1));
961 }
962 
963 /*
964  * Initialize a backend and attach to the frontend.
965  * This is called during frontend initialization.
966  *  @ret is a pointer to the backend to be initialized
967  *  @devname is the backend-name as supplied on the command line,
968  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
969  *  @cb is the receive callback supplied by the frontend,
970  *	and it is invoked in the event loop when a receive
971  *	event is generated in the hypervisor,
972  *  @param is a pointer to the frontend, and normally used as
973  *	the argument for the callback.
974  */
975 int
976 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
977     void *param)
978 {
979 	struct net_backend **pbe, *nbe, *tbe = NULL;
980 	const char *value, *type;
981 	char *devname;
982 	int err;
983 
984 	value = get_config_value_node(nvl, "backend");
985 	if (value == NULL) {
986 		return (-1);
987 	}
988 	devname = strdup(value);
989 
990 	/*
991 	 * Use the type given by configuration if exists; otherwise
992 	 * use the prefix of the backend as the type.
993 	 */
994 	type = get_config_value_node(nvl, "type");
995 	if (type == NULL)
996 		type = devname;
997 
998 	/*
999 	 * Find the network backend that matches the user-provided
1000 	 * device name. net_backend_set is built using a linker set.
1001 	 */
1002 	SET_FOREACH(pbe, net_backend_set) {
1003 		if (strncmp(type, (*pbe)->prefix,
1004 		    strlen((*pbe)->prefix)) == 0) {
1005 			tbe = *pbe;
1006 			assert(tbe->init != NULL);
1007 			assert(tbe->cleanup != NULL);
1008 			assert(tbe->send != NULL);
1009 			assert(tbe->recv != NULL);
1010 			assert(tbe->get_cap != NULL);
1011 			assert(tbe->set_cap != NULL);
1012 			break;
1013 		}
1014 	}
1015 
1016 	*ret = NULL;
1017 	if (tbe == NULL) {
1018 		free(devname);
1019 		return (EINVAL);
1020 	}
1021 
1022 	nbe = calloc(1, NET_BE_SIZE(tbe));
1023 	*nbe = *tbe;	/* copy the template */
1024 	nbe->fd = -1;
1025 	nbe->sc = param;
1026 	nbe->be_vnet_hdr_len = 0;
1027 	nbe->fe_vnet_hdr_len = 0;
1028 
1029 	/* Initialize the backend. */
1030 	err = nbe->init(nbe, devname, nvl, cb, param);
1031 	if (err) {
1032 		free(devname);
1033 		free(nbe);
1034 		return (err);
1035 	}
1036 
1037 	*ret = nbe;
1038 	free(devname);
1039 
1040 	return (0);
1041 }
1042 
1043 void
1044 netbe_cleanup(struct net_backend *be)
1045 {
1046 
1047 	if (be != NULL) {
1048 		be->cleanup(be);
1049 		free(be);
1050 	}
1051 }
1052 
1053 uint64_t
1054 netbe_get_cap(struct net_backend *be)
1055 {
1056 
1057 	assert(be != NULL);
1058 	return (be->get_cap(be));
1059 }
1060 
1061 int
1062 netbe_set_cap(struct net_backend *be, uint64_t features,
1063 	      unsigned vnet_hdr_len)
1064 {
1065 	int ret;
1066 
1067 	assert(be != NULL);
1068 
1069 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
1070 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
1071 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
1072 		return (-1);
1073 
1074 	be->fe_vnet_hdr_len = vnet_hdr_len;
1075 
1076 	ret = be->set_cap(be, features, vnet_hdr_len);
1077 	assert(be->be_vnet_hdr_len == 0 ||
1078 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
1079 
1080 	return (ret);
1081 }
1082 
1083 ssize_t
1084 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
1085 {
1086 
1087 	return (be->send(be, iov, iovcnt));
1088 }
1089 
1090 ssize_t
1091 netbe_peek_recvlen(struct net_backend *be)
1092 {
1093 
1094 	return (be->peek_recvlen(be));
1095 }
1096 
1097 /*
1098  * Try to read a packet from the backend, without blocking.
1099  * If no packets are available, return 0. In case of success, return
1100  * the length of the packet just read. Return -1 in case of errors.
1101  */
1102 ssize_t
1103 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1104 {
1105 
1106 	return (be->recv(be, iov, iovcnt));
1107 }
1108 
1109 /*
1110  * Read a packet from the backend and discard it.
1111  * Returns the size of the discarded packet or zero if no packet was available.
1112  * A negative error code is returned in case of read error.
1113  */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return netbe_recv(be, &iov, 1);
}
1130 
1131 void
1132 netbe_rx_disable(struct net_backend *be)
1133 {
1134 
1135 	return be->recv_disable(be);
1136 }
1137 
1138 void
1139 netbe_rx_enable(struct net_backend *be)
1140 {
1141 
1142 	return be->recv_enable(be);
1143 }
1144 
1145 size_t
1146 netbe_get_vnet_hdr_len(struct net_backend *be)
1147 {
1148 
1149 	return (be->be_vnet_hdr_len);
1150 }
1151