xref: /illumos-gate/usr/src/cmd/bhyve/net_backends.c (revision b0de25cb)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * This file implements multiple network backends (tap, netmap, ...),
32  * to be used by network frontends such as virtio-net and e1000.
33  * The API to access the backend (e.g. send/receive packets, negotiate
34  * features) is exported by net_backends.h.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/types.h>		/* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
43 #endif
44 #include <sys/ioctl.h>
45 #include <sys/mman.h>
46 #include <sys/uio.h>
47 
48 #include <net/if.h>
49 #ifdef __FreeBSD__
50 #if defined(INET6) || defined(INET)
51 #include <net/if_tap.h>
52 #endif
53 #include <net/netmap.h>
54 #include <net/netmap_virt.h>
55 #define NETMAP_WITH_LIBS
56 #include <net/netmap_user.h>
57 #endif /* __FreeBSD__ */
58 
59 #ifndef WITHOUT_CAPSICUM
60 #include <capsicum_helpers.h>
61 #endif
62 #include <err.h>
63 #include <errno.h>
64 #include <fcntl.h>
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <stdint.h>
68 #include <string.h>
69 #include <unistd.h>
70 #include <sysexits.h>
71 #include <assert.h>
72 #include <pthread.h>
73 #include <pthread_np.h>
74 #include <poll.h>
75 #include <assert.h>
76 
77 #ifdef NETGRAPH
78 #include <sys/param.h>
79 #include <sys/sysctl.h>
80 #include <netgraph.h>
81 #endif
82 
83 #ifndef __FreeBSD__
84 #include <libdlpi.h>
85 #include <net/ethernet.h>
86 #endif
87 
88 #include "config.h"
89 #include "debug.h"
90 #include "iov.h"
91 #include "mevent.h"
92 #include "net_backends.h"
93 #include "pci_emul.h"
94 
95 #include <sys/linker_set.h>
96 
97 /*
98  * Each network backend registers a set of function pointers that are
99  * used to implement the net backends API.
100  * This might need to be exposed if we implement backends in separate files.
101  */
102 struct net_backend {
103 	const char *prefix;	/* prefix matching this backend */
104 
105 	/*
106 	 * Routines used to initialize and cleanup the resources needed
107 	 * by a backend. The cleanup function is used internally,
108 	 * and should not be called by the frontend.
109 	 */
110 	int (*init)(struct net_backend *be, const char *devname,
111 	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
112 	void (*cleanup)(struct net_backend *be);
113 
114 	/*
115 	 * Called to serve a guest transmit request. The scatter-gather
116 	 * vector provided by the caller has 'iovcnt' elements and contains
117 	 * the packet to send.
118 	 */
119 	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
120 	    int iovcnt);
121 
122 	/*
123 	 * Get the length of the next packet that can be received from
124 	 * the backend. If no packets are currently available, this
125 	 * function returns 0.
126 	 */
127 	ssize_t (*peek_recvlen)(struct net_backend *be);
128 
129 	/*
130 	 * Called to receive a packet from the backend. When the function
131 	 * returns a positive value 'len', the scatter-gather vector
132 	 * provided by the caller contains a packet with such length.
133 	 * The function returns 0 if the backend doesn't have a new packet to
134 	 * receive.
135 	 */
136 	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
137 	    int iovcnt);
138 
139 	/*
140 	 * Ask the backend to enable or disable receive operation in the
141 	 * backend. On return from a disable operation, it is guaranteed
142 	 * that the receive callback won't be called until receive is
143 	 * enabled again. Note however that it is up to the caller to make
144 	 * sure that netbe_recv() is not currently being executed by another
145 	 * thread.
146 	 */
147 	void (*recv_enable)(struct net_backend *be);
148 	void (*recv_disable)(struct net_backend *be);
149 
150 	/*
151 	 * Ask the backend for the virtio-net features it is able to
152 	 * support. Possible features are TSO, UFO and checksum offloading
153 	 * in both rx and tx direction and for both IPv4 and IPv6.
154 	 */
155 	uint64_t (*get_cap)(struct net_backend *be);
156 
157 	/*
158 	 * Tell the backend to enable/disable the specified virtio-net
159 	 * features (capabilities).
160 	 */
161 	int (*set_cap)(struct net_backend *be, uint64_t features,
162 	    unsigned int vnet_hdr_len);
163 
164 #ifndef __FreeBSD__
165 	int (*get_mac)(struct net_backend *be, void *, size_t *);
166 #endif
167 
168 	struct pci_vtnet_softc *sc;
169 	int fd;
170 
171 	/*
172 	 * Length of the virtio-net header used by the backend and the
173 	 * frontend, respectively. A zero value means that the header
174 	 * is not used.
175 	 */
176 	unsigned int be_vnet_hdr_len;
177 	unsigned int fe_vnet_hdr_len;
178 
179 	/* Size of backend-specific private data. */
180 	size_t priv_size;
181 
182 	/* Room for backend-specific data. */
183 	char opaque[0];
184 };
185 
/* Linker set through which every backend descriptor is registered. */
SET_DECLARE(net_backend_set, struct net_backend);

/* Length of the virtio-net header exchanged with the frontend. */
#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

/* Warning helper; 'params' must be a fully parenthesized argument list. */
#define WPRINTF(params) PRINTLN params
191 
192 #ifdef __FreeBSD__
193 
194 /*
195  * The tap backend
196  */
197 
#if defined(INET6) || defined(INET)
/*
 * Protocol families tried, in order, when tap_init() creates the
 * throwaway datagram socket used for SIOCGIFFLAGS/SIOCSIFFLAGS.
 */
const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif
208 
/* Private state for the tap backend (also reused by vmnet and netgraph). */
struct tap_priv {
	struct mevent *mevp;	/* read event registered with the event loop */
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;	/* bytes currently parked in bbuf; 0 = empty */
};
219 
220 static void
221 tap_cleanup(struct net_backend *be)
222 {
223 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
224 
225 	if (priv->mevp) {
226 		mevent_delete(priv->mevp);
227 	}
228 	if (be->fd != -1) {
229 		close(be->fd);
230 		be->fd = -1;
231 	}
232 }
233 
/*
 * Open and configure the tap device for use by the frontend: open
 * /dev/<devname> read/write, switch it to non-blocking mode, bring the
 * interface UP, restrict the descriptor under Capsicum, and register a
 * (disabled) read event handler with the event loop.
 * Returns 0 on success, -1 on failure (resources are cleaned up).
 */
static int
tap_init(struct net_backend *be, const char *devname,
	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	char tbuf[80];
	int opt = 1;
#if defined(INET6) || defined(INET)
	struct ifreq ifrq;
	int i, s;
#endif
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

#if defined(INET6) || defined(INET)
	/*
	 * Try to UP the interface rather than relying on
	 * net.link.tap.up_on_open.
	 */
	bzero(&ifrq, sizeof(ifrq));
	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
		WPRINTF(("Could not get interface name"));
		goto error;
	}

	/* Any INET/INET6 datagram socket will do for the SIOC ioctls. */
	s = -1;
	for (i = 0; s == -1 && i < nitems(pf_list); i++)
		s = socket(pf_list[i], SOCK_DGRAM, 0);
	if (s == -1) {
		WPRINTF(("Could open socket"));
		goto error;
	}

	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not get interface flags"));
		goto error;
	}
	ifrq.ifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not set interface flags"));
		goto error;
	}
	(void)close(s);
#endif

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	/* Registered disabled; the frontend enables rx when it is ready. */
	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}
326 
327 /*
328  * Called to send a buffer chain out to the tap device
329  */
330 static ssize_t
331 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
332 {
333 	return (writev(be->fd, iov, iovcnt));
334 }
335 
336 static ssize_t
337 tap_peek_recvlen(struct net_backend *be)
338 {
339 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
340 	ssize_t ret;
341 
342 	if (priv->bbuflen > 0) {
343 		/*
344 		 * We already have a packet in the bounce buffer.
345 		 * Just return its length.
346 		 */
347 		return priv->bbuflen;
348 	}
349 
350 	/*
351 	 * Read the next packet (if any) into the bounce buffer, so
352 	 * that we get to know its length and we can return that
353 	 * to the caller.
354 	 */
355 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
356 	if (ret < 0 && errno == EWOULDBLOCK) {
357 		return (0);
358 	}
359 
360 	if (ret > 0)
361 		priv->bbuflen = ret;
362 
363 	return (ret);
364 }
365 
366 static ssize_t
367 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
368 {
369 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
370 	ssize_t ret;
371 
372 	if (priv->bbuflen > 0) {
373 		/*
374 		 * A packet is available in the bounce buffer, so
375 		 * we read it from there.
376 		 */
377 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
378 		    iov, iovcnt, 0);
379 
380 		/* Mark the bounce buffer as empty. */
381 		priv->bbuflen = 0;
382 
383 		return (ret);
384 	}
385 
386 	ret = readv(be->fd, iov, iovcnt);
387 	if (ret < 0 && errno == EWOULDBLOCK) {
388 		return (0);
389 	}
390 
391 	return (ret);
392 }
393 
394 static void
395 tap_recv_enable(struct net_backend *be)
396 {
397 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
398 
399 	mevent_enable(priv->mevp);
400 }
401 
402 static void
403 tap_recv_disable(struct net_backend *be)
404 {
405 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
406 
407 	mevent_disable(priv->mevp);
408 }
409 
/* The tap backend currently advertises no virtio-net offloads. */
static uint64_t
tap_get_cap(struct net_backend *be)
{
	return (0);
}
416 
/* Only the empty feature set (and no vnet header) can be accepted. */
static int
tap_set_cap(struct net_backend *be, uint64_t features,
		unsigned vnet_hdr_len)
{
	if (features != 0 || vnet_hdr_len != 0)
		return (-1);
	return (0);
}
424 
/* Descriptor for the tap backend, registered in net_backend_set below. */
static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};
438 
/* A clone of the tap backend, with a different prefix ("vmnet"). */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};
453 
/* Register the tap-based backends with the linker set. */
DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
456 
457 #ifdef NETGRAPH
458 
459 /*
460  * Netgraph backend
461  */
462 
/* Upper bound for the ng_socket(4) send/receive buffer size (4 MiB). */
#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)
464 
/*
 * Initialize the Netgraph backend: create an ng_socket(4) node pair,
 * connect our data hook to the configured peer node/hook, switch the
 * data socket to non-blocking mode, enlarge its socket buffers, and
 * register a (disabled) read event handler.
 * Configuration keys read from 'nvl': "path" (required), "hook"
 * (defaults to "vmlink"), "peerhook" (required), "socket" (optional
 * node name). The data path reuses the tap_* routines, so this
 * backend shares struct tap_priv. Returns 0 on success, -1 on error.
 */
static int
ng_init(struct net_backend *be, const char *devname,
	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = (struct tap_priv *)be->opaque;
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	/* ngc was zeroed above, so the strncpy results stay NUL-terminated. */
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
		&ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".",
		NGM_GENERIC_COOKIE,
		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	/* The control socket is only needed to issue NGM_CONNECT. */
	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer's size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
		NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
	 * as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	/* Registered disabled; the frontend enables rx when it is ready. */
	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	/* tap_cleanup() closes be->fd and removes the event, if present. */
	tap_cleanup(be);
	return (-1);
}
591 
/* Descriptor for the Netgraph backend; data-path ops reuse the tap code. */
static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);
607 
608 #endif /* NETGRAPH */
609 
610 /*
611  * The netmap backend
612  */
613 
/*
 * The virtio-net features supported by netmap: checksum offload and
 * TSO/UFO in both host (tx) and guest (rx) directions.
 */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
619 
/* Private state for the netmap/vale backends. */
struct netmap_priv {
	char ifname[IFNAMSIZ];		/* port name passed to nm_open() */
	struct nm_desc *nmd;		/* descriptor returned by nm_open() */
	uint16_t memid;			/* netmap memory region id (nr_arg2) */
	struct netmap_ring *rx;		/* first RX ring */
	struct netmap_ring *tx;		/* first TX ring */
	struct mevent *mevp;		/* read event registered with mevent */
	net_be_rxeof_t cb;		/* frontend rx callback */
	void *cb_param;			/* opaque argument passed to cb */
};
630 
631 static void
632 nmreq_init(struct nmreq *req, char *ifname)
633 {
634 
635 	memset(req, 0, sizeof(*req));
636 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
637 	req->nr_version = NETMAP_API;
638 }
639 
640 static int
641 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
642 {
643 	int err;
644 	struct nmreq req;
645 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
646 
647 	nmreq_init(&req, priv->ifname);
648 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
649 	req.nr_arg1 = vnet_hdr_len;
650 	err = ioctl(be->fd, NIOCREGIF, &req);
651 	if (err) {
652 		WPRINTF(("Unable to set vnet header length %d",
653 				vnet_hdr_len));
654 		return (err);
655 	}
656 
657 	be->be_vnet_hdr_len = vnet_hdr_len;
658 
659 	return (0);
660 }
661 
662 static int
663 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
664 {
665 	int prev_hdr_len = be->be_vnet_hdr_len;
666 	int ret;
667 
668 	if (vnet_hdr_len == prev_hdr_len) {
669 		return (1);
670 	}
671 
672 	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
673 	if (ret) {
674 		return (0);
675 	}
676 
677 	netmap_set_vnet_hdr_len(be, prev_hdr_len);
678 
679 	return (1);
680 }
681 
682 static uint64_t
683 netmap_get_cap(struct net_backend *be)
684 {
685 
686 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
687 	    NETMAP_FEATURES : 0);
688 }
689 
/* Enabling features reduces to programming the vnet header length. */
static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{
	int rc;

	rc = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	return (rc);
}
697 
698 static int
699 netmap_init(struct net_backend *be, const char *devname,
700 	    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
701 {
702 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
703 
704 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
705 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
706 
707 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
708 	if (priv->nmd == NULL) {
709 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
710 			devname, strerror(errno)));
711 		free(priv);
712 		return (-1);
713 	}
714 
715 	priv->memid = priv->nmd->req.nr_arg2;
716 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
717 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
718 	priv->cb = cb;
719 	priv->cb_param = param;
720 	be->fd = priv->nmd->fd;
721 
722 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
723 	if (priv->mevp == NULL) {
724 		WPRINTF(("Could not register event"));
725 		return (-1);
726 	}
727 
728 	return (0);
729 }
730 
731 static void
732 netmap_cleanup(struct net_backend *be)
733 {
734 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
735 
736 	if (priv->mevp) {
737 		mevent_delete(priv->mevp);
738 	}
739 	if (priv->nmd) {
740 		nm_close(priv->nmd);
741 	}
742 	be->fd = -1;
743 }
744 
/*
 * Transmit a guest packet: copy the scatter-gather list into
 * consecutive slots of the first TX ring, chaining slots with
 * NS_MOREFRAG, then kick the ring with NIOCTXSYNC. If the ring runs
 * out of slots the packet is dropped (with a warning). In all cases
 * the total number of bytes described by 'iov' is returned.
 */
static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
	    int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;	/* space left in the current netmap slot */
	int nm_buf_len;		/* bytes written to the current slot so far */
	uint32_t head;
	void *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		/* Ring full: drop, but still kick pending slots below. */
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		int iov_frag_size = iov[j].iov_len;
		void *iov_frag_buf = iov[j].iov_base;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			/* NOTE: arithmetic on void * is a GCC/Clang extension. */
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				/* Fragment done; next slot not needed yet. */
				break;
			}

			/* Current slot is full: finalize it and advance. */
			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				   count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
824 
825 static ssize_t
826 netmap_peek_recvlen(struct net_backend *be)
827 {
828 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
829 	struct netmap_ring *ring = priv->rx;
830 	uint32_t head = ring->head;
831 	ssize_t totlen = 0;
832 
833 	while (head != ring->tail) {
834 		struct netmap_slot *slot = ring->slot + head;
835 
836 		totlen += slot->len;
837 		if ((slot->flags & NS_MOREFRAG) == 0)
838 			break;
839 		head = nm_ring_next(ring, head);
840 	}
841 
842 	return (totlen);
843 }
844 
/*
 * Receive one packet: copy the chain of RX slots (linked by
 * NS_MOREFRAG, terminated by a slot without it) into 'iov'. Returns
 * the packet length, 0 if the ring is empty, or -ENOSPC if 'iov' is
 * too small (in which case no slots are released back to netmap).
 */
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	void *iov_frag_buf;	/* write cursor within the current iovec */
	int iov_frag_size;	/* space left in the current iovec */
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		int nm_buf_len;
		void *nm_buf;

		if (head == ring->tail) {
			/* Ring empty (or ran dry mid-chain): no packet. */
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		/* Scatter this slot's payload across the iovec entries. */
		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			/* NOTE: void * arithmetic is a GCC/Clang extension. */
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}
911 
912 static void
913 netmap_recv_enable(struct net_backend *be)
914 {
915 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
916 
917 	mevent_enable(priv->mevp);
918 }
919 
920 static void
921 netmap_recv_disable(struct net_backend *be)
922 {
923 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
924 
925 	mevent_disable(priv->mevp);
926 }
927 
/* Descriptor for the netmap backend, registered in net_backend_set below. */
static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};
941 
/* A clone of the netmap backend, with a different prefix ("vale"). */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* Register both netmap-based backends with the linker set. */
DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
959 
960 #else /* __FreeBSD__ */
961 
962 /*
963  * The illumos dlpi backend
964  */
965 
966 /*
967  * The size of the bounce buffer used to implement the peek callback.
968  * This value should be big enough to accommodate the largest of all possible
969  * frontend packet lengths. The value here matches the definition of
970  * VTNET_MAX_PKT_LEN in pci_virtio_net.c
971  */
#define	DLPI_BBUF_SIZE (65536 + 64)

/* Private state for the illumos dlpi backend. */
typedef struct be_dlpi_priv {
	dlpi_handle_t bdp_dhp;		/* libdlpi handle for the VNIC */
	struct mevent *bdp_mevp;	/* read event registered with mevent */
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. Each structure is only used by a single thread so
	 * one is enough.
	 */
	uint8_t bdp_bbuf[DLPI_BBUF_SIZE];
	ssize_t bdp_bbuflen;		/* bytes parked in bdp_bbuf; 0 = empty */
} be_dlpi_priv_t;
985 
986 static void
987 be_dlpi_cleanup(net_backend_t *be)
988 {
989 	be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
990 
991 	if (priv->bdp_dhp != NULL)
992 		dlpi_close(priv->bdp_dhp);
993 	priv->bdp_dhp = NULL;
994 
995 	if (priv->bdp_mevp != NULL)
996 		mevent_delete(priv->bdp_mevp);
997 	priv->bdp_mevp = NULL;
998 
999 	priv->bdp_bbuflen = 0;
1000 	be->fd = -1;
1001 }
1002 
/* Log a dlpi failure, decoding 'ret' with dlpi_strerror(). */
static void
be_dlpi_err(int ret, const char *dev, char *msg)
{
	const char *errstr = dlpi_strerror(ret);

	WPRINTF(("%s: %s (%s)", dev, msg, errstr));
}
1008 
/*
 * Open the configured VNIC in raw DLPI mode, bind to all SAPs, enable
 * the requested promiscuous modes, make the descriptor non-blocking
 * and register a (disabled) read event handler.
 * Configuration keys read from 'nvl': "vnic" (required),
 * "promiscrxonly" (default true), "promiscphys" (default false),
 * "promiscsap" (default true), "promiscmulti" (default true).
 * Returns 0 on success, -1 on failure (resources are cleaned up).
 */
static int
be_dlpi_init(net_backend_t *be, const char *devname __unused,
     nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
	const char *vnic;
	int ret;

	if (cb == NULL) {
		WPRINTF(("dlpi backend requires non-NULL callback"));
		return (-1);
	}

	vnic = get_config_value_node(nvl, "vnic");
	if (vnic == NULL) {
		WPRINTF(("dlpi backend requires a VNIC"));
		return (-1);
	}

	priv->bdp_bbuflen = 0;

	ret = dlpi_open(vnic, &priv->bdp_dhp, DLPI_RAW);

	if (ret != DLPI_SUCCESS) {
		be_dlpi_err(ret, vnic, "open failed");
		goto error;
	}

	/* Bind to all SAPs so every ethertype is received. */
	if ((ret = dlpi_bind(priv->bdp_dhp, DLPI_ANY_SAP, NULL)) !=
	    DLPI_SUCCESS) {
		be_dlpi_err(ret, vnic, "bind failed");
		goto error;
	}

	if (get_config_bool_node_default(nvl, "promiscrxonly", true)) {
		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_RX_ONLY)) !=
		    DLPI_SUCCESS) {
			be_dlpi_err(ret, vnic,
			    "enable promiscuous mode(rxonly) failed");
			goto error;
		}
	}
	if (get_config_bool_node_default(nvl, "promiscphys", false)) {
		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_PHYS)) !=
		    DLPI_SUCCESS) {
			be_dlpi_err(ret, vnic,
			    "enable promiscuous mode(physical) failed");
			goto error;
		}
	}
	if (get_config_bool_node_default(nvl, "promiscsap", true)) {
		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_SAP)) !=
		    DLPI_SUCCESS) {
			be_dlpi_err(ret, vnic,
			    "enable promiscuous mode(SAP) failed");
			goto error;
		}
	}
	if (get_config_bool_node_default(nvl, "promiscmulti", true)) {
		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_MULTI)) !=
		    DLPI_SUCCESS) {
			be_dlpi_err(ret, vnic,
			    "enable promiscuous mode(muticast) failed");
			goto error;
		}
	}

        be->fd = dlpi_fd(priv->bdp_dhp);

        if (fcntl(be->fd, F_SETFL, O_NONBLOCK) < 0) {
                WPRINTF(("%s: enable O_NONBLOCK failed", vnic));
		goto error;
        }

	/* Registered disabled; the frontend enables rx when it is ready. */
	priv->bdp_mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->bdp_mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	be_dlpi_cleanup(be);
	return (-1);
}
1095 
1096 /*
1097  * Called to send a buffer chain out to the dlpi device
1098  */
1099 static ssize_t
1100 be_dlpi_send(net_backend_t *be, const struct iovec *iov, int iovcnt)
1101 {
1102 	be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
1103 	ssize_t len = 0;
1104 	int ret;
1105 
1106 	if (iovcnt == 1) {
1107 		len = iov[0].iov_len;
1108 		ret = dlpi_send(priv->bdp_dhp, NULL, 0, iov[0].iov_base, len,
1109 		    NULL);
1110 	} else {
1111 		void *buf = NULL;
1112 
1113 		len = iov_to_buf(iov, iovcnt, &buf);
1114 
1115 		if (len <= 0 || buf == NULL)
1116 			return (-1);
1117 
1118 		ret = dlpi_send(priv->bdp_dhp, NULL, 0, buf, len, NULL);
1119 		free(buf);
1120 	}
1121 
1122 	if (ret != DLPI_SUCCESS)
1123 		return (-1);
1124 
1125 	return (len);
1126 }
1127 
1128 static ssize_t
1129 be_dlpi_peek_recvlen(net_backend_t *be)
1130 {
1131 	be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
1132 	dlpi_recvinfo_t recv;
1133 	size_t len;
1134 	int ret;
1135 
1136 	/*
1137 	 * We already have a packet in the bounce buffer.
1138 	 * Just return its length.
1139 	 */
1140 	if (priv->bdp_bbuflen > 0)
1141 		return (priv->bdp_bbuflen);
1142 
1143 	/*
1144 	 * Read the next packet (if any) into the bounce buffer, so
1145 	 * that we get to know its length and we can return that
1146 	 * to the caller.
1147 	 */
1148 	len = sizeof (priv->bdp_bbuf);
1149 	ret = dlpi_recv(priv->bdp_dhp, NULL, NULL, priv->bdp_bbuf, &len,
1150 	    0, &recv);
1151 	if (ret == DL_SYSERR) {
1152 		if (errno == EWOULDBLOCK)
1153 			return (0);
1154 		return (-1);
1155 	} else if (ret == DLPI_ETIMEDOUT) {
1156 		return (0);
1157 	} else if (ret != DLPI_SUCCESS) {
1158 		return (-1);
1159 	}
1160 
1161 	if (recv.dri_totmsglen > sizeof (priv->bdp_bbuf)) {
1162 		EPRINTLN("DLPI bounce buffer was too small! - needed %x bytes",
1163 		    recv.dri_totmsglen);
1164 	}
1165 
1166 	priv->bdp_bbuflen = len;
1167 
1168 	return (len);
1169 }
1170 
1171 static ssize_t
1172 be_dlpi_recv(net_backend_t *be, const struct iovec *iov, int iovcnt)
1173 {
1174 	be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
1175 	size_t len;
1176 	int ret;
1177 
1178 	if (priv->bdp_bbuflen > 0) {
1179 		/*
1180 		 * A packet is available in the bounce buffer, so
1181 		 * we read it from there.
1182 		 */
1183 		len = buf_to_iov(priv->bdp_bbuf, priv->bdp_bbuflen,
1184 		    iov, iovcnt, 0);
1185 
1186 		/* Mark the bounce buffer as empty. */
1187 		priv->bdp_bbuflen = 0;
1188 
1189 		return (len);
1190 	}
1191 
1192 	len = iov[0].iov_len;
1193 	ret = dlpi_recv(priv->bdp_dhp, NULL, NULL,
1194 	    (uint8_t *)iov[0].iov_base, &len, 0, NULL);
1195 	if (ret == DL_SYSERR) {
1196 		if (errno == EWOULDBLOCK)
1197 			return (0);
1198 		return (-1);
1199 	} else if (ret == DLPI_ETIMEDOUT) {
1200 		return (0);
1201 	} else if (ret != DLPI_SUCCESS) {
1202 		return (-1);
1203 	}
1204 
1205 	return (len);
1206 }
1207 
1208 static void
1209 be_dlpi_recv_enable(net_backend_t *be)
1210 {
1211 	be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
1212 
1213 	mevent_enable(priv->bdp_mevp);
1214 }
1215 
1216 static void
1217 be_dlpi_recv_disable(net_backend_t *be)
1218 {
1219 	be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
1220 
1221 	mevent_disable(priv->bdp_mevp);
1222 }
1223 
1224 static uint64_t
1225 be_dlpi_get_cap(net_backend_t *be)
1226 {
1227 	return (0); /* no capabilities for now */
1228 }
1229 
1230 static int
1231 be_dlpi_set_cap(net_backend_t *be, uint64_t features,
1232     unsigned vnet_hdr_len)
1233 {
1234 	return ((features || vnet_hdr_len) ? -1 : 0);
1235 }
1236 
1237 static int
1238 be_dlpi_get_mac(net_backend_t *be, void *buf, size_t *buflen)
1239 {
1240 	be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
1241 	uchar_t physaddr[DLPI_PHYSADDR_MAX];
1242 	size_t physaddrlen = DLPI_PHYSADDR_MAX;
1243 	int ret;
1244 
1245 	if ((ret = dlpi_get_physaddr(priv->bdp_dhp, DL_CURR_PHYS_ADDR,
1246 	    physaddr, &physaddrlen)) != DLPI_SUCCESS) {
1247 		be_dlpi_err(ret, dlpi_linkname(priv->bdp_dhp),
1248 		    "read MAC address failed");
1249 		return (EINVAL);
1250 	}
1251 
1252 	if (physaddrlen != ETHERADDRL) {
1253 		WPRINTF(("%s: bad MAC address len %d",
1254 		    dlpi_linkname(priv->bdp_dhp), physaddrlen));
1255 		return (EINVAL);
1256 	}
1257 
1258 	if (physaddrlen > *buflen) {
1259 		WPRINTF(("%s: MAC address too long (%d bytes required)",
1260 		    dlpi_linkname(priv->bdp_dhp), physaddrlen));
1261 		return (ENOMEM);
1262 	}
1263 
1264 	*buflen = physaddrlen;
1265 	memcpy(buf, physaddr, *buflen);
1266 
1267 	return (0);
1268 }
1269 
/*
 * Backend descriptor for illumos DLPI links.  Registered on the
 * net_backend_set linker set below so netbe_init() can match it against
 * a "dlpi" device-name prefix.  No get-capability/set-capability
 * offloads are provided beyond the stubs above.
 */
static struct net_backend dlpi_backend = {
	.prefix = "dlpi",
	.priv_size = sizeof(struct be_dlpi_priv),
	.init = be_dlpi_init,
	.cleanup = be_dlpi_cleanup,
	.send = be_dlpi_send,
	.peek_recvlen = be_dlpi_peek_recvlen,
	.recv = be_dlpi_recv,
	.recv_enable = be_dlpi_recv_enable,
	.recv_disable = be_dlpi_recv_disable,
	.get_cap = be_dlpi_get_cap,
	.set_cap = be_dlpi_set_cap,
	.get_mac = be_dlpi_get_mac,
};

DATA_SET(net_backend_set, dlpi_backend);
1286 
1287 #endif /* __FreeBSD__ */
1288 
1289 #ifdef __FreeBSD__
1290 int
1291 netbe_legacy_config(nvlist_t *nvl, const char *opts)
1292 {
1293 	char *backend, *cp;
1294 
1295 	if (opts == NULL)
1296 		return (0);
1297 
1298 	cp = strchr(opts, ',');
1299 	if (cp == NULL) {
1300 		set_config_value_node(nvl, "backend", opts);
1301 		return (0);
1302 	}
1303 	backend = strndup(opts, cp - opts);
1304 	set_config_value_node(nvl, "backend", backend);
1305 	free(backend);
1306 	return (pci_parse_legacy_config(nvl, cp + 1));
1307 }
1308 #else
1309 int
1310 netbe_legacy_config(nvlist_t *nvl, const char *opts)
1311 {
1312 	char *config, *name, *tofree, *value;
1313 
1314 	if (opts == NULL)
1315 		return (0);
1316 
1317 	/* Default to the 'dlpi' backend - can still be overridden by opts */
1318 	set_config_value_node(nvl, "backend", "dlpi");
1319 
1320 	config = tofree = strdup(opts);
1321 	if (config == NULL)
1322 		err(4, "netbe_legacy_config strdup()");
1323 	while ((name = strsep(&config, ",")) != NULL) {
1324 		value = strchr(name, '=');
1325 		if (value != NULL) {
1326 			*value++ = '\0';
1327 			set_config_value_node(nvl, name, value);
1328 		} else {
1329 			set_config_value_node(nvl, "vnic", name);
1330 		}
1331 	}
1332 	free(tofree);
1333 
1334 	return (0);
1335 }
1336 #endif
1337 
1338 /*
1339  * Initialize a backend and attach to the frontend.
1340  * This is called during frontend initialization.
1341  *  @ret is a pointer to the backend to be initialized
1342  *  @devname is the backend-name as supplied on the command line,
1343  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
1344  *  @cb is the receive callback supplied by the frontend,
1345  *	and it is invoked in the event loop when a receive
1346  *	event is generated in the hypervisor,
1347  *  @param is a pointer to the frontend, and normally used as
1348  *	the argument for the callback.
1349  */
1350 int
1351 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
1352     void *param)
1353 {
1354 	struct net_backend **pbe, *nbe, *tbe = NULL;
1355 	const char *value;
1356 	char *devname;
1357 	int err;
1358 
1359 	value = get_config_value_node(nvl, "backend");
1360 	if (value == NULL) {
1361 		return (-1);
1362 	}
1363 	devname = strdup(value);
1364 
1365 	/*
1366 	 * Find the network backend that matches the user-provided
1367 	 * device name. net_backend_set is built using a linker set.
1368 	 */
1369 	SET_FOREACH(pbe, net_backend_set) {
1370 		if (strncmp(devname, (*pbe)->prefix,
1371 		    strlen((*pbe)->prefix)) == 0) {
1372 			tbe = *pbe;
1373 			assert(tbe->init != NULL);
1374 			assert(tbe->cleanup != NULL);
1375 			assert(tbe->send != NULL);
1376 			assert(tbe->recv != NULL);
1377 			assert(tbe->get_cap != NULL);
1378 			assert(tbe->set_cap != NULL);
1379 			break;
1380 		}
1381 	}
1382 
1383 	*ret = NULL;
1384 	if (tbe == NULL) {
1385 		free(devname);
1386 		return (EINVAL);
1387 	}
1388 
1389 	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
1390 	*nbe = *tbe;	/* copy the template */
1391 	nbe->fd = -1;
1392 	nbe->sc = param;
1393 	nbe->be_vnet_hdr_len = 0;
1394 	nbe->fe_vnet_hdr_len = 0;
1395 
1396 	/* Initialize the backend. */
1397 	err = nbe->init(nbe, devname, nvl, cb, param);
1398 	if (err) {
1399 		free(devname);
1400 		free(nbe);
1401 		return (err);
1402 	}
1403 
1404 	*ret = nbe;
1405 	free(devname);
1406 
1407 	return (0);
1408 }
1409 
1410 void
1411 netbe_cleanup(struct net_backend *be)
1412 {
1413 
1414 	if (be != NULL) {
1415 		be->cleanup(be);
1416 		free(be);
1417 	}
1418 }
1419 
1420 uint64_t
1421 netbe_get_cap(struct net_backend *be)
1422 {
1423 
1424 	assert(be != NULL);
1425 	return (be->get_cap(be));
1426 }
1427 
1428 int
1429 netbe_set_cap(struct net_backend *be, uint64_t features,
1430 	      unsigned vnet_hdr_len)
1431 {
1432 	int ret;
1433 
1434 	assert(be != NULL);
1435 
1436 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
1437 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
1438 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
1439 		return (-1);
1440 
1441 	be->fe_vnet_hdr_len = vnet_hdr_len;
1442 
1443 	ret = be->set_cap(be, features, vnet_hdr_len);
1444 	assert(be->be_vnet_hdr_len == 0 ||
1445 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
1446 
1447 	return (ret);
1448 }
1449 
1450 ssize_t
1451 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
1452 {
1453 
1454 	return (be->send(be, iov, iovcnt));
1455 }
1456 
1457 ssize_t
1458 netbe_peek_recvlen(struct net_backend *be)
1459 {
1460 
1461 	return (be->peek_recvlen(be));
1462 }
1463 
1464 /*
1465  * Try to read a packet from the backend, without blocking.
1466  * If no packets are available, return 0. In case of success, return
1467  * the length of the packet just read. Return -1 in case of errors.
1468  */
1469 ssize_t
1470 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1471 {
1472 
1473 	return (be->recv(be, iov, iovcnt));
1474 }
1475 
1476 /*
1477  * Read a packet from the backend and discard it.
1478  * Returns the size of the discarded packet or zero if no packet was available.
1479  * A negative error code is returned in case of read error.
1480  */
1481 ssize_t
1482 netbe_rx_discard(struct net_backend *be)
1483 {
1484 	/*
1485 	 * MP note: the dummybuf is only used to discard frames,
1486 	 * so there is no need for it to be per-vtnet or locked.
1487 	 * We only make it large enough for TSO-sized segment.
1488 	 */
1489 	static uint8_t dummybuf[65536 + 64];
1490 	struct iovec iov;
1491 
1492 #ifdef __FreeBSD__
1493 	iov.iov_base = dummybuf;
1494 #else
1495 	iov.iov_base = (caddr_t)dummybuf;
1496 #endif
1497 	iov.iov_len = sizeof(dummybuf);
1498 
1499 	return netbe_recv(be, &iov, 1);
1500 }
1501 
1502 void
1503 netbe_rx_disable(struct net_backend *be)
1504 {
1505 
1506 	return be->recv_disable(be);
1507 }
1508 
1509 void
1510 netbe_rx_enable(struct net_backend *be)
1511 {
1512 
1513 	return be->recv_enable(be);
1514 }
1515 
1516 size_t
1517 netbe_get_vnet_hdr_len(struct net_backend *be)
1518 {
1519 
1520 	return (be->be_vnet_hdr_len);
1521 }
1522 
1523 #ifndef __FreeBSD__
1524 int
1525 netbe_get_mac(net_backend_t *be, void *buf, size_t *buflen)
1526 {
1527 	if (be->get_mac == NULL)
1528 		return (ENOTSUP);
1529 	return (be->get_mac(be, buf, buflen));
1530 }
1531 #endif
1532