xref: /freebsd/sys/dev/netmap/netmap_vale.c (revision 19261079)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2013-2016 Universita` di Pisa
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *   2. Redistributions in binary form must reproduce the above copyright
13  *      notice, this list of conditions and the following disclaimer in the
14  *      documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 
30 #if defined(__FreeBSD__)
31 #include <sys/cdefs.h> /* prerequisite */
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/types.h>
35 #include <sys/errno.h>
36 #include <sys/param.h>	/* defines used in kernel.h */
37 #include <sys/kernel.h>	/* types used in module initialization */
38 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
39 #include <sys/sockio.h>
40 #include <sys/socketvar.h>	/* struct socket */
41 #include <sys/malloc.h>
42 #include <sys/poll.h>
43 #include <sys/rwlock.h>
44 #include <sys/socket.h> /* sockaddrs */
45 #include <sys/selinfo.h>
46 #include <sys/sysctl.h>
47 #include <net/if.h>
48 #include <net/if_var.h>
49 #include <net/bpf.h>		/* BIOCIMMEDIATE */
50 #include <machine/bus.h>	/* bus_dmamap_* */
51 #include <sys/endian.h>
52 #include <sys/refcount.h>
53 #include <sys/smp.h>
54 
55 
56 #elif defined(linux)
57 
58 #include "bsd_glue.h"
59 
60 #elif defined(__APPLE__)
61 
62 #warning OSX support is only partial
63 #include "osx_glue.h"
64 
65 #elif defined(_WIN32)
66 #include "win_glue.h"
67 
68 #else
69 
70 #error	Unsupported platform
71 
72 #endif /* unsupported */
73 
74 /*
75  * common headers
76  */
77 
78 #include <net/netmap.h>
79 #include <dev/netmap/netmap_kern.h>
80 #include <dev/netmap/netmap_mem2.h>
81 #include <dev/netmap/netmap_bdg.h>
82 
83 #ifdef WITH_VALE
84 
85 /*
86  * system parameters (most of them in netmap_kern.h)
87  * NM_BDG_NAME	prefix for switch port names, default "vale"
88  * NM_BDG_MAXPORTS	number of ports
89  * NM_BRIDGES	max number of switches in the system.
90  *	XXX should become a sysctl or tunable
91  *
92  * Switch ports are named valeX:Y where X is the switch name and Y
93  * is the port. If Y matches a physical interface name, the port is
94  * connected to a physical device.
95  *
96  * Unlike physical interfaces, switch ports use their own memory region
97  * for rings and buffers.
98  * The virtual interfaces use per-queue lock instead of core lock.
99  * In the tx loop, we aggregate traffic in batches to make all operations
100  * faster. The batch size is bridge_batch.
101  */
/* XXX tentative value; it is also used as a bit mask, so keep it a power of 2 */
#define NM_BDG_MAXRINGS		16
/* XXX tentative value, same as above */
#define NM_BDG_MAXSLOTS		4096
/* in the device */
#define NM_BRIDGE_RINGSIZE	1024
/* entries in the forwarding buffer */
#define NM_BDG_BATCH		1024
/*
 * Actual size of the tables: one batch plus NETMAP_MAX_FRAGS spare
 * entries.
 */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NETMAP_MAX_FRAGS)
/*
 * NM_FT_NULL terminates a list of slots in the ft; being equal to the
 * table size it can never be a valid index.
 */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
110 
111 
112 /*
113  * bridge_batch is set via sysctl to the max batch size to be
114  * used in the bridge. The actual value may be larger as the
115  * last packet in the block may overflow the size.
116  */
117 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
118 SYSBEGIN(vars_vale);
119 SYSCTL_DECL(_dev_netmap);
120 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
121 		"Max batch size to be used in the bridge");
122 SYSEND;
123 
124 static int netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *,
125 		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
126 static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *,
127 		struct nm_bridge *);
128 static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *);
129 
/*
 * Per-destination queue descriptor: for each output interface an
 * nm_vale_q describes a list of entries of the forwarding table.
 * The list is addressed by ft indexes, NM_FT_NULL meaning "empty".
 */
struct nm_vale_q {
	uint16_t bq_head;	/* ft index of the first packet in the list */
	uint16_t bq_tail;	/* ft index of the last packet in the list */
	uint32_t bq_len;	/* number of output buffers (we can have
				 * coalescing during the copy) */
};
140 
141 /* Holds the default callbacks */
142 struct netmap_bdg_ops vale_bdg_ops = {
143 	.lookup = netmap_vale_learning,
144 	.config = NULL,
145 	.dtor = NULL,
146 	.vp_create = netmap_vale_vp_create,
147 	.bwrap_attach = netmap_vale_bwrap_attach,
148 	.name = NM_BDG_NAME,
149 };
150 
151 /*
152  * this is a slightly optimized copy routine which rounds
153  * to multiple of 64 bytes and is often faster than dealing
154  * with other odd sizes. We assume there is enough room
155  * in the source and destination buffers.
156  *
157  * XXX only for multiples of NM_BUF_ALIGN bytes, non overlapped.
158  */
159 
160 static inline void
161 pkt_copy(void *_src, void *_dst, int l)
162 {
163 	uint64_t *src = _src;
164 	uint64_t *dst = _dst;
165 	if (unlikely(l >= 1024)) {
166 		memcpy(dst, src, l);
167 		return;
168 	}
169 	for (; likely(l > 0); l -= NM_BUF_ALIGN) {
170 		/* XXX NM_BUF_ALIGN/sizeof(uint64_t) statements */
171 		*dst++ = *src++;
172 		*dst++ = *src++;
173 		*dst++ = *src++;
174 		*dst++ = *src++;
175 		*dst++ = *src++;
176 		*dst++ = *src++;
177 		*dst++ = *src++;
178 		*dst++ = *src++;
179 	}
180 }
181 
182 
183 /*
184  * Free the forwarding tables for rings attached to switch ports.
185  */
186 static void
187 nm_free_bdgfwd(struct netmap_adapter *na)
188 {
189 	int nrings, i;
190 	struct netmap_kring **kring;
191 
192 	NMG_LOCK_ASSERT();
193 	nrings = na->num_tx_rings;
194 	kring = na->tx_rings;
195 	for (i = 0; i < nrings; i++) {
196 		if (kring[i]->nkr_ft) {
197 			nm_os_free(kring[i]->nkr_ft);
198 			kring[i]->nkr_ft = NULL; /* protect from freeing twice */
199 		}
200 	}
201 }
202 
203 
204 /*
205  * Allocate the forwarding tables for the rings attached to the bridge ports.
206  */
207 static int
208 nm_alloc_bdgfwd(struct netmap_adapter *na)
209 {
210 	int nrings, l, i, num_dstq;
211 	struct netmap_kring **kring;
212 
213 	NMG_LOCK_ASSERT();
214 	/* all port:rings + broadcast */
215 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
216 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
217 	l += sizeof(struct nm_vale_q) * num_dstq;
218 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
219 
220 	nrings = netmap_real_rings(na, NR_TX);
221 	kring = na->tx_rings;
222 	for (i = 0; i < nrings; i++) {
223 		struct nm_bdg_fwd *ft;
224 		struct nm_vale_q *dstq;
225 		int j;
226 
227 		ft = nm_os_malloc(l);
228 		if (!ft) {
229 			nm_free_bdgfwd(na);
230 			return ENOMEM;
231 		}
232 		dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
233 		for (j = 0; j < num_dstq; j++) {
234 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
235 			dstq[j].bq_len = 0;
236 		}
237 		kring[i]->nkr_ft = ft;
238 	}
239 	return 0;
240 }
241 
242 /* Allows external modules to create bridges in exclusive mode,
243  * returns an authentication token that the external module will need
244  * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
245  * and nm_bdg_update_private_data() operations.
246  * Successfully executed if ret != NULL and *return_status == 0.
247  */
248 void *
249 netmap_vale_create(const char *bdg_name, int *return_status)
250 {
251 	struct nm_bridge *b = NULL;
252 	void *ret = NULL;
253 
254 	NMG_LOCK();
255 	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
256 	if (b) {
257 		*return_status = EEXIST;
258 		goto unlock_bdg_create;
259 	}
260 
261 	b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
262 	if (!b) {
263 		*return_status = ENOMEM;
264 		goto unlock_bdg_create;
265 	}
266 
267 	b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
268 	ret = nm_bdg_get_auth_token(b);
269 	*return_status = 0;
270 
271 unlock_bdg_create:
272 	NMG_UNLOCK();
273 	return ret;
274 }
275 
276 /* Allows external modules to destroy a bridge created through
277  * netmap_bdg_create(), the bridge must be empty.
278  */
279 int
280 netmap_vale_destroy(const char *bdg_name, void *auth_token)
281 {
282 	struct nm_bridge *b = NULL;
283 	int ret = 0;
284 
285 	NMG_LOCK();
286 	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
287 	if (!b) {
288 		ret = ENXIO;
289 		goto unlock_bdg_free;
290 	}
291 
292 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
293 		ret = EACCES;
294 		goto unlock_bdg_free;
295 	}
296 	if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
297 		ret = EINVAL;
298 		goto unlock_bdg_free;
299 	}
300 
301 	b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
302 	ret = netmap_bdg_free(b);
303 	if (ret) {
304 		b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
305 	}
306 
307 unlock_bdg_free:
308 	NMG_UNLOCK();
309 	return ret;
310 }
311 
312 /* Process NETMAP_REQ_VALE_LIST. */
313 int
314 netmap_vale_list(struct nmreq_header *hdr)
315 {
316 	struct nmreq_vale_list *req =
317 		(struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
318 	int namelen = strlen(hdr->nr_name);
319 	struct nm_bridge *b, *bridges;
320 	struct netmap_vp_adapter *vpna;
321 	int error = 0, i, j;
322 	u_int num_bridges;
323 
324 	netmap_bns_getbridges(&bridges, &num_bridges);
325 
326 	/* this is used to enumerate bridges and ports */
327 	if (namelen) { /* look up indexes of bridge and port */
328 		if (strncmp(hdr->nr_name, NM_BDG_NAME,
329 					strlen(NM_BDG_NAME))) {
330 			return EINVAL;
331 		}
332 		NMG_LOCK();
333 		b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
334 		if (!b) {
335 			NMG_UNLOCK();
336 			return ENOENT;
337 		}
338 
339 		req->nr_bridge_idx = b - bridges; /* bridge index */
340 		req->nr_port_idx = NM_BDG_NOPORT;
341 		for (j = 0; j < b->bdg_active_ports; j++) {
342 			i = b->bdg_port_index[j];
343 			vpna = b->bdg_ports[i];
344 			if (vpna == NULL) {
345 				nm_prerr("This should not happen");
346 				continue;
347 			}
348 			/* the former and the latter identify a
349 			 * virtual port and a NIC, respectively
350 			 */
351 			if (!strcmp(vpna->up.name, hdr->nr_name)) {
352 				req->nr_port_idx = i; /* port index */
353 				break;
354 			}
355 		}
356 		NMG_UNLOCK();
357 	} else {
358 		/* return the first non-empty entry starting from
359 		 * bridge nr_arg1 and port nr_arg2.
360 		 *
361 		 * Users can detect the end of the same bridge by
362 		 * seeing the new and old value of nr_arg1, and can
363 		 * detect the end of all the bridge by error != 0
364 		 */
365 		i = req->nr_bridge_idx;
366 		j = req->nr_port_idx;
367 
368 		NMG_LOCK();
369 		for (error = ENOENT; i < NM_BRIDGES; i++) {
370 			b = bridges + i;
371 			for ( ; j < NM_BDG_MAXPORTS; j++) {
372 				if (b->bdg_ports[j] == NULL)
373 					continue;
374 				vpna = b->bdg_ports[j];
375 				/* write back the VALE switch name */
376 				strlcpy(hdr->nr_name, vpna->up.name,
377 					sizeof(hdr->nr_name));
378 				error = 0;
379 				goto out;
380 			}
381 			j = 0; /* following bridges scan from 0 */
382 		}
383 	out:
384 		req->nr_bridge_idx = i;
385 		req->nr_port_idx = j;
386 		NMG_UNLOCK();
387 	}
388 
389 	return error;
390 }
391 
392 
393 /* nm_dtor callback for ephemeral VALE ports */
394 static void
395 netmap_vale_vp_dtor(struct netmap_adapter *na)
396 {
397 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
398 	struct nm_bridge *b = vpna->na_bdg;
399 
400 	nm_prdis("%s has %d references", na->name, na->na_refcount);
401 
402 	if (b) {
403 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
404 	}
405 
406 	if (na->ifp != NULL && !nm_iszombie(na)) {
407 		NM_DETACH_NA(na->ifp);
408 		if (vpna->autodelete) {
409 			nm_prdis("releasing %s", na->ifp->if_xname);
410 			NMG_UNLOCK();
411 			nm_os_vi_detach(na->ifp);
412 			NMG_LOCK();
413 		}
414 	}
415 }
416 
417 
418 
419 /* nm_krings_create callback for VALE ports.
420  * Calls the standard netmap_krings_create, then adds leases on rx
421  * rings and bdgfwd on tx rings.
422  */
423 static int
424 netmap_vale_vp_krings_create(struct netmap_adapter *na)
425 {
426 	u_int tailroom;
427 	int error, i;
428 	uint32_t *leases;
429 	u_int nrx = netmap_real_rings(na, NR_RX);
430 
431 	/*
432 	 * Leases are attached to RX rings on vale ports
433 	 */
434 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
435 
436 	error = netmap_krings_create(na, tailroom);
437 	if (error)
438 		return error;
439 
440 	leases = na->tailroom;
441 
442 	for (i = 0; i < nrx; i++) { /* Receive rings */
443 		na->rx_rings[i]->nkr_leases = leases;
444 		leases += na->num_rx_desc;
445 	}
446 
447 	error = nm_alloc_bdgfwd(na);
448 	if (error) {
449 		netmap_krings_delete(na);
450 		return error;
451 	}
452 
453 	return 0;
454 }
455 
456 
/* nm_krings_delete callback for VALE ports.
 * The forwarding tables hang off the tx krings, so they are released
 * first and the krings themselves afterwards.
 */
static void
netmap_vale_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}
464 
465 
466 static int
467 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n,
468 	struct netmap_vp_adapter *na, u_int ring_nr);
469 
470 
471 /*
472  * main dispatch routine for the bridge.
473  * Grab packets from a kring, move them into the ft structure
474  * associated to the tx (input) port. Max one instance per port,
475  * filtered on input (ioctl, poll or XXX).
476  * Returns the next position in the ring.
477  */
478 static int
479 nm_vale_preflush(struct netmap_kring *kring, u_int end)
480 {
481 	struct netmap_vp_adapter *na =
482 		(struct netmap_vp_adapter*)kring->na;
483 	struct netmap_ring *ring = kring->ring;
484 	struct nm_bdg_fwd *ft;
485 	u_int ring_nr = kring->ring_id;
486 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
487 	u_int ft_i = 0;	/* start from 0 */
488 	u_int frags = 1; /* how many frags ? */
489 	struct nm_bridge *b = na->na_bdg;
490 
491 	/* To protect against modifications to the bridge we acquire a
492 	 * shared lock, waiting if we can sleep (if the source port is
493 	 * attached to a user process) or with a trylock otherwise (NICs).
494 	 */
495 	nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
496 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
497 		BDG_RLOCK(b);
498 	else if (!BDG_RTRYLOCK(b))
499 		return j;
500 	nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
501 	ft = kring->nkr_ft;
502 
503 	for (; likely(j != end); j = nm_next(j, lim)) {
504 		struct netmap_slot *slot = &ring->slot[j];
505 		char *buf;
506 
507 		ft[ft_i].ft_len = slot->len;
508 		ft[ft_i].ft_flags = slot->flags;
509 		ft[ft_i].ft_offset = 0;
510 
511 		nm_prdis("flags is 0x%x", slot->flags);
512 		/* we do not use the buf changed flag, but we still need to reset it */
513 		slot->flags &= ~NS_BUF_CHANGED;
514 
515 		/* this slot goes into a list so initialize the link field */
516 		ft[ft_i].ft_next = NM_FT_NULL;
517 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
518 			(void *)(uintptr_t)slot->ptr : NMB_O(kring, slot);
519 		if (unlikely(buf == NULL ||
520 		     slot->len > NETMAP_BUF_SIZE(&na->up) - nm_get_offset(kring, slot))) {
521 			nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d",
522 				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
523 				kring->name, j, ft[ft_i].ft_len);
524 			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
525 			ft[ft_i].ft_len = 0;
526 			ft[ft_i].ft_flags = 0;
527 		}
528 		__builtin_prefetch(buf);
529 		++ft_i;
530 		if (slot->flags & NS_MOREFRAG) {
531 			frags++;
532 			continue;
533 		}
534 		if (unlikely(netmap_verbose && frags > 1))
535 			nm_prlim(5, "%d frags at %d", frags, ft_i - frags);
536 		ft[ft_i - frags].ft_frags = frags;
537 		frags = 1;
538 		if (unlikely((int)ft_i >= bridge_batch))
539 			ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
540 	}
541 	if (frags > 1) {
542 		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
543 		 * have to fix frags count. */
544 		frags--;
545 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
546 		ft[ft_i - frags].ft_frags = frags;
547 		nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
548 	}
549 	if (ft_i)
550 		ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
551 	BDG_RUNLOCK(b);
552 	return j;
553 }
554 
555 
556 /* ----- FreeBSD if_bridge hash function ------- */
557 
558 /*
559  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
560  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
561  *
562  * http://www.burtleburtle.net/bob/hash/spooky.html
563  */
564 #define mix(a, b, c)                                                    \
565 do {                                                                    \
566 	a -= b; a -= c; a ^= (c >> 13);                                 \
567 	b -= c; b -= a; b ^= (a << 8);                                  \
568 	c -= a; c -= b; c ^= (b >> 13);                                 \
569 	a -= b; a -= c; a ^= (c >> 12);                                 \
570 	b -= c; b -= a; b ^= (a << 16);                                 \
571 	c -= a; c -= b; c ^= (b >> 5);                                  \
572 	a -= b; a -= c; a ^= (c >> 3);                                  \
573 	b -= c; b -= a; b ^= (a << 10);                                 \
574 	c -= a; c -= b; c ^= (b >> 15);                                 \
575 } while (/*CONSTCOND*/0)
576 
577 
578 static __inline uint32_t
579 nm_vale_rthash(const uint8_t *addr)
580 {
581 	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
582 
583 	b += addr[5] << 8;
584 	b += addr[4];
585 	a += addr[3] << 24;
586 	a += addr[2] << 16;
587 	a += addr[1] << 8;
588 	a += addr[0];
589 
590 	mix(a, b, c);
591 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
592 	return (c & BRIDGE_RTHASH_MASK);
593 }
594 
595 #undef mix
596 
597 
598 /*
599  * Lookup function for a learning bridge.
600  * Update the hash table with the source address,
601  * and then returns the destination port index, and the
602  * ring in *dst_ring (at the moment, always use ring 0)
603  */
604 uint32_t
605 netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
606 		struct netmap_vp_adapter *na, void *private_data)
607 {
608 	uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
609 	u_int buf_len = ft->ft_len - ft->ft_offset;
610 	struct nm_hash_ent *ht = private_data;
611 	uint32_t sh, dh;
612 	u_int dst, mysrc = na->bdg_port;
613 	uint64_t smac, dmac;
614 	uint8_t indbuf[12];
615 
616 	if (buf_len < 14) {
617 		return NM_BDG_NOPORT;
618 	}
619 
620 	if (ft->ft_flags & NS_INDIRECT) {
621 		if (copyin(buf, indbuf, sizeof(indbuf))) {
622 			return NM_BDG_NOPORT;
623 		}
624 		buf = indbuf;
625 	}
626 
627 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
628 	smac = le64toh(*(uint64_t *)(buf + 4));
629 	smac >>= 16;
630 
631 	/*
632 	 * The hash is somewhat expensive, there might be some
633 	 * worthwhile optimizations here.
634 	 */
635 	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
636 		uint8_t *s = buf+6;
637 		sh = nm_vale_rthash(s); /* hash of source */
638 		/* update source port forwarding entry */
639 		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
640 		ht[sh].ports = mysrc;
641 		if (netmap_debug & NM_DEBUG_VALE)
642 		    nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
643 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
644 	}
645 	dst = NM_BDG_BROADCAST;
646 	if ((buf[0] & 1) == 0) { /* unicast */
647 		dh = nm_vale_rthash(buf); /* hash of dst */
648 		if (ht[dh].mac == dmac) {	/* found dst */
649 			dst = ht[dh].ports;
650 		}
651 	}
652 	return dst;
653 }
654 
655 
656 /*
657  * Available space in the ring. Only used in VALE code
658  * and only with is_rx = 1
659  */
660 static inline uint32_t
661 nm_kr_space(struct netmap_kring *k, int is_rx)
662 {
663 	int space;
664 
665 	if (is_rx) {
666 		int busy = k->nkr_hwlease - k->nr_hwcur;
667 		if (busy < 0)
668 			busy += k->nkr_num_slots;
669 		space = k->nkr_num_slots - 1 - busy;
670 	} else {
671 		/* XXX never used in this branch */
672 		space = k->nr_hwtail - k->nkr_hwlease;
673 		if (space < 0)
674 			space += k->nkr_num_slots;
675 	}
676 #if 0
677 	// sanity check
678 	if (k->nkr_hwlease >= k->nkr_num_slots ||
679 		k->nr_hwcur >= k->nkr_num_slots ||
680 		k->nr_tail >= k->nkr_num_slots ||
681 		busy < 0 ||
682 		busy >= k->nkr_num_slots) {
683 		nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
684 		    k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
685 		    k->nkr_lease_idx, k->nkr_num_slots);
686 	}
687 #endif
688 	return space;
689 }
690 
691 
692 
693 
694 /* make a lease on the kring for N positions. return the
695  * lease index
696  * XXX only used in VALE code and with is_rx = 1
697  */
698 static inline uint32_t
699 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
700 {
701 	uint32_t lim = k->nkr_num_slots - 1;
702 	uint32_t lease_idx = k->nkr_lease_idx;
703 
704 	k->nkr_leases[lease_idx] = NR_NOSLOT;
705 	k->nkr_lease_idx = nm_next(lease_idx, lim);
706 
707 #ifdef CONFIG_NETMAP_DEBUG
708 	if (n > nm_kr_space(k, is_rx)) {
709 		nm_prerr("invalid request for %d slots", n);
710 		panic("x");
711 	}
712 #endif /* CONFIG NETMAP_DEBUG */
713 	/* XXX verify that there are n slots */
714 	k->nkr_hwlease += n;
715 	if (k->nkr_hwlease > lim)
716 		k->nkr_hwlease -= lim + 1;
717 
718 #ifdef CONFIG_NETMAP_DEBUG
719 	if (k->nkr_hwlease >= k->nkr_num_slots ||
720 		k->nr_hwcur >= k->nkr_num_slots ||
721 		k->nr_hwtail >= k->nkr_num_slots ||
722 		k->nkr_lease_idx >= k->nkr_num_slots) {
723 		nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
724 			k->na->name,
725 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
726 			k->nkr_lease_idx, k->nkr_num_slots);
727 	}
728 #endif /* CONFIG_NETMAP_DEBUG */
729 	return lease_idx;
730 }
731 
732 /*
733  *
734  * This flush routine supports only unicast and broadcast but a large
735  * number of ports, and lets us replace the learn and dispatch functions.
736  */
737 int
738 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
739 		u_int ring_nr)
740 {
741 	struct nm_vale_q *dst_ents, *brddst;
742 	uint16_t num_dsts = 0, *dsts;
743 	struct nm_bridge *b = na->na_bdg;
744 	u_int i, me = na->bdg_port;
745 
746 	/*
747 	 * The work area (pointed by ft) is followed by an array of
748 	 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
749 	 * queues per port plus one for the broadcast traffic.
750 	 * Then we have an array of destination indexes.
751 	 */
752 	dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
753 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
754 
755 	/* first pass: find a destination for each packet in the batch */
756 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
757 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
758 		uint16_t dst_port, d_i;
759 		struct nm_vale_q *d;
760 		struct nm_bdg_fwd *start_ft = NULL;
761 
762 		nm_prdis("slot %d frags %d", i, ft[i].ft_frags);
763 
764 		if (na->up.virt_hdr_len < ft[i].ft_len) {
765 			ft[i].ft_offset = na->up.virt_hdr_len;
766 			start_ft = &ft[i];
767 		} else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
768 			ft[i].ft_offset = ft[i].ft_len;
769 			start_ft = &ft[i+1];
770 		} else {
771 			/* Drop the packet if the virtio-net header is not into the first
772 			 * fragment nor at the very beginning of the second.
773 			 */
774 			continue;
775 		}
776 		dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data);
777 		if (netmap_verbose > 255)
778 			nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port);
779 		if (dst_port >= NM_BDG_NOPORT)
780 			continue; /* this packet is identified to be dropped */
781 		else if (dst_port == NM_BDG_BROADCAST)
782 			dst_ring = 0; /* broadcasts always go to ring 0 */
783 		else if (unlikely(dst_port == me ||
784 		    !b->bdg_ports[dst_port]))
785 			continue;
786 
787 		/* get a position in the scratch pad */
788 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
789 		d = dst_ents + d_i;
790 
791 		/* append the first fragment to the list */
792 		if (d->bq_head == NM_FT_NULL) { /* new destination */
793 			d->bq_head = d->bq_tail = i;
794 			/* remember this position to be scanned later */
795 			if (dst_port != NM_BDG_BROADCAST)
796 				dsts[num_dsts++] = d_i;
797 		} else {
798 			ft[d->bq_tail].ft_next = i;
799 			d->bq_tail = i;
800 		}
801 		d->bq_len += ft[i].ft_frags;
802 	}
803 
804 	/*
805 	 * Broadcast traffic goes to ring 0 on all destinations.
806 	 * So we need to add these rings to the list of ports to scan.
807 	 */
808 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
809 	if (brddst->bq_head != NM_FT_NULL) {
810 		u_int j;
811 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
812 			uint16_t d_i;
813 			i = b->bdg_port_index[j];
814 			if (unlikely(i == me))
815 				continue;
816 			d_i = i * NM_BDG_MAXRINGS;
817 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
818 				dsts[num_dsts++] = d_i;
819 		}
820 	}
821 
822 	nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
823 	/* second pass: scan destinations */
824 	for (i = 0; i < num_dsts; i++) {
825 		struct netmap_vp_adapter *dst_na;
826 		struct netmap_kring *kring;
827 		struct netmap_ring *ring;
828 		u_int dst_nr, lim, j, d_i, next, brd_next;
829 		u_int needed, howmany;
830 		int retry = netmap_txsync_retry;
831 		struct nm_vale_q *d;
832 		uint32_t my_start = 0, lease_idx = 0;
833 		int nrings;
834 		int virt_hdr_mismatch = 0;
835 
836 		d_i = dsts[i];
837 		nm_prdis("second pass %d port %d", i, d_i);
838 		d = dst_ents + d_i;
839 		// XXX fix the division
840 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
841 		/* protect from the lookup function returning an inactive
842 		 * destination port
843 		 */
844 		if (unlikely(dst_na == NULL))
845 			goto cleanup;
846 		if (dst_na->up.na_flags & NAF_SW_ONLY)
847 			goto cleanup;
848 		/*
849 		 * The interface may be in !netmap mode in two cases:
850 		 * - when na is attached but not activated yet;
851 		 * - when na is being deactivated but is still attached.
852 		 */
853 		if (unlikely(!nm_netmap_on(&dst_na->up))) {
854 			nm_prdis("not in netmap mode!");
855 			goto cleanup;
856 		}
857 
858 		/* there is at least one either unicast or broadcast packet */
859 		brd_next = brddst->bq_head;
860 		next = d->bq_head;
861 		/* we need to reserve this many slots. If fewer are
862 		 * available, some packets will be dropped.
863 		 * Packets may have multiple fragments, so
864 		 * there is a chance that we may not use all of the slots
865 		 * we have claimed, so we will need to handle the leftover
866 		 * ones when we regain the lock.
867 		 */
868 		needed = d->bq_len + brddst->bq_len;
869 
870 		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
871 			if (netmap_verbose) {
872 				nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
873 						dst_na->up.virt_hdr_len);
874 			}
875 			/* There is a virtio-net header/offloadings mismatch between
876 			 * source and destination. The slower mismatch datapath will
877 			 * be used to cope with all the mismatches.
878 			 */
879 			virt_hdr_mismatch = 1;
880 			if (dst_na->mfs < na->mfs) {
881 				/* We may need to do segmentation offloadings, and so
882 				 * we may need a number of destination slots greater
883 				 * than the number of input slots ('needed').
884 				 * We look for the smallest integer 'x' which satisfies:
885 				 *	needed * na->mfs + x * H <= x * na->mfs
886 				 * where 'H' is the length of the longest header that may
887 				 * be replicated in the segmentation process (e.g. for
888 				 * TCPv4 we must account for ethernet header, IP header
889 				 * and TCPv4 header).
890 				 */
891 				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
892 				needed = (needed * na->mfs) /
893 						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
894 				nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
895 			}
896 		}
897 
898 		nm_prdis(5, "pass 2 dst %d is %x %s",
899 			i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual");
900 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
901 		nrings = dst_na->up.num_rx_rings;
902 		if (dst_nr >= nrings)
903 			dst_nr = dst_nr % nrings;
904 		kring = dst_na->up.rx_rings[dst_nr];
905 		ring = kring->ring;
906 		/* the destination ring may have not been opened for RX */
907 		if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
908 			goto cleanup;
909 		lim = kring->nkr_num_slots - 1;
910 
911 retry:
912 
913 		if (dst_na->retry && retry) {
914 			/* try to get some free slot from the previous run */
915 			kring->nm_notify(kring, NAF_FORCE_RECLAIM);
916 			/* actually useful only for bwraps, since there
917 			 * the notify will trigger a txsync on the hwna. VALE ports
918 			 * have dst_na->retry == 0
919 			 */
920 		}
921 		/* reserve the buffers in the queue and an entry
922 		 * to report completion, and drop lock.
923 		 * XXX this might become a helper function.
924 		 */
925 		mtx_lock(&kring->q_lock);
926 		if (kring->nkr_stopped) {
927 			mtx_unlock(&kring->q_lock);
928 			goto cleanup;
929 		}
930 		my_start = j = kring->nkr_hwlease;
931 		howmany = nm_kr_space(kring, 1);
932 		if (needed < howmany)
933 			howmany = needed;
934 		lease_idx = nm_kr_lease(kring, howmany, 1);
935 		mtx_unlock(&kring->q_lock);
936 
937 		/* only retry if we need more than available slots */
938 		if (retry && needed <= howmany)
939 			retry = 0;
940 
941 		/* copy to the destination queue */
942 		while (howmany > 0) {
943 			struct netmap_slot *slot;
944 			struct nm_bdg_fwd *ft_p, *ft_end;
945 			u_int cnt;
946 
947 			/* find the queue from which we pick next packet.
948 			 * NM_FT_NULL is always higher than valid indexes
949 			 * so we never dereference it if the other list
950 			 * has packets (and if both are empty we never
951 			 * get here).
952 			 */
953 			if (next < brd_next) {
954 				ft_p = ft + next;
955 				next = ft_p->ft_next;
956 			} else { /* insert broadcast */
957 				ft_p = ft + brd_next;
958 				brd_next = ft_p->ft_next;
959 			}
960 			cnt = ft_p->ft_frags; // cnt > 0
961 			if (unlikely(cnt > howmany))
962 			    break; /* no more space */
963 			if (netmap_verbose && cnt > 1)
964 				nm_prlim(5, "rx %d frags to %d", cnt, j);
965 			ft_end = ft_p + cnt;
966 			if (unlikely(virt_hdr_mismatch)) {
967 				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
968 			} else {
969 				howmany -= cnt;
970 				do {
971 					char *dst, *src = ft_p->ft_buf;
972 					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
973 					uintptr_t src_cb;
974 					uint64_t dstoff, dstoff_cb;
975 					int src_co, dst_co;
976 					const uintptr_t mask = NM_BUF_ALIGN - 1;
977 
978 					slot = &ring->slot[j];
979 					dst = NMB(&dst_na->up, slot);
980 					dstoff = nm_get_offset(kring, slot);
981 					dstoff_cb = dstoff & ~mask;
982 					src_cb = ((uintptr_t)src) & ~mask;
983 					src_co = ((uintptr_t)src) & mask;
984 					dst_co = ((uintptr_t)(dst + dstoff)) & mask;
985 					if (dst_co < src_co) {
986 						dstoff_cb += NM_BUF_ALIGN;
987 					}
988 					dstoff = dstoff_cb + src_co;
989 					copy_len += src_co;
990 
991 					nm_prdis("send [%d] %d(%d) bytes at %s:%d",
992 							i, (int)copy_len, (int)dst_len,
993 							NM_IFPNAME(dst_ifp), j);
994 
995 					if (unlikely(dstoff > NETMAP_BUF_SIZE(&dst_na->up) ||
996 				                     dst_len > NETMAP_BUF_SIZE(&dst_na->up) - dstoff)) {
997 						nm_prlim(5, "dropping packet/fragment of len %zu, dest offset %llu",
998 								dst_len, (unsigned long long)dstoff);
999 						copy_len = dst_len = 0;
1000 						dstoff = nm_get_offset(kring, slot);
1001 					}
1002 
1003 					if (ft_p->ft_flags & NS_INDIRECT) {
1004 						if (copyin(src, dst, copy_len)) {
1005 							// invalid user pointer, pretend len is 0
1006 							dst_len = 0;
1007 						}
1008 					} else {
1009 						//memcpy(dst, src, copy_len);
1010 						pkt_copy((char *)src_cb, dst + dstoff_cb, (int)copy_len);
1011 					}
1012 					slot->len = dst_len;
1013 					slot->flags = (cnt << 8)| NS_MOREFRAG;
1014 					nm_write_offset(kring, slot, dstoff);
1015 					j = nm_next(j, lim);
1016 					needed--;
1017 					ft_p++;
1018 				} while (ft_p != ft_end);
1019 				slot->flags = (cnt << 8); /* clear flag on last entry */
1020 			}
1021 			/* are we done ? */
1022 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1023 				break;
1024 		}
1025 		{
1026 		    /* current position */
1027 		    uint32_t *p = kring->nkr_leases; /* shorthand */
1028 		    uint32_t update_pos;
1029 		    int still_locked = 1;
1030 
1031 		    mtx_lock(&kring->q_lock);
1032 		    if (unlikely(howmany > 0)) {
1033 			/* not used all bufs. If i am the last one
1034 			 * i can recover the slots, otherwise must
1035 			 * fill them with 0 to mark empty packets.
1036 			 */
1037 			nm_prdis("leftover %d bufs", howmany);
1038 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1039 			    /* yes i am the last one */
1040 			    nm_prdis("roll back nkr_hwlease to %d", j);
1041 			    kring->nkr_hwlease = j;
1042 			} else {
1043 			    while (howmany-- > 0) {
1044 				ring->slot[j].len = 0;
1045 				ring->slot[j].flags = 0;
1046 				j = nm_next(j, lim);
1047 			    }
1048 			}
1049 		    }
1050 		    p[lease_idx] = j; /* report I am done */
1051 
1052 		    update_pos = kring->nr_hwtail;
1053 
1054 		    if (my_start == update_pos) {
1055 			/* all slots before my_start have been reported,
1056 			 * so scan subsequent leases to see if other ranges
1057 			 * have been completed, and to a selwakeup or txsync.
1058 		         */
1059 			while (lease_idx != kring->nkr_lease_idx &&
1060 				p[lease_idx] != NR_NOSLOT) {
1061 			    j = p[lease_idx];
1062 			    p[lease_idx] = NR_NOSLOT;
1063 			    lease_idx = nm_next(lease_idx, lim);
1064 			}
1065 			/* j is the new 'write' position. j != my_start
1066 			 * means there are new buffers to report
1067 			 */
1068 			if (likely(j != my_start)) {
1069 				kring->nr_hwtail = j;
1070 				still_locked = 0;
1071 				mtx_unlock(&kring->q_lock);
1072 				kring->nm_notify(kring, 0);
1073 				/* this is netmap_notify for VALE ports and
1074 				 * netmap_bwrap_notify for bwrap. The latter will
1075 				 * trigger a txsync on the underlying hwna
1076 				 */
1077 				if (dst_na->retry && retry--) {
1078 					/* XXX this is going to call nm_notify again.
1079 					 * Only useful for bwrap in virtual machines
1080 					 */
1081 					goto retry;
1082 				}
1083 			}
1084 		    }
1085 		    if (still_locked)
1086 			mtx_unlock(&kring->q_lock);
1087 		}
1088 cleanup:
1089 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1090 		d->bq_len = 0;
1091 	}
1092 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1093 	brddst->bq_len = 0;
1094 	return 0;
1095 }
1096 
1097 /* nm_txsync callback for VALE ports */
1098 static int
1099 netmap_vale_vp_txsync(struct netmap_kring *kring, int flags)
1100 {
1101 	struct netmap_vp_adapter *na =
1102 		(struct netmap_vp_adapter *)kring->na;
1103 	u_int done;
1104 	u_int const lim = kring->nkr_num_slots - 1;
1105 	u_int const head = kring->rhead;
1106 
1107 	if (bridge_batch <= 0) { /* testing only */
1108 		done = head; // used all
1109 		goto done;
1110 	}
1111 	if (!na->na_bdg) {
1112 		done = head;
1113 		goto done;
1114 	}
1115 	if (bridge_batch > NM_BDG_BATCH)
1116 		bridge_batch = NM_BDG_BATCH;
1117 
1118 	done = nm_vale_preflush(kring, head);
1119 done:
1120 	if (done != head)
1121 		nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
1122 	/*
1123 	 * packets between 'done' and 'cur' are left unsent.
1124 	 */
1125 	kring->nr_hwcur = done;
1126 	kring->nr_hwtail = nm_prev(done, lim);
1127 	if (netmap_debug & NM_DEBUG_TXSYNC)
1128 		nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
1129 	return 0;
1130 }
1131 
1132 
1133 /* create a netmap_vp_adapter that describes a VALE port.
1134  * Only persistent VALE ports have a non-null ifp.
1135  */
1136 static int
1137 netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
1138 		struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
1139 {
1140 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1141 	struct netmap_vp_adapter *vpna;
1142 	struct netmap_adapter *na;
1143 	int error = 0;
1144 	u_int npipes = 0;
1145 	u_int extrabufs = 0;
1146 
1147 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1148 		return EINVAL;
1149 	}
1150 
1151 	vpna = nm_os_malloc(sizeof(*vpna));
1152 	if (vpna == NULL)
1153 		return ENOMEM;
1154 
1155  	na = &vpna->up;
1156 
1157 	na->ifp = ifp;
1158 	strlcpy(na->name, hdr->nr_name, sizeof(na->name));
1159 
1160 	/* bound checking */
1161 	na->num_tx_rings = req->nr_tx_rings;
1162 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1163 	req->nr_tx_rings = na->num_tx_rings; /* write back */
1164 	na->num_rx_rings = req->nr_rx_rings;
1165 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1166 	req->nr_rx_rings = na->num_rx_rings; /* write back */
1167 	nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1168 			1, NM_BDG_MAXSLOTS, NULL);
1169 	na->num_tx_desc = req->nr_tx_slots;
1170 	nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1171 			1, NM_BDG_MAXSLOTS, NULL);
1172 	/* validate number of pipes. We want at least 1,
1173 	 * but probably can do with some more.
1174 	 * So let's use 2 as default (when 0 is supplied)
1175 	 */
1176 	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1177 	/* validate extra bufs */
1178 	extrabufs = req->nr_extra_bufs;
1179 	nm_bound_var(&extrabufs, 0, 0,
1180 			128*NM_BDG_MAXSLOTS, NULL);
1181 	req->nr_extra_bufs = extrabufs; /* write back */
1182 	na->num_rx_desc = req->nr_rx_slots;
1183 	/* Set the mfs to a default value, as it is needed on the VALE
1184 	 * mismatch datapath. XXX We should set it according to the MTU
1185 	 * known to the kernel. */
1186 	vpna->mfs = NM_BDG_MFS_DEFAULT;
1187 	vpna->last_smac = ~0llu;
1188 	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
1189 		vpna->mfs = netmap_buf_size; */
1190 	if (netmap_verbose)
1191 		nm_prinf("max frame size %u", vpna->mfs);
1192 
1193 	na->na_flags |= (NAF_BDG_MAYSLEEP | NAF_OFFSETS);
1194 	/* persistent VALE ports look like hw devices
1195 	 * with a native netmap adapter
1196 	 */
1197 	if (ifp)
1198 		na->na_flags |= NAF_NATIVE;
1199 	na->nm_txsync = netmap_vale_vp_txsync;
1200 	na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */
1201 	na->nm_register = netmap_vp_reg;  /* use the one provided by bdg */
1202 	na->nm_krings_create = netmap_vale_vp_krings_create;
1203 	na->nm_krings_delete = netmap_vale_vp_krings_delete;
1204 	na->nm_dtor = netmap_vale_vp_dtor;
1205 	nm_prdis("nr_mem_id %d", req->nr_mem_id);
1206 	na->nm_mem = nmd ?
1207 		netmap_mem_get(nmd):
1208 		netmap_mem_private_new(
1209 			na->num_tx_rings, na->num_tx_desc,
1210 			na->num_rx_rings, na->num_rx_desc,
1211 			req->nr_extra_bufs, npipes, &error);
1212 	if (na->nm_mem == NULL)
1213 		goto err;
1214 	na->nm_bdg_attach = netmap_vale_vp_bdg_attach;
1215 	/* other nmd fields are set in the common routine */
1216 	error = netmap_attach_common(na);
1217 	if (error)
1218 		goto err;
1219 	*ret = vpna;
1220 	return 0;
1221 
1222 err:
1223 	if (na->nm_mem != NULL)
1224 		netmap_mem_put(na->nm_mem);
1225 	nm_os_free(vpna);
1226 	return error;
1227 }
1228 
1229 /* nm_bdg_attach callback for VALE ports
1230  * The na_vp port is this same netmap_adapter. There is no host port.
1231  */
1232 static int
1233 netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na,
1234 		struct nm_bridge *b)
1235 {
1236 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
1237 
1238 	if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) {
1239 		return NM_NEED_BWRAP;
1240 	}
1241 	na->na_vp = vpna;
1242 	strlcpy(na->name, name, sizeof(na->name));
1243 	na->na_hostvp = NULL;
1244 	return 0;
1245 }
1246 
/*
 * nm_krings_create callback for VALE bwrap adapters.
 *
 * First creates the krings as a netmap_vp_adapter would, then runs the
 * common bwrap creation step. If the second step fails the first one is
 * undone. Returns 0 on success or the error of the failing step.
 */
static int
netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
{
	/* impersonate a netmap_vp_adapter */
	int error = netmap_vale_vp_krings_create(na);

	if (error == 0) {
		error = netmap_bwrap_krings_create_common(na);
		if (error != 0)
			netmap_vale_vp_krings_delete(na);
	}

	return error;
}
1262 
/*
 * nm_krings_delete callback for VALE bwrap adapters.
 *
 * Runs the two deletion steps in the reverse order of
 * netmap_vale_bwrap_krings_create(): the common bwrap part first,
 * then the VALE port part.
 */
static void
netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
{
	/* bwrap-specific state goes first ... */
	netmap_bwrap_krings_delete_common(na);

	/* ... then the krings created by the vp routine */
	netmap_vale_vp_krings_delete(na);
}
1269 
1270 static int
1271 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
1272 {
1273 	struct netmap_bwrap_adapter *bna;
1274 	struct netmap_adapter *na = NULL;
1275 	struct netmap_adapter *hostna = NULL;
1276 	int error;
1277 
1278 	bna = nm_os_malloc(sizeof(*bna));
1279 	if (bna == NULL) {
1280 		return ENOMEM;
1281 	}
1282 	na = &bna->up.up;
1283 	strlcpy(na->name, nr_name, sizeof(na->name));
1284 	na->nm_register = netmap_bwrap_reg;
1285 	na->nm_txsync = netmap_vale_vp_txsync;
1286 	// na->nm_rxsync = netmap_bwrap_rxsync;
1287 	na->nm_krings_create = netmap_vale_bwrap_krings_create;
1288 	na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
1289 	na->nm_notify = netmap_bwrap_notify;
1290 	bna->nm_intr_notify = netmap_bwrap_intr_notify;
1291 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
1292 	/* Set the mfs, needed on the VALE mismatch datapath. */
1293 	bna->up.mfs = NM_BDG_MFS_DEFAULT;
1294 
1295 	if (hwna->na_flags & NAF_HOST_RINGS) {
1296 		hostna = &bna->host.up;
1297 		hostna->nm_notify = netmap_bwrap_notify;
1298 		bna->host.mfs = NM_BDG_MFS_DEFAULT;
1299 	}
1300 
1301 	error = netmap_bwrap_attach_common(na, hwna);
1302 	if (error) {
1303 		nm_os_free(bna);
1304 	}
1305 	return error;
1306 }
1307 
1308 int
1309 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
1310 		struct netmap_mem_d *nmd, int create)
1311 {
1312 	return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
1313 }
1314 
1315 
1316 /* creates a persistent VALE port */
1317 int
1318 nm_vi_create(struct nmreq_header *hdr)
1319 {
1320 	struct nmreq_vale_newif *req =
1321 		(struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
1322 	int error = 0;
1323 	/* Build a nmreq_register out of the nmreq_vale_newif,
1324 	 * so that we can call netmap_get_bdg_na(). */
1325 	struct nmreq_register regreq;
1326 	bzero(&regreq, sizeof(regreq));
1327 	regreq.nr_tx_slots = req->nr_tx_slots;
1328 	regreq.nr_rx_slots = req->nr_rx_slots;
1329 	regreq.nr_tx_rings = req->nr_tx_rings;
1330 	regreq.nr_rx_rings = req->nr_rx_rings;
1331 	regreq.nr_mem_id = req->nr_mem_id;
1332 	hdr->nr_reqtype = NETMAP_REQ_REGISTER;
1333 	hdr->nr_body = (uintptr_t)&regreq;
1334 	error = netmap_vi_create(hdr, 0 /* no autodelete */);
1335 	hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
1336 	hdr->nr_body = (uintptr_t)req;
1337 	/* Write back to the original struct. */
1338 	req->nr_tx_slots = regreq.nr_tx_slots;
1339 	req->nr_rx_slots = regreq.nr_rx_slots;
1340 	req->nr_tx_rings = regreq.nr_tx_rings;
1341 	req->nr_rx_rings = regreq.nr_rx_rings;
1342 	req->nr_mem_id = regreq.nr_mem_id;
1343 	return error;
1344 }
1345 
1346 /* remove a persistent VALE port from the system */
1347 int
1348 nm_vi_destroy(const char *name)
1349 {
1350 	struct ifnet *ifp;
1351 	struct netmap_vp_adapter *vpna;
1352 	int error;
1353 
1354 	ifp = ifunit_ref(name);
1355 	if (!ifp)
1356 		return ENXIO;
1357 	NMG_LOCK();
1358 	/* make sure this is actually a VALE port */
1359 	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
1360 		error = EINVAL;
1361 		goto err;
1362 	}
1363 
1364 	vpna = (struct netmap_vp_adapter *)NA(ifp);
1365 
1366 	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
1367 	if (vpna->autodelete) {
1368 		error = EINVAL;
1369 		goto err;
1370 	}
1371 
1372 	/* also make sure that nobody is using the interface */
1373 	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
1374 	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
1375 		error = EBUSY;
1376 		goto err;
1377 	}
1378 
1379 	NMG_UNLOCK();
1380 
1381 	if (netmap_verbose)
1382 		nm_prinf("destroying a persistent vale interface %s", ifp->if_xname);
1383 	/* Linux requires all the references are released
1384 	 * before unregister
1385 	 */
1386 	netmap_detach(ifp);
1387 	if_rele(ifp);
1388 	nm_os_vi_detach(ifp);
1389 	return 0;
1390 
1391 err:
1392 	NMG_UNLOCK();
1393 	if_rele(ifp);
1394 	return error;
1395 }
1396 
1397 static int
1398 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
1399 {
1400 	req->nr_rx_rings = na->num_rx_rings;
1401 	req->nr_tx_rings = na->num_tx_rings;
1402 	req->nr_rx_slots = na->num_rx_desc;
1403 	req->nr_tx_slots = na->num_tx_desc;
1404 	return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
1405 					&req->nr_mem_id);
1406 }
1407 
1408 
1409 /*
1410  * Create a virtual interface registered to the system.
1411  * The interface will be attached to a bridge later.
1412  */
1413 int
1414 netmap_vi_create(struct nmreq_header *hdr, int autodelete)
1415 {
1416 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1417 	struct ifnet *ifp;
1418 	struct netmap_vp_adapter *vpna;
1419 	struct netmap_mem_d *nmd = NULL;
1420 	int error;
1421 
1422 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1423 		return EINVAL;
1424 	}
1425 
1426 	/* don't include VALE prefix */
1427 	if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
1428 		return EINVAL;
1429 	if (strlen(hdr->nr_name) >= IFNAMSIZ) {
1430 		return EINVAL;
1431 	}
1432 	ifp = ifunit_ref(hdr->nr_name);
1433 	if (ifp) { /* already exist, cannot create new one */
1434 		error = EEXIST;
1435 		NMG_LOCK();
1436 		if (NM_NA_VALID(ifp)) {
1437 			int update_err = nm_update_info(req, NA(ifp));
1438 			if (update_err)
1439 				error = update_err;
1440 		}
1441 		NMG_UNLOCK();
1442 		if_rele(ifp);
1443 		return error;
1444 	}
1445 	error = nm_os_vi_persist(hdr->nr_name, &ifp);
1446 	if (error)
1447 		return error;
1448 
1449 	NMG_LOCK();
1450 	if (req->nr_mem_id) {
1451 		nmd = netmap_mem_find(req->nr_mem_id);
1452 		if (nmd == NULL) {
1453 			error = EINVAL;
1454 			goto err_1;
1455 		}
1456 	}
1457 	/* netmap_vp_create creates a struct netmap_vp_adapter */
1458 	error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna);
1459 	if (error) {
1460 		if (netmap_debug & NM_DEBUG_VALE)
1461 			nm_prerr("error %d", error);
1462 		goto err_1;
1463 	}
1464 	/* persist-specific routines */
1465 	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
1466 	if (!autodelete) {
1467 		netmap_adapter_get(&vpna->up);
1468 	} else {
1469 		vpna->autodelete = 1;
1470 	}
1471 	NM_ATTACH_NA(ifp, &vpna->up);
1472 	/* return the updated info */
1473 	error = nm_update_info(req, &vpna->up);
1474 	if (error) {
1475 		goto err_2;
1476 	}
1477 	nm_prdis("returning nr_mem_id %d", req->nr_mem_id);
1478 	if (nmd)
1479 		netmap_mem_put(nmd);
1480 	NMG_UNLOCK();
1481 	nm_prdis("created %s", ifp->if_xname);
1482 	return 0;
1483 
1484 err_2:
1485 	netmap_detach(ifp);
1486 err_1:
1487 	if (nmd)
1488 		netmap_mem_put(nmd);
1489 	NMG_UNLOCK();
1490 	nm_os_vi_detach(ifp);
1491 
1492 	return error;
1493 }
1494 
1495 #endif /* WITH_VALE */
1496