xref: /freebsd/sys/dev/netmap/netmap_bdg.c (revision d0b2dbfa)
1 /*
2  * Copyright (C) 2013-2016 Universita` di Pisa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 
28 /*
29  * This module implements the VALE switch for netmap
30 
31 --- VALE SWITCH ---
32 
33 NMG_LOCK() serializes all modifications to switches and ports.
34 A switch cannot be deleted until all ports are gone.
35 
For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after acquiring NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.
43 
On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)
50 
51  */
52 
53 /*
54  * OS-specific code that is used only within this file.
55  * Other OS-specific code that must be accessed by drivers
56  * is present in netmap_kern.h
57  */
58 
59 #if defined(__FreeBSD__)
60 #include <sys/cdefs.h> /* prerequisite */
61 #include <sys/types.h>
62 #include <sys/errno.h>
63 #include <sys/param.h>	/* defines used in kernel.h */
64 #include <sys/kernel.h>	/* types used in module initialization */
65 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
66 #include <sys/sockio.h>
67 #include <sys/socketvar.h>	/* struct socket */
68 #include <sys/malloc.h>
69 #include <sys/poll.h>
70 #include <sys/rwlock.h>
71 #include <sys/socket.h> /* sockaddrs */
72 #include <sys/selinfo.h>
73 #include <sys/sysctl.h>
74 #include <net/if.h>
75 #include <net/if_var.h>
76 #include <net/bpf.h>		/* BIOCIMMEDIATE */
77 #include <machine/bus.h>	/* bus_dmamap_* */
78 #include <sys/endian.h>
79 #include <sys/refcount.h>
80 #include <sys/smp.h>
81 
82 
83 #elif defined(linux)
84 
85 #include "bsd_glue.h"
86 
87 #elif defined(__APPLE__)
88 
89 #warning OSX support is only partial
90 #include "osx_glue.h"
91 
92 #elif defined(_WIN32)
93 #include "win_glue.h"
94 
95 #else
96 
97 #error	Unsupported platform
98 
99 #endif /* unsupported */
100 
101 /*
102  * common headers
103  */
104 
105 #include <net/netmap.h>
106 #include <dev/netmap/netmap_kern.h>
107 #include <dev/netmap/netmap_mem2.h>
108 
109 #include <dev/netmap/netmap_bdg.h>
110 
111 const char*
112 netmap_bdg_name(struct netmap_vp_adapter *vp)
113 {
114 	struct nm_bridge *b = vp->na_bdg;
115 	if (b == NULL)
116 		return NULL;
117 	return b->bdg_basename;
118 }
119 
120 
#ifndef CONFIG_NET_NS
/*
 * Global table of bridges, used when network namespaces
 * (CONFIG_NET_NS) are not compiled in.
 * XXX in principle the bridges could be created dynamically;
 * for now the table is fixed and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */
129 
130 
/* Return 1 if 'c' is an ASCII letter, a decimal digit or '_',
 * 0 otherwise.
 */
static int
nm_is_id_char(const char c)
{
	if (c == '_')
		return 1;
	if (c >= '0' && c <= '9')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	return (c >= 'a' && c <= 'z');
}
139 
140 /* Validate the name of a bdg port and return the
141  * position of the ":" character. */
142 static int
143 nm_bdg_name_validate(const char *name, size_t prefixlen)
144 {
145 	int colon_pos = -1;
146 	int i;
147 
148 	if (!name || strlen(name) < prefixlen) {
149 		return -1;
150 	}
151 
152 	for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
153 		if (name[i] == ':') {
154 			colon_pos = i;
155 			break;
156 		} else if (!nm_is_id_char(name[i])) {
157 			return -1;
158 		}
159 	}
160 
161 	if (strlen(name) - colon_pos > IFNAMSIZ) {
162 		/* interface name too long */
163 		return -1;
164 	}
165 
166 	return colon_pos;
167 }
168 
169 /*
170  * locate a bridge among the existing ones.
171  * MUST BE CALLED WITH NMG_LOCK()
172  *
173  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
174  * We assume that this is called with a name of at least NM_NAME chars.
175  */
176 struct nm_bridge *
177 nm_find_bridge(const char *name, int create, struct netmap_bdg_ops *ops)
178 {
179 	int i, namelen;
180 	struct nm_bridge *b = NULL, *bridges;
181 	u_int num_bridges;
182 
183 	NMG_LOCK_ASSERT();
184 
185 	netmap_bns_getbridges(&bridges, &num_bridges);
186 
187 	namelen = nm_bdg_name_validate(name,
188 			(ops != NULL ? strlen(ops->name) : 0));
189 	if (namelen < 0) {
190 		nm_prerr("invalid bridge name %s", name ? name : NULL);
191 		return NULL;
192 	}
193 
194 	/* lookup the name, remember empty slot if there is one */
195 	for (i = 0; i < num_bridges; i++) {
196 		struct nm_bridge *x = bridges + i;
197 
198 		if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
199 			if (create && b == NULL)
200 				b = x;	/* record empty slot */
201 		} else if (x->bdg_namelen != namelen) {
202 			continue;
203 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
204 			nm_prdis("found '%.*s' at %d", namelen, name, i);
205 			b = x;
206 			break;
207 		}
208 	}
209 	if (i == num_bridges && b) { /* name not found, can create entry */
210 		/* initialize the bridge */
211 		nm_prdis("create new bridge %s with ports %d", b->bdg_basename,
212 			b->bdg_active_ports);
213 		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
214 		if (b->ht == NULL) {
215 			nm_prerr("failed to allocate hash table");
216 			return NULL;
217 		}
218 		strncpy(b->bdg_basename, name, namelen);
219 		b->bdg_namelen = namelen;
220 		b->bdg_active_ports = 0;
221 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
222 			b->bdg_port_index[i] = i;
223 		/* set the default function */
224 		b->bdg_ops = b->bdg_saved_ops = *ops;
225 		b->private_data = b->ht;
226 		b->bdg_flags = 0;
227 		NM_BNS_GET(b);
228 	}
229 	return b;
230 }
231 
232 
233 int
234 netmap_bdg_free(struct nm_bridge *b)
235 {
236 	if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
237 		return EBUSY;
238 	}
239 
240 	nm_prdis("marking bridge %s as free", b->bdg_basename);
241 	nm_os_free(b->ht);
242 	memset(&b->bdg_ops, 0, sizeof(b->bdg_ops));
243 	memset(&b->bdg_saved_ops, 0, sizeof(b->bdg_saved_ops));
244 	b->bdg_flags = 0;
245 	NM_BNS_PUT(b);
246 	return 0;
247 }
248 
249 /* Called by external kernel modules (e.g., Openvswitch).
250  * to modify the private data previously given to regops().
251  * 'name' may be just bridge's name (including ':' if it
252  * is not just NM_BDG_NAME).
253  * Called without NMG_LOCK.
254  */
255 int
256 netmap_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
257 	void *callback_data, void *auth_token)
258 {
259 	void *private_data = NULL;
260 	struct nm_bridge *b;
261 	int error = 0;
262 
263 	NMG_LOCK();
264 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
265 	if (!b) {
266 		error = EINVAL;
267 		goto unlock_update_priv;
268 	}
269 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
270 		error = EACCES;
271 		goto unlock_update_priv;
272 	}
273 	BDG_WLOCK(b);
274 	private_data = callback(b->private_data, callback_data, &error);
275 	b->private_data = private_data;
276 	BDG_WUNLOCK(b);
277 
278 unlock_update_priv:
279 	NMG_UNLOCK();
280 	return error;
281 }
282 
283 
284 
285 /* remove from bridge b the ports in slots hw and sw
286  * (sw can be -1 if not needed)
287  */
288 void
289 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
290 {
291 	int s_hw = hw, s_sw = sw;
292 	int i, lim =b->bdg_active_ports;
293 	uint32_t *tmp = b->tmp_bdg_port_index;
294 
295 	/*
296 	New algorithm:
297 	make a copy of bdg_port_index;
298 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
299 	in the array of bdg_port_index, replacing them with
300 	entries from the bottom of the array;
301 	decrement bdg_active_ports;
302 	acquire BDG_WLOCK() and copy back the array.
303 	 */
304 
305 	if (netmap_debug & NM_DEBUG_BDG)
306 		nm_prinf("detach %d and %d (lim %d)", hw, sw, lim);
307 	/* make a copy of the list of active ports, update it,
308 	 * and then copy back within BDG_WLOCK().
309 	 */
310 	memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
311 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
312 		if (hw >= 0 && tmp[i] == hw) {
313 			nm_prdis("detach hw %d at %d", hw, i);
314 			lim--; /* point to last active port */
315 			tmp[i] = tmp[lim]; /* swap with i */
316 			tmp[lim] = hw;	/* now this is inactive */
317 			hw = -1;
318 		} else if (sw >= 0 && tmp[i] == sw) {
319 			nm_prdis("detach sw %d at %d", sw, i);
320 			lim--;
321 			tmp[i] = tmp[lim];
322 			tmp[lim] = sw;
323 			sw = -1;
324 		} else {
325 			i++;
326 		}
327 	}
328 	if (hw >= 0 || sw >= 0) {
329 		nm_prerr("delete failed hw %d sw %d, should panic...", hw, sw);
330 	}
331 
332 	BDG_WLOCK(b);
333 	if (b->bdg_ops.dtor)
334 		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
335 	b->bdg_ports[s_hw] = NULL;
336 	if (s_sw >= 0) {
337 		b->bdg_ports[s_sw] = NULL;
338 	}
339 	memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
340 	b->bdg_active_ports = lim;
341 	BDG_WUNLOCK(b);
342 
343 	nm_prdis("now %d active ports", lim);
344 	netmap_bdg_free(b);
345 }
346 
347 
348 /* nm_bdg_ctl callback for VALE ports */
349 int
350 netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
351 {
352 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
353 	struct nm_bridge *b = vpna->na_bdg;
354 
355 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
356 		return 0; /* nothing to do */
357 	}
358 	if (b) {
359 		netmap_set_all_rings(na, 0 /* disable */);
360 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
361 		vpna->na_bdg = NULL;
362 		netmap_set_all_rings(na, 1 /* enable */);
363 	}
364 	/* I have took reference just for attach */
365 	netmap_adapter_put(na);
366 	return 0;
367 }
368 
369 int
370 netmap_default_bdg_attach(const char *name, struct netmap_adapter *na,
371 		struct nm_bridge *b)
372 {
373 	return NM_NEED_BWRAP;
374 }
375 
376 /* Try to get a reference to a netmap adapter attached to a VALE switch.
377  * If the adapter is found (or is created), this function returns 0, a
378  * non NULL pointer is returned into *na, and the caller holds a
379  * reference to the adapter.
380  * If an adapter is not found, then no reference is grabbed and the
381  * function returns an error code, or 0 if there is just a VALE prefix
382  * mismatch. Therefore the caller holds a reference when
383  * (*na != NULL && return == 0).
384  */
385 int
386 netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
387 	struct netmap_mem_d *nmd, int create, struct netmap_bdg_ops *ops)
388 {
389 	char *nr_name = hdr->nr_name;
390 	const char *ifname;
391 	if_t ifp = NULL;
392 	int error = 0;
393 	struct netmap_vp_adapter *vpna, *hostna = NULL;
394 	struct nm_bridge *b;
395 	uint32_t i, j;
396 	uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
397 	int needed;
398 
399 	*na = NULL;     /* default return value */
400 
401 	/* first try to see if this is a bridge port. */
402 	NMG_LOCK_ASSERT();
403 	if (strncmp(nr_name, ops->name, strlen(ops->name) - 1)) {
404 		return 0;  /* no error, but no VALE prefix */
405 	}
406 
407 	b = nm_find_bridge(nr_name, create, ops);
408 	if (b == NULL) {
409 		nm_prdis("no bridges available for '%s'", nr_name);
410 		return (create ? ENOMEM : ENXIO);
411 	}
412 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
413 		panic("x");
414 
415 	/* Now we are sure that name starts with the bridge's name,
416 	 * lookup the port in the bridge. We need to scan the entire
417 	 * list. It is not important to hold a WLOCK on the bridge
418 	 * during the search because NMG_LOCK already guarantees
419 	 * that there are no other possible writers.
420 	 */
421 
422 	/* lookup in the local list of ports */
423 	for (j = 0; j < b->bdg_active_ports; j++) {
424 		i = b->bdg_port_index[j];
425 		vpna = b->bdg_ports[i];
426 		nm_prdis("checking %s", vpna->up.name);
427 		if (!strcmp(vpna->up.name, nr_name)) {
428 			netmap_adapter_get(&vpna->up);
429 			nm_prdis("found existing if %s refs %d", nr_name)
430 			*na = &vpna->up;
431 			return 0;
432 		}
433 	}
434 	/* not found, should we create it? */
435 	if (!create)
436 		return ENXIO;
437 	/* yes we should, see if we have space to attach entries */
438 	needed = 2; /* in some cases we only need 1 */
439 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
440 		nm_prerr("bridge full %d, cannot create new port", b->bdg_active_ports);
441 		return ENOMEM;
442 	}
443 	/* record the next two ports available, but do not allocate yet */
444 	cand = b->bdg_port_index[b->bdg_active_ports];
445 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
446 	nm_prdis("+++ bridge %s port %s used %d avail %d %d",
447 		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
448 
449 	/*
450 	 * try see if there is a matching NIC with this name
451 	 * (after the bridge's name)
452 	 */
453 	ifname = nr_name + b->bdg_namelen + 1;
454 	ifp = ifunit_ref(ifname);
455 	if (!ifp) {
456 		/* Create an ephemeral virtual port.
457 		 * This block contains all the ephemeral-specific logic.
458 		 */
459 
460 		if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
461 			error = EINVAL;
462 			goto out;
463 		}
464 
465 		/* bdg_netmap_attach creates a struct netmap_adapter */
466 		error = b->bdg_ops.vp_create(hdr, NULL, nmd, &vpna);
467 		if (error) {
468 			if (netmap_debug & NM_DEBUG_BDG)
469 				nm_prerr("error %d", error);
470 			goto out;
471 		}
472 		/* shortcut - we can skip get_hw_na(),
473 		 * ownership check and nm_bdg_attach()
474 		 */
475 
476 	} else {
477 		struct netmap_adapter *hw;
478 
479 		/* the vale:nic syntax is only valid for some commands */
480 		switch (hdr->nr_reqtype) {
481 		case NETMAP_REQ_VALE_ATTACH:
482 		case NETMAP_REQ_VALE_DETACH:
483 		case NETMAP_REQ_VALE_POLLING_ENABLE:
484 		case NETMAP_REQ_VALE_POLLING_DISABLE:
485 			break; /* ok */
486 		default:
487 			error = EINVAL;
488 			goto out;
489 		}
490 
491 		error = netmap_get_hw_na(ifp, nmd, &hw);
492 		if (error || hw == NULL)
493 			goto out;
494 
495 		/* host adapter might not be created */
496 		error = hw->nm_bdg_attach(nr_name, hw, b);
497 		if (error == NM_NEED_BWRAP) {
498 			error = b->bdg_ops.bwrap_attach(nr_name, hw);
499 		}
500 		if (error)
501 			goto out;
502 		vpna = hw->na_vp;
503 		hostna = hw->na_hostvp;
504 		if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
505 			/* Check if we need to skip the host rings. */
506 			struct nmreq_vale_attach *areq =
507 				(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
508 			if (areq->reg.nr_mode != NR_REG_NIC_SW) {
509 				hostna = NULL;
510 			}
511 		}
512 	}
513 
514 	BDG_WLOCK(b);
515 	vpna->bdg_port = cand;
516 	nm_prdis("NIC  %p to bridge port %d", vpna, cand);
517 	/* bind the port to the bridge (virtual ports are not active) */
518 	b->bdg_ports[cand] = vpna;
519 	vpna->na_bdg = b;
520 	b->bdg_active_ports++;
521 	if (hostna != NULL) {
522 		/* also bind the host stack to the bridge */
523 		b->bdg_ports[cand2] = hostna;
524 		hostna->bdg_port = cand2;
525 		hostna->na_bdg = b;
526 		b->bdg_active_ports++;
527 		nm_prdis("host %p to bridge port %d", hostna, cand2);
528 	}
529 	nm_prdis("if %s refs %d", ifname, vpna->up.na_refcount);
530 	BDG_WUNLOCK(b);
531 	*na = &vpna->up;
532 	netmap_adapter_get(*na);
533 
534 out:
535 	if (ifp)
536 		if_rele(ifp);
537 
538 	return error;
539 }
540 
541 /* Process NETMAP_REQ_VALE_ATTACH.
542  */
543 int
544 netmap_bdg_attach(struct nmreq_header *hdr, void *auth_token)
545 {
546 	struct nmreq_vale_attach *req =
547 		(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
548 	struct netmap_vp_adapter * vpna;
549 	struct netmap_adapter *na = NULL;
550 	struct netmap_mem_d *nmd = NULL;
551 	struct nm_bridge *b = NULL;
552 	int error;
553 
554 	NMG_LOCK();
555 	/* permission check for modified bridges */
556 	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
557 	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
558 		error = EACCES;
559 		goto unlock_exit;
560 	}
561 
562 	if (req->reg.nr_mem_id) {
563 		nmd = netmap_mem_find(req->reg.nr_mem_id);
564 		if (nmd == NULL) {
565 			error = EINVAL;
566 			goto unlock_exit;
567 		}
568 	}
569 
570 	/* check for existing one */
571 	error = netmap_get_vale_na(hdr, &na, nmd, 0);
572 	if (na) {
573 		error = EBUSY;
574 		goto unref_exit;
575 	}
576 	error = netmap_get_vale_na(hdr, &na,
577 				nmd, 1 /* create if not exists */);
578 	if (error) { /* no device */
579 		goto unlock_exit;
580 	}
581 
582 	if (na == NULL) { /* VALE prefix missing */
583 		error = EINVAL;
584 		goto unlock_exit;
585 	}
586 
587 	if (NETMAP_OWNED_BY_ANY(na)) {
588 		error = EBUSY;
589 		goto unref_exit;
590 	}
591 
592 	if (na->nm_bdg_ctl) {
593 		/* nop for VALE ports. The bwrap needs to put the hwna
594 		 * in netmap mode (see netmap_bwrap_bdg_ctl)
595 		 */
596 		error = na->nm_bdg_ctl(hdr, na);
597 		if (error)
598 			goto unref_exit;
599 		nm_prdis("registered %s to netmap-mode", na->name);
600 	}
601 	vpna = (struct netmap_vp_adapter *)na;
602 	req->port_index = vpna->bdg_port;
603 
604 	if (nmd)
605 		netmap_mem_put(nmd);
606 
607 	NMG_UNLOCK();
608 	return 0;
609 
610 unref_exit:
611 	netmap_adapter_put(na);
612 unlock_exit:
613 	if (nmd)
614 		netmap_mem_put(nmd);
615 
616 	NMG_UNLOCK();
617 	return error;
618 }
619 
620 
621 int
622 nm_is_bwrap(struct netmap_adapter *na)
623 {
624 	return na->nm_register == netmap_bwrap_reg;
625 }
626 
/* Process NETMAP_REQ_VALE_DETACH.
 * Wrapper that runs netmap_bdg_detach_locked() with NMG_LOCK held and
 * returns its result.
 */
int
netmap_bdg_detach(struct nmreq_header *hdr, void *auth_token)
{
	int rc;

	NMG_LOCK();
	rc = netmap_bdg_detach_locked(hdr, auth_token);
	NMG_UNLOCK();

	return rc;
}
639 
640 int
641 netmap_bdg_detach_locked(struct nmreq_header *hdr, void *auth_token)
642 {
643 	struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
644 	struct netmap_vp_adapter *vpna;
645 	struct netmap_adapter *na;
646 	struct nm_bridge *b = NULL;
647 	int error;
648 
649 	/* permission check for modified bridges */
650 	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
651 	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
652 		error = EACCES;
653 		goto error_exit;
654 	}
655 
656 	error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */);
657 	if (error) { /* no device, or another bridge or user owns the device */
658 		goto error_exit;
659 	}
660 
661 	if (na == NULL) { /* VALE prefix missing */
662 		error = EINVAL;
663 		goto error_exit;
664 	} else if (nm_is_bwrap(na) &&
665 		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
666 		/* Don't detach a NIC with polling */
667 		error = EBUSY;
668 		goto unref_exit;
669 	}
670 
671 	vpna = (struct netmap_vp_adapter *)na;
672 	if (na->na_vp != vpna) {
673 		/* trying to detach first attach of VALE persistent port attached
674 		 * to 2 bridges
675 		 */
676 		error = EBUSY;
677 		goto unref_exit;
678 	}
679 	nmreq_det->port_index = vpna->bdg_port;
680 
681 	if (na->nm_bdg_ctl) {
682 		/* remove the port from bridge. The bwrap
683 		 * also needs to put the hwna in normal mode
684 		 */
685 		error = na->nm_bdg_ctl(hdr, na);
686 	}
687 
688 unref_exit:
689 	netmap_adapter_put(na);
690 error_exit:
691 	return error;
692 
693 }
694 
695 
696 struct nm_bdg_polling_state;
697 struct
698 nm_bdg_kthread {
699 	struct nm_kctx *nmk;
700 	u_int qfirst;
701 	u_int qlast;
702 	struct nm_bdg_polling_state *bps;
703 };
704 
705 struct nm_bdg_polling_state {
706 	bool configured;
707 	bool stopped;
708 	struct netmap_bwrap_adapter *bna;
709 	uint32_t mode;
710 	u_int qfirst;
711 	u_int qlast;
712 	u_int cpu_from;
713 	u_int ncpus;
714 	struct nm_bdg_kthread *kthreads;
715 };
716 
717 static void
718 netmap_bwrap_polling(void *data)
719 {
720 	struct nm_bdg_kthread *nbk = data;
721 	struct netmap_bwrap_adapter *bna;
722 	u_int qfirst, qlast, i;
723 	struct netmap_kring **kring0, *kring;
724 
725 	if (!nbk)
726 		return;
727 	qfirst = nbk->qfirst;
728 	qlast = nbk->qlast;
729 	bna = nbk->bps->bna;
730 	kring0 = NMR(bna->hwna, NR_RX);
731 
732 	for (i = qfirst; i < qlast; i++) {
733 		kring = kring0[i];
734 		kring->nm_notify(kring, 0);
735 	}
736 }
737 
738 static int
739 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
740 {
741 	struct nm_kctx_cfg kcfg;
742 	int i, j;
743 
744 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
745 	if (bps->kthreads == NULL)
746 		return ENOMEM;
747 
748 	bzero(&kcfg, sizeof(kcfg));
749 	kcfg.worker_fn = netmap_bwrap_polling;
750 	for (i = 0; i < bps->ncpus; i++) {
751 		struct nm_bdg_kthread *t = bps->kthreads + i;
752 		int all = (bps->ncpus == 1 &&
753 			bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
754 		int affinity = bps->cpu_from + i;
755 
756 		t->bps = bps;
757 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
758 		t->qlast = all ? bps->qlast : t->qfirst + 1;
759 		if (netmap_verbose)
760 			nm_prinf("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
761 				t->qlast);
762 
763 		kcfg.type = i;
764 		kcfg.worker_private = t;
765 		t->nmk = nm_os_kctx_create(&kcfg, NULL);
766 		if (t->nmk == NULL) {
767 			goto cleanup;
768 		}
769 		nm_os_kctx_worker_setaff(t->nmk, affinity);
770 	}
771 	return 0;
772 
773 cleanup:
774 	for (j = 0; j < i; j++) {
775 		struct nm_bdg_kthread *t = bps->kthreads + i;
776 		nm_os_kctx_destroy(t->nmk);
777 	}
778 	nm_os_free(bps->kthreads);
779 	return EFAULT;
780 }
781 
782 /* A variant of ptnetmap_start_kthreads() */
783 static int
784 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
785 {
786 	int error, i, j;
787 
788 	if (!bps) {
789 		nm_prerr("polling is not configured");
790 		return EFAULT;
791 	}
792 	bps->stopped = false;
793 
794 	for (i = 0; i < bps->ncpus; i++) {
795 		struct nm_bdg_kthread *t = bps->kthreads + i;
796 		error = nm_os_kctx_worker_start(t->nmk);
797 		if (error) {
798 			nm_prerr("error in nm_kthread_start(): %d", error);
799 			goto cleanup;
800 		}
801 	}
802 	return 0;
803 
804 cleanup:
805 	for (j = 0; j < i; j++) {
806 		struct nm_bdg_kthread *t = bps->kthreads + i;
807 		nm_os_kctx_worker_stop(t->nmk);
808 	}
809 	bps->stopped = true;
810 	return error;
811 }
812 
813 static void
814 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
815 {
816 	int i;
817 
818 	if (!bps)
819 		return;
820 
821 	for (i = 0; i < bps->ncpus; i++) {
822 		struct nm_bdg_kthread *t = bps->kthreads + i;
823 		nm_os_kctx_worker_stop(t->nmk);
824 		nm_os_kctx_destroy(t->nmk);
825 	}
826 	bps->stopped = true;
827 }
828 
829 static int
830 get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
831 		struct nm_bdg_polling_state *bps)
832 {
833 	unsigned int avail_cpus, core_from;
834 	unsigned int qfirst, qlast;
835 	uint32_t i = req->nr_first_cpu_id;
836 	uint32_t req_cpus = req->nr_num_polling_cpus;
837 
838 	avail_cpus = nm_os_ncpus();
839 
840 	if (req_cpus == 0) {
841 		nm_prerr("req_cpus must be > 0");
842 		return EINVAL;
843 	} else if (req_cpus >= avail_cpus) {
844 		nm_prerr("Cannot use all the CPUs in the system");
845 		return EINVAL;
846 	}
847 
848 	if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
849 		/* Use a separate core for each ring. If nr_num_polling_cpus>1
850 		 * more consecutive rings are polled.
851 		 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
852 		 * ring 2 and 3 are polled by core 2 and 3, respectively. */
853 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
854 			nm_prerr("Rings %u-%u not in range (have %d rings)",
855 				i, i + req_cpus, nma_get_nrings(na, NR_RX));
856 			return EINVAL;
857 		}
858 		qfirst = i;
859 		qlast = qfirst + req_cpus;
860 		core_from = qfirst;
861 
862 	} else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
863 		/* Poll all the rings using a core specified by nr_first_cpu_id.
864 		 * the number of cores must be 1. */
865 		if (req_cpus != 1) {
866 			nm_prerr("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
867 				"(was %d)", req_cpus);
868 			return EINVAL;
869 		}
870 		qfirst = 0;
871 		qlast = nma_get_nrings(na, NR_RX);
872 		core_from = i;
873 	} else {
874 		nm_prerr("Invalid polling mode");
875 		return EINVAL;
876 	}
877 
878 	bps->mode = req->nr_mode;
879 	bps->qfirst = qfirst;
880 	bps->qlast = qlast;
881 	bps->cpu_from = core_from;
882 	bps->ncpus = req_cpus;
883 	nm_prinf("%s qfirst %u qlast %u cpu_from %u ncpus %u",
884 		req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
885 		"MULTI" : "SINGLE",
886 		qfirst, qlast, core_from, req_cpus);
887 	return 0;
888 }
889 
890 static int
891 nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na)
892 {
893 	struct nm_bdg_polling_state *bps;
894 	struct netmap_bwrap_adapter *bna;
895 	int error;
896 
897 	bna = (struct netmap_bwrap_adapter *)na;
898 	if (bna->na_polling_state) {
899 		nm_prerr("ERROR adapter already in polling mode");
900 		return EFAULT;
901 	}
902 
903 	bps = nm_os_malloc(sizeof(*bps));
904 	if (!bps)
905 		return ENOMEM;
906 	bps->configured = false;
907 	bps->stopped = true;
908 
909 	if (get_polling_cfg(req, na, bps)) {
910 		nm_os_free(bps);
911 		return EINVAL;
912 	}
913 
914 	if (nm_bdg_create_kthreads(bps)) {
915 		nm_os_free(bps);
916 		return EFAULT;
917 	}
918 
919 	bps->configured = true;
920 	bna->na_polling_state = bps;
921 	bps->bna = bna;
922 
923 	/* disable interrupts if possible */
924 	nma_intr_enable(bna->hwna, 0);
925 	/* start kthread now */
926 	error = nm_bdg_polling_start_kthreads(bps);
927 	if (error) {
928 		nm_prerr("ERROR nm_bdg_polling_start_kthread()");
929 		nm_os_free(bps->kthreads);
930 		nm_os_free(bps);
931 		bna->na_polling_state = NULL;
932 		nma_intr_enable(bna->hwna, 1);
933 	}
934 	return error;
935 }
936 
937 static int
938 nm_bdg_ctl_polling_stop(struct netmap_adapter *na)
939 {
940 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
941 	struct nm_bdg_polling_state *bps;
942 
943 	if (!bna->na_polling_state) {
944 		nm_prerr("ERROR adapter is not in polling mode");
945 		return EFAULT;
946 	}
947 	bps = bna->na_polling_state;
948 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
949 	bps->configured = false;
950 	nm_os_free(bps);
951 	bna->na_polling_state = NULL;
952 	/* re-enable interrupts */
953 	nma_intr_enable(bna->hwna, 1);
954 	return 0;
955 }
956 
957 int
958 nm_bdg_polling(struct nmreq_header *hdr)
959 {
960 	struct nmreq_vale_polling *req =
961 		(struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body;
962 	struct netmap_adapter *na = NULL;
963 	int error = 0;
964 
965 	NMG_LOCK();
966 	error = netmap_get_vale_na(hdr, &na, NULL, /*create=*/0);
967 	if (na && !error) {
968 		if (!nm_is_bwrap(na)) {
969 			error = EOPNOTSUPP;
970 		} else if (hdr->nr_reqtype == NETMAP_BDG_POLLING_ON) {
971 			error = nm_bdg_ctl_polling_start(req, na);
972 			if (!error)
973 				netmap_adapter_get(na);
974 		} else {
975 			error = nm_bdg_ctl_polling_stop(na);
976 			if (!error)
977 				netmap_adapter_put(na);
978 		}
979 		netmap_adapter_put(na);
980 	} else if (!na && !error) {
981 		/* Not VALE port. */
982 		error = EINVAL;
983 	}
984 	NMG_UNLOCK();
985 
986 	return error;
987 }
988 
989 /* Called by external kernel modules (e.g., Openvswitch).
990  * to set configure/lookup/dtor functions of a VALE instance.
991  * Register callbacks to the given bridge. 'name' may be just
992  * bridge's name (including ':' if it is not just NM_BDG_NAME).
993  *
994  * Called without NMG_LOCK.
995  */
996 
997 int
998 netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
999 {
1000 	struct nm_bridge *b;
1001 	int error = 0;
1002 
1003 	NMG_LOCK();
1004 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
1005 	if (!b) {
1006 		error = ENXIO;
1007 		goto unlock_regops;
1008 	}
1009 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
1010 		error = EACCES;
1011 		goto unlock_regops;
1012 	}
1013 
1014 	BDG_WLOCK(b);
1015 	if (!bdg_ops) {
1016 		/* resetting the bridge */
1017 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
1018 		b->bdg_ops = b->bdg_saved_ops;
1019 		b->private_data = b->ht;
1020 	} else {
1021 		/* modifying the bridge */
1022 		b->private_data = private_data;
1023 #define nm_bdg_override(m) if (bdg_ops->m) b->bdg_ops.m = bdg_ops->m
1024 		nm_bdg_override(lookup);
1025 		nm_bdg_override(config);
1026 		nm_bdg_override(dtor);
1027 		nm_bdg_override(vp_create);
1028 		nm_bdg_override(bwrap_attach);
1029 #undef nm_bdg_override
1030 
1031 	}
1032 	BDG_WUNLOCK(b);
1033 
1034 unlock_regops:
1035 	NMG_UNLOCK();
1036 	return error;
1037 }
1038 
1039 
1040 int
1041 netmap_bdg_config(struct nm_ifreq *nr)
1042 {
1043 	struct nm_bridge *b;
1044 	int error = EINVAL;
1045 
1046 	NMG_LOCK();
1047 	b = nm_find_bridge(nr->nifr_name, 0, NULL);
1048 	if (!b) {
1049 		NMG_UNLOCK();
1050 		return error;
1051 	}
1052 	NMG_UNLOCK();
1053 	/* Don't call config() with NMG_LOCK() held */
1054 	BDG_RLOCK(b);
1055 	if (b->bdg_ops.config != NULL)
1056 		error = b->bdg_ops.config(nr);
1057 	BDG_RUNLOCK(b);
1058 	return error;
1059 }
1060 
1061 
1062 /* nm_register callback for VALE ports */
1063 int
1064 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1065 {
1066 	struct netmap_vp_adapter *vpna =
1067 		(struct netmap_vp_adapter*)na;
1068 
1069 	/* persistent ports may be put in netmap mode
1070 	 * before being attached to a bridge
1071 	 */
1072 	if (vpna->na_bdg)
1073 		BDG_WLOCK(vpna->na_bdg);
1074 	if (onoff) {
1075 		netmap_krings_mode_commit(na, onoff);
1076 		if (na->active_fds == 0)
1077 			na->na_flags |= NAF_NETMAP_ON;
1078 		 /* XXX on FreeBSD, persistent VALE ports should also
1079 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1080 		 */
1081 	} else {
1082 		if (na->active_fds == 0)
1083 			na->na_flags &= ~NAF_NETMAP_ON;
1084 		netmap_krings_mode_commit(na, onoff);
1085 	}
1086 	if (vpna->na_bdg)
1087 		BDG_WUNLOCK(vpna->na_bdg);
1088 	return 0;
1089 }
1090 
1091 
1092 /* rxsync code used by VALE ports nm_rxsync callback and also
1093  * internally by the brwap
1094  */
1095 static int
1096 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
1097 {
1098 	struct netmap_adapter *na = kring->na;
1099 	struct netmap_ring *ring = kring->ring;
1100 	u_int nm_i, lim = kring->nkr_num_slots - 1;
1101 	u_int head = kring->rhead;
1102 	int n;
1103 
1104 	if (head > lim) {
1105 		nm_prerr("ouch dangerous reset!!!");
1106 		n = netmap_ring_reinit(kring);
1107 		goto done;
1108 	}
1109 
1110 	/* First part, import newly received packets. */
1111 	/* actually nothing to do here, they are already in the kring */
1112 
1113 	/* Second part, skip past packets that userspace has released. */
1114 	nm_i = kring->nr_hwcur;
1115 	if (nm_i != head) {
1116 		/* consistency check, but nothing really important here */
1117 		for (n = 0; likely(nm_i != head); n++) {
1118 			struct netmap_slot *slot = &ring->slot[nm_i];
1119 			void *addr = NMB(na, slot);
1120 
1121 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
1122 				nm_prerr("bad buffer index %d, ignore ?",
1123 					slot->buf_idx);
1124 			}
1125 			slot->flags &= ~NS_BUF_CHANGED;
1126 			nm_i = nm_next(nm_i, lim);
1127 		}
1128 		kring->nr_hwcur = head;
1129 	}
1130 
1131 	n = 0;
1132 done:
1133 	return n;
1134 }
1135 
1136 /*
1137  * nm_rxsync callback for VALE ports
1138  * user process reading from a VALE switch.
1139  * Already protected against concurrent calls from userspace,
1140  * but we must acquire the queue's lock to protect against
1141  * writers on the same queue.
1142  */
1143 int
1144 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
1145 {
1146 	int n;
1147 
1148 	mtx_lock(&kring->q_lock);
1149 	n = netmap_vp_rxsync_locked(kring, flags);
1150 	mtx_unlock(&kring->q_lock);
1151 	return n;
1152 }
1153 
1154 int
1155 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna,
1156 		struct netmap_bdg_ops *ops)
1157 {
1158 	return ops->bwrap_attach(nr_name, hwna);
1159 }
1160 
1161 
1162 /* Bridge wrapper code (bwrap).
1163  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
1164  * VALE switch.
1165  * The main task is to swap the meaning of tx and rx rings to match the
1166  * expectations of the VALE switch code (see nm_bdg_flush).
1167  *
1168  * The bwrap works by interposing a netmap_bwrap_adapter between the
1169  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
1170  * a netmap_vp_adapter to the rest the system, but, internally, it
1171  * translates all callbacks to what the hwna expects.
1172  *
1173  * Note that we have to intercept callbacks coming from two sides:
1174  *
1175  *  - callbacks coming from the netmap module are intercepted by
1176  *    passing around the netmap_bwrap_adapter instead of the hwna
1177  *
1178  *  - callbacks coming from outside of the netmap module only know
1179  *    about the hwna. This, however, only happens in interrupt
1180  *    handlers, where only the hwna->nm_notify callback is called.
1181  *    What the bwrap does is to overwrite the hwna->nm_notify callback
1182  *    with its own netmap_bwrap_intr_notify.
1183  *    XXX This assumes that the hwna->nm_notify callback was the
1184  *    standard netmap_notify(), as it is the case for nic adapters.
1185  *    Any additional action performed by hwna->nm_notify will not be
1186  *    performed by netmap_bwrap_intr_notify.
1187  *
1188  * Additionally, the bwrap can optionally attach the host rings pair
1189  * of the wrapped adapter to a different port of the switch.
1190  */
1191 
1192 
1193 static void
1194 netmap_bwrap_dtor(struct netmap_adapter *na)
1195 {
1196 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1197 	struct netmap_adapter *hwna = bna->hwna;
1198 	struct nm_bridge *b = bna->up.na_bdg,
1199 		*bh = bna->host.na_bdg;
1200 
1201 	if (bna->host.up.nm_mem)
1202 		netmap_mem_put(bna->host.up.nm_mem);
1203 
1204 	if (b) {
1205 		netmap_bdg_detach_common(b, bna->up.bdg_port,
1206 			    (bh ? bna->host.bdg_port : -1));
1207 	}
1208 
1209 	nm_prdis("na %p", na);
1210 	na->ifp = NULL;
1211 	bna->host.up.ifp = NULL;
1212 	hwna->na_vp = bna->saved_na_vp;
1213 	hwna->na_hostvp = NULL;
1214 	hwna->na_private = NULL;
1215 	hwna->na_flags &= ~NAF_BUSY;
1216 	netmap_adapter_put(hwna);
1217 
1218 }
1219 
1220 
1221 /*
1222  * Intr callback for NICs connected to a bridge.
1223  * Simply ignore tx interrupts (maybe we could try to recover space ?)
1224  * and pass received packets from nic to the bridge.
1225  *
1226  * XXX TODO check locking: this is called from the interrupt
1227  * handler so we should make sure that the interface is not
1228  * disconnected while passing down an interrupt.
1229  *
1230  * Note, no user process can access this NIC or the host stack.
1231  * The only part of the ring that is significant are the slots,
1232  * and head/cur/tail are set from the kring as needed
1233  * (part as a receive ring, part as a transmit ring).
1234  *
1235  * callback that overwrites the hwna notify callback.
1236  * Packets come from the outside or from the host stack and are put on an
1237  * hwna rx ring.
1238  * The bridge wrapper then sends the packets through the bridge.
1239  */
1240 int
1241 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
1242 {
1243 	struct netmap_adapter *na = kring->na;
1244 	struct netmap_bwrap_adapter *bna = na->na_private;
1245 	struct netmap_kring *bkring;
1246 	struct netmap_vp_adapter *vpna = &bna->up;
1247 	u_int ring_nr = kring->ring_id;
1248 	int ret = NM_IRQ_COMPLETED;
1249 	int error;
1250 
1251 	if (netmap_debug & NM_DEBUG_RXINTR)
1252 	    nm_prinf("%s %s 0x%x", na->name, kring->name, flags);
1253 
1254 	bkring = vpna->up.tx_rings[ring_nr];
1255 
1256 	/* make sure the ring is not disabled */
1257 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
1258 		return EIO;
1259 	}
1260 
1261 	if (netmap_debug & NM_DEBUG_RXINTR)
1262 	    nm_prinf("%s head %d cur %d tail %d",  na->name,
1263 		kring->rhead, kring->rcur, kring->rtail);
1264 
1265 	/* simulate a user wakeup on the rx ring
1266 	 * fetch packets that have arrived.
1267 	 */
1268 	error = kring->nm_sync(kring, 0);
1269 	if (error)
1270 		goto put_out;
1271 	if (kring->nr_hwcur == kring->nr_hwtail) {
1272 		if (netmap_verbose)
1273 			nm_prlim(1, "interrupt with no packets on %s",
1274 				kring->name);
1275 		goto put_out;
1276 	}
1277 
1278 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
1279 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
1280 	 * to push all packets out.
1281 	 */
1282 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
1283 
1284 	bkring->nm_sync(bkring, flags);
1285 
1286 	/* mark all buffers as released on this ring */
1287 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
1288 	/* another call to actually release the buffers */
1289 	error = kring->nm_sync(kring, 0);
1290 
1291 	/* The second rxsync may have further advanced hwtail. If this happens,
1292 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
1293 	if (kring->rcur != kring->nr_hwtail) {
1294 		ret = NM_IRQ_RESCHED;
1295 	}
1296 put_out:
1297 	nm_kr_put(kring);
1298 
1299 	return error ? error : ret;
1300 }
1301 
1302 
1303 /* nm_register callback for bwrap */
1304 int
1305 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
1306 {
1307 	struct netmap_bwrap_adapter *bna =
1308 		(struct netmap_bwrap_adapter *)na;
1309 	struct netmap_adapter *hwna = bna->hwna;
1310 	struct netmap_vp_adapter *hostna = &bna->host;
1311 	int error, i;
1312 	enum txrx t;
1313 
1314 	nm_prdis("%s %s", na->name, onoff ? "on" : "off");
1315 
1316 	if (onoff) {
1317 		/* netmap_do_regif has been called on the bwrap na.
1318 		 * We need to pass the information about the
1319 		 * memory allocator down to the hwna before
1320 		 * putting it in netmap mode
1321 		 */
1322 		hwna->na_lut = na->na_lut;
1323 
1324 		if (hostna->na_bdg) {
1325 			/* if the host rings have been attached to switch,
1326 			 * we need to copy the memory allocator information
1327 			 * in the hostna also
1328 			 */
1329 			hostna->up.na_lut = na->na_lut;
1330 		}
1331 
1332 	}
1333 
1334 	/* pass down the pending ring state information */
1335 	for_rx_tx(t) {
1336 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1337 			NMR(hwna, nm_txrx_swap(t))[i]->nr_pending_mode =
1338 				NMR(na, t)[i]->nr_pending_mode;
1339 		}
1340 	}
1341 
1342 	/* forward the request to the hwna */
1343 	error = hwna->nm_register(hwna, onoff);
1344 	if (error)
1345 		return error;
1346 
1347 	/* copy up the current ring state information */
1348 	for_rx_tx(t) {
1349 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1350 			struct netmap_kring *kring = NMR(hwna, nm_txrx_swap(t))[i];
1351 			NMR(na, t)[i]->nr_mode = kring->nr_mode;
1352 		}
1353 	}
1354 
1355 	/* impersonate a netmap_vp_adapter */
1356 	netmap_vp_reg(na, onoff);
1357 	if (hostna->na_bdg)
1358 		netmap_vp_reg(&hostna->up, onoff);
1359 
1360 	if (onoff) {
1361 		u_int i;
1362 		/* intercept the hwna nm_nofify callback on the hw rings */
1363 		for (i = 0; i < hwna->num_rx_rings; i++) {
1364 			hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
1365 			hwna->rx_rings[i]->nm_notify = bna->nm_intr_notify;
1366 		}
1367 		i = hwna->num_rx_rings; /* for safety */
1368 		/* save the host ring notify unconditionally */
1369 		for (; i < netmap_real_rings(hwna, NR_RX); i++) {
1370 			hwna->rx_rings[i]->save_notify =
1371 				hwna->rx_rings[i]->nm_notify;
1372 			if (hostna->na_bdg) {
1373 				/* also intercept the host ring notify */
1374 				hwna->rx_rings[i]->nm_notify =
1375 					netmap_bwrap_intr_notify;
1376 				na->tx_rings[i]->nm_sync = na->nm_txsync;
1377 			}
1378 		}
1379 		if (na->active_fds == 0)
1380 			na->na_flags |= NAF_NETMAP_ON;
1381 	} else {
1382 		u_int i;
1383 
1384 		if (na->active_fds == 0)
1385 			na->na_flags &= ~NAF_NETMAP_ON;
1386 
1387 		/* reset all notify callbacks (including host ring) */
1388 		for (i = 0; i < netmap_all_rings(hwna, NR_RX); i++) {
1389 			hwna->rx_rings[i]->nm_notify =
1390 				hwna->rx_rings[i]->save_notify;
1391 			hwna->rx_rings[i]->save_notify = NULL;
1392 		}
1393 		hwna->na_lut.lut = NULL;
1394 		hwna->na_lut.plut = NULL;
1395 		hwna->na_lut.objtotal = 0;
1396 		hwna->na_lut.objsize = 0;
1397 
1398 		/* reset the number of host rings to default */
1399 		for_rx_tx(t) {
1400 			nma_set_host_nrings(hwna, t, 1);
1401 		}
1402 
1403 	}
1404 
1405 	return 0;
1406 }
1407 
1408 /* nm_config callback for bwrap */
1409 static int
1410 netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
1411 {
1412 	struct netmap_bwrap_adapter *bna =
1413 		(struct netmap_bwrap_adapter *)na;
1414 	struct netmap_adapter *hwna = bna->hwna;
1415 	int error;
1416 
1417 	/* cache the lut in the embedded host adapter */
1418 	error = netmap_mem_get_lut(hwna->nm_mem, &bna->host.up.na_lut);
1419 	if (error)
1420 		return error;
1421 
1422 	/* Forward the request to the hwna. It may happen that nobody
1423 	 * registered hwna yet, so netmap_mem_get_lut() may have not
1424 	 * been called yet. */
1425 	error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut);
1426 	if (error)
1427 		return error;
1428 	netmap_update_config(hwna);
1429 	/* swap the results and propagate */
1430 	info->num_tx_rings = hwna->num_rx_rings;
1431 	info->num_tx_descs = hwna->num_rx_desc;
1432 	info->num_rx_rings = hwna->num_tx_rings;
1433 	info->num_rx_descs = hwna->num_tx_desc;
1434 	info->rx_buf_maxsize = hwna->rx_buf_maxsize;
1435 
1436 	if (na->na_flags & NAF_HOST_RINGS) {
1437 		struct netmap_adapter *hostna = &bna->host.up;
1438 		enum txrx t;
1439 
1440 		/* limit the number of host rings to that of hw */
1441 		if (na->na_flags & NAF_HOST_ALL) {
1442 			hostna->num_tx_rings = nma_get_nrings(hwna, NR_RX);
1443 			hostna->num_rx_rings = nma_get_nrings(hwna, NR_TX);
1444 		} else {
1445 			nm_bound_var(&hostna->num_tx_rings, 1, 1,
1446 				nma_get_nrings(hwna, NR_TX), NULL);
1447 			nm_bound_var(&hostna->num_rx_rings, 1, 1,
1448 				nma_get_nrings(hwna, NR_RX), NULL);
1449 		}
1450 		for_rx_tx(t) {
1451 			enum txrx r = nm_txrx_swap(t);
1452 			u_int nr = nma_get_nrings(hostna, t);
1453 
1454 			nma_set_host_nrings(na, t, nr);
1455 			if (nma_get_host_nrings(hwna, t) < nr) {
1456 				nma_set_host_nrings(hwna, t, nr);
1457 			}
1458 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
1459 		}
1460 	}
1461 
1462 	return 0;
1463 }
1464 
1465 /* nm_bufcfg callback for bwrap */
1466 static int
1467 netmap_bwrap_bufcfg(struct netmap_kring *kring, uint64_t target)
1468 {
1469 	struct netmap_adapter *na = kring->na;
1470 	struct netmap_bwrap_adapter *bna =
1471 		(struct netmap_bwrap_adapter *)na;
1472 	struct netmap_adapter *hwna = bna->hwna;
1473 	struct netmap_kring *hwkring;
1474 	enum txrx r;
1475 	int error;
1476 
1477 	/* we need the hw kring that corresponds to the bwrap one:
1478 	 * remember that rx and tx are swapped
1479 	 */
1480 	r = nm_txrx_swap(kring->tx);
1481 	hwkring = NMR(hwna, r)[kring->ring_id];
1482 
1483 	/* copy down the offset information, forward the request
1484 	 * and copy up the results
1485 	 */
1486 	hwkring->offset_mask = kring->offset_mask;
1487 	hwkring->offset_max  = kring->offset_max;
1488 	hwkring->offset_gap  = kring->offset_gap;
1489 
1490 	error = hwkring->nm_bufcfg(hwkring, target);
1491 	if (error)
1492 		return error;
1493 
1494 	kring->hwbuf_len = hwkring->hwbuf_len;
1495 	kring->buf_align = hwkring->buf_align;
1496 
1497 	return 0;
1498 }
1499 
1500 /* nm_krings_create callback for bwrap */
1501 int
1502 netmap_bwrap_krings_create_common(struct netmap_adapter *na)
1503 {
1504 	struct netmap_bwrap_adapter *bna =
1505 		(struct netmap_bwrap_adapter *)na;
1506 	struct netmap_adapter *hwna = bna->hwna;
1507 	struct netmap_adapter *hostna = &bna->host.up;
1508 	int i, error = 0;
1509 	enum txrx t;
1510 
1511 	/* also create the hwna krings */
1512 	error = hwna->nm_krings_create(hwna);
1513 	if (error) {
1514 		return error;
1515 	}
1516 
1517 	/* increment the usage counter for all the hwna krings */
1518 	for_rx_tx(t) {
1519 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1520 			NMR(hwna, t)[i]->users++;
1521 			/* this to prevent deletion of the rings through
1522 			 * our krings, instead of through the hwna ones */
1523 			NMR(na, t)[i]->nr_kflags |= NKR_NEEDRING;
1524 		}
1525 	}
1526 
1527 	/* now create the actual rings */
1528 	error = netmap_mem_rings_create(hwna);
1529 	if (error) {
1530 		goto err_dec_users;
1531 	}
1532 
1533 	/* cross-link the netmap rings
1534 	 * The original number of rings comes from hwna,
1535 	 * rx rings on one side equals tx rings on the other.
1536 	 */
1537 	for_rx_tx(t) {
1538 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1539 		for (i = 0; i < netmap_all_rings(hwna, r); i++) {
1540 			NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots;
1541 			NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring;
1542 		}
1543 	}
1544 
1545 	if (na->na_flags & NAF_HOST_RINGS) {
1546 		/* the hostna rings are the host rings of the bwrap.
1547 		 * The corresponding krings must point back to the
1548 		 * hostna
1549 		 */
1550 		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
1551 		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
1552 		for_rx_tx(t) {
1553 			for (i = 0; i < nma_get_nrings(hostna, t); i++) {
1554 				NMR(hostna, t)[i]->na = hostna;
1555 			}
1556 		}
1557 	}
1558 
1559 	return 0;
1560 
1561 err_dec_users:
1562 	for_rx_tx(t) {
1563 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1564 			NMR(hwna, t)[i]->users--;
1565 			NMR(na, t)[i]->users--;
1566 		}
1567 	}
1568 	hwna->nm_krings_delete(hwna);
1569 	return error;
1570 }
1571 
1572 
1573 void
1574 netmap_bwrap_krings_delete_common(struct netmap_adapter *na)
1575 {
1576 	struct netmap_bwrap_adapter *bna =
1577 		(struct netmap_bwrap_adapter *)na;
1578 	struct netmap_adapter *hwna = bna->hwna;
1579 	enum txrx t;
1580 	int i;
1581 
1582 	nm_prdis("%s", na->name);
1583 
1584 	/* decrement the usage counter for all the hwna krings */
1585 	for_rx_tx(t) {
1586 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1587 			NMR(hwna, t)[i]->users--;
1588 			NMR(na, t)[i]->users--;
1589 		}
1590 	}
1591 
1592 	/* delete any netmap rings that are no longer needed */
1593 	netmap_mem_rings_delete(hwna);
1594 	hwna->nm_krings_delete(hwna);
1595 }
1596 
1597 
1598 /* notify method for the bridge-->hwna direction */
1599 int
1600 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
1601 {
1602 	struct netmap_adapter *na = kring->na;
1603 	struct netmap_bwrap_adapter *bna = na->na_private;
1604 	struct netmap_adapter *hwna = bna->hwna;
1605 	u_int ring_n = kring->ring_id;
1606 	u_int lim = kring->nkr_num_slots - 1;
1607 	struct netmap_kring *hw_kring;
1608 	int error;
1609 
1610 	nm_prdis("%s: na %s hwna %s",
1611 			(kring ? kring->name : "NULL!"),
1612 			(na ? na->name : "NULL!"),
1613 			(hwna ? hwna->name : "NULL!"));
1614 	hw_kring = hwna->tx_rings[ring_n];
1615 
1616 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
1617 		return ENXIO;
1618 	}
1619 
1620 	/* first step: simulate a user wakeup on the rx ring */
1621 	netmap_vp_rxsync(kring, flags);
1622 	nm_prdis("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1623 		na->name, ring_n,
1624 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1625 		kring->rhead, kring->rcur, kring->rtail,
1626 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1627 	/* second step: the new packets are sent on the tx ring
1628 	 * (which is actually the same ring)
1629 	 */
1630 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
1631 	error = hw_kring->nm_sync(hw_kring, flags);
1632 	if (error)
1633 		goto put_out;
1634 
1635 	/* third step: now we are back the rx ring */
1636 	/* claim ownership on all hw owned bufs */
1637 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
1638 
1639 	/* fourth step: the user goes to sleep again, causing another rxsync */
1640 	netmap_vp_rxsync(kring, flags);
1641 	nm_prdis("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1642 		na->name, ring_n,
1643 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1644 		kring->rhead, kring->rcur, kring->rtail,
1645 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1646 put_out:
1647 	nm_kr_put(hw_kring);
1648 
1649 	return error ? error : NM_IRQ_COMPLETED;
1650 }
1651 
1652 
1653 /* nm_bdg_ctl callback for the bwrap.
1654  * Called on bridge-attach and detach, as an effect of valectl -[ahd].
1655  * On attach, it needs to provide a fake netmap_priv_d structure and
1656  * perform a netmap_do_regif() on the bwrap. This will put both the
1657  * bwrap and the hwna in netmap mode, with the netmap rings shared
1658  * and cross linked. Moroever, it will start intercepting interrupts
1659  * directed to hwna.
1660  */
1661 static int
1662 netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
1663 {
1664 	struct netmap_priv_d *npriv;
1665 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1666 	int error = 0;
1667 
1668 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
1669 		struct nmreq_vale_attach *req =
1670 			(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
1671 		if (req->reg.nr_ringid != 0 ||
1672 			(req->reg.nr_mode != NR_REG_ALL_NIC &&
1673 				req->reg.nr_mode != NR_REG_NIC_SW)) {
1674 			/* We only support attaching all the NIC rings
1675 			 * and/or the host stack. */
1676 			return EINVAL;
1677 		}
1678 		if (NETMAP_OWNED_BY_ANY(na)) {
1679 			return EBUSY;
1680 		}
1681 		if (bna->na_kpriv) {
1682 			/* nothing to do */
1683 			return 0;
1684 		}
1685 		npriv = netmap_priv_new();
1686 		if (npriv == NULL)
1687 			return ENOMEM;
1688 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
1689 		error = netmap_do_regif(npriv, na, hdr);
1690 		if (error) {
1691 			netmap_priv_delete(npriv);
1692 			netmap_mem_restore(bna->hwna);
1693 			return error;
1694 		}
1695 		bna->na_kpriv = npriv;
1696 		na->na_flags |= NAF_BUSY;
1697 	} else {
1698 		if (na->active_fds == 0) /* not registered */
1699 			return EINVAL;
1700 		netmap_priv_delete(bna->na_kpriv);
1701 		bna->na_kpriv = NULL;
1702 		na->na_flags &= ~NAF_BUSY;
1703 		netmap_mem_restore(bna->hwna);
1704 	}
1705 
1706 	return error;
1707 }
1708 
1709 /* attach a bridge wrapper to the 'real' device */
1710 int
1711 netmap_bwrap_attach_common(struct netmap_adapter *na,
1712 		struct netmap_adapter *hwna)
1713 {
1714 	struct netmap_bwrap_adapter *bna;
1715 	struct netmap_adapter *hostna = NULL;
1716 	int error = 0;
1717 	enum txrx t;
1718 
1719 	/* make sure the NIC is not already in use */
1720 	if (NETMAP_OWNED_BY_ANY(hwna)) {
1721 		nm_prerr("NIC %s busy, cannot attach to bridge", hwna->name);
1722 		return EBUSY;
1723 	}
1724 
1725 	bna = (struct netmap_bwrap_adapter *)na;
1726 	/* make bwrap ifp point to the real ifp */
1727 	na->ifp = hwna->ifp;
1728 	if_ref(na->ifp);
1729 	na->na_private = bna;
1730 	/* fill the ring data for the bwrap adapter with rx/tx meanings
1731 	 * swapped. The real cross-linking will be done during register,
1732 	 * when all the krings will have been created.
1733 	 */
1734 	for_rx_tx(t) {
1735 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1736 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
1737 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
1738 	}
1739 	na->nm_dtor = netmap_bwrap_dtor;
1740 	na->nm_config = netmap_bwrap_config;
1741 	na->nm_bufcfg = netmap_bwrap_bufcfg;
1742 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
1743 	na->pdev = hwna->pdev;
1744 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
1745 	na->virt_hdr_len = hwna->virt_hdr_len;
1746 	na->rx_buf_maxsize = hwna->rx_buf_maxsize;
1747 
1748 	bna->hwna = hwna;
1749 	netmap_adapter_get(hwna);
1750 	hwna->na_private = bna; /* weak reference */
1751 	bna->saved_na_vp = hwna->na_vp;
1752 	hwna->na_vp = &bna->up;
1753 	bna->up.up.na_vp = &(bna->up);
1754 
1755 	if (hwna->na_flags & NAF_HOST_RINGS) {
1756 		if (hwna->na_flags & NAF_SW_ONLY)
1757 			na->na_flags |= NAF_SW_ONLY;
1758 		na->na_flags |= NAF_HOST_RINGS;
1759 		hostna = &bna->host.up;
1760 
1761 		snprintf(hostna->name, sizeof(hostna->name), "%s^", na->name);
1762 		hostna->ifp = hwna->ifp;
1763 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
1764 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
1765 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
1766 		hostna->na_private = bna;
1767 		hostna->na_vp = &bna->up;
1768 		na->na_hostvp = hwna->na_hostvp =
1769 			hostna->na_hostvp = &bna->host;
1770 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
1771 		hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
1772 		/* bwrap_config() will determine the number of host rings */
1773 	}
1774 	if (hwna->na_flags & NAF_MOREFRAG)
1775 		na->na_flags |= NAF_MOREFRAG;
1776 
1777 	nm_prdis("%s<->%s txr %d txd %d rxr %d rxd %d",
1778 		na->name, if_name(ifp),
1779 		na->num_tx_rings, na->num_tx_desc,
1780 		na->num_rx_rings, na->num_rx_desc);
1781 
1782 	error = netmap_attach_common(na);
1783 	if (error) {
1784 		goto err_put;
1785 	}
1786 	hwna->na_flags |= NAF_BUSY;
1787 	return 0;
1788 
1789 err_put:
1790 	hwna->na_vp = hwna->na_hostvp = NULL;
1791 	netmap_adapter_put(hwna);
1792 	return error;
1793 
1794 }
1795 
1796 struct nm_bridge *
1797 netmap_init_bridges2(u_int n)
1798 {
1799 	int i;
1800 	struct nm_bridge *b;
1801 
1802 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
1803 	if (b == NULL)
1804 		return NULL;
1805 	for (i = 0; i < n; i++)
1806 		BDG_RWINIT(&b[i]);
1807 	return b;
1808 }
1809 
1810 void
1811 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
1812 {
1813 	int i;
1814 
1815 	if (b == NULL)
1816 		return;
1817 
1818 	for (i = 0; i < n; i++)
1819 		BDG_RWDESTROY(&b[i]);
1820 	nm_os_free(b);
1821 }
1822 
1823 int
1824 netmap_init_bridges(void)
1825 {
1826 #ifdef CONFIG_NET_NS
1827 	return netmap_bns_register();
1828 #else
1829 	nm_bridges = netmap_init_bridges2(vale_max_bridges);
1830 	if (nm_bridges == NULL)
1831 		return ENOMEM;
1832 	return 0;
1833 #endif
1834 }
1835 
1836 void
1837 netmap_uninit_bridges(void)
1838 {
1839 #ifdef CONFIG_NET_NS
1840 	netmap_bns_unregister();
1841 #else
1842 	netmap_uninit_bridges2(nm_bridges, vale_max_bridges);
1843 #endif
1844 }
1845