xref: /freebsd/sys/dev/netmap/netmap_bdg.c (revision 42249ef2)
1 /*
2  * Copyright (C) 2013-2016 Universita` di Pisa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 
28 /*
29  * This module implements the VALE switch for netmap
30 
31 --- VALE SWITCH ---
32 
33 NMG_LOCK() serializes all modifications to switches and ports.
34 A switch cannot be deleted until all ports are gone.
35 
36 For each switch, an SX lock (RWlock on linux) protects
37  deletion of ports. When configuring a new port or deleting an existing one,
38  the lock is acquired in exclusive mode (after holding NMG_LOCK).
39 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
40 The lock is held throughout the entire forwarding cycle,
41 during which the thread may incur a page fault.
42 Hence it is important that sleepable shared locks are used.
43 
44 On the rx ring, the per-port lock is grabbed initially to reserve
45 a number of slots in the ring, then the lock is released,
46 packets are copied from source to destination, and then
47 the lock is acquired again and the receive ring is updated.
48 (A similar thing is done on the tx ring for NIC and host stack
49 ports attached to the switch)
50 
51  */
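
/*
 * A minimal sketch of the locking pattern described above (illustrative
 * only; the forwarding-path helpers are simplified and the real
 * forwarding code lives in the VALE switching logic):
 *
 *	// control path: add or remove a port
 *	NMG_LOCK();
 *	BDG_WLOCK(b);
 *	... modify b->bdg_ports[] and b->bdg_port_index[] ...
 *	BDG_WUNLOCK(b);
 *	NMG_UNLOCK();
 *
 *	// data path: forward a batch of packets
 *	BDG_RLOCK(b);	// sleepable shared lock, NMG_LOCK not needed
 *	... look up destination ports and copy buffers (may page fault) ...
 *	BDG_RUNLOCK(b);
 */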
52 
53 /*
54  * OS-specific code that is used only within this file.
55  * Other OS-specific code that must be accessed by drivers
56  * is present in netmap_kern.h
57  */
58 
59 #if defined(__FreeBSD__)
60 #include <sys/cdefs.h> /* prerequisite */
61 __FBSDID("$FreeBSD$");
62 
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h>	/* defines used in kernel.h */
66 #include <sys/kernel.h>	/* types used in module initialization */
67 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
68 #include <sys/sockio.h>
69 #include <sys/socketvar.h>	/* struct socket */
70 #include <sys/malloc.h>
71 #include <sys/poll.h>
72 #include <sys/rwlock.h>
73 #include <sys/socket.h> /* sockaddrs */
74 #include <sys/selinfo.h>
75 #include <sys/sysctl.h>
76 #include <net/if.h>
77 #include <net/if_var.h>
78 #include <net/bpf.h>		/* BIOCIMMEDIATE */
79 #include <machine/bus.h>	/* bus_dmamap_* */
80 #include <sys/endian.h>
81 #include <sys/refcount.h>
82 #include <sys/smp.h>
83 
84 
85 #elif defined(linux)
86 
87 #include "bsd_glue.h"
88 
89 #elif defined(__APPLE__)
90 
91 #warning OSX support is only partial
92 #include "osx_glue.h"
93 
94 #elif defined(_WIN32)
95 #include "win_glue.h"
96 
97 #else
98 
99 #error	Unsupported platform
100 
101 #endif /* unsupported */
102 
103 /*
104  * common headers
105  */
106 
107 #include <net/netmap.h>
108 #include <dev/netmap/netmap_kern.h>
109 #include <dev/netmap/netmap_mem2.h>
110 
111 #include <dev/netmap/netmap_bdg.h>
112 
113 const char*
114 netmap_bdg_name(struct netmap_vp_adapter *vp)
115 {
116 	struct nm_bridge *b = vp->na_bdg;
117 	if (b == NULL)
118 		return NULL;
119 	return b->bdg_basename;
120 }
121 
122 
123 #ifndef CONFIG_NET_NS
124 /*
125  * XXX in principle nm_bridges could be created dynamically
126  * Right now we have a static array and deletions are protected
127  * by an exclusive lock.
128  */
129 struct nm_bridge *nm_bridges;
130 #endif /* !CONFIG_NET_NS */
131 
132 
133 static int
134 nm_is_id_char(const char c)
135 {
136 	return (c >= 'a' && c <= 'z') ||
137 	       (c >= 'A' && c <= 'Z') ||
138 	       (c >= '0' && c <= '9') ||
139 	       (c == '_');
140 }
141 
142 /* Validate the name of a bdg port and return the
143  * position of the ":" character. */
144 static int
145 nm_bdg_name_validate(const char *name, size_t prefixlen)
146 {
147 	int colon_pos = -1;
148 	int i;
149 
150 	if (!name || strlen(name) < prefixlen) {
151 		return -1;
152 	}
153 
154 	for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
155 		if (name[i] == ':') {
156 			colon_pos = i;
157 			break;
158 		} else if (!nm_is_id_char(name[i])) {
159 			return -1;
160 		}
161 	}
162 
163 	if (strlen(name) - colon_pos > IFNAMSIZ) {
164 		/* interface name too long */
165 		return -1;
166 	}
167 
168 	return colon_pos;
169 }
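
/*
 * For example (assuming the "vale" prefix), "vale1:eth0" validates and
 * returns 5, the position of the ':'. A name with characters outside
 * [A-Za-z0-9_] before the ':', or whose interface part is longer than
 * IFNAMSIZ, is rejected with -1.
 */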
170 
171 /*
172  * locate a bridge among the existing ones.
173  * MUST BE CALLED WITH NMG_LOCK()
174  *
175  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
176  * We assume that this is called with a name of at least NM_NAME chars.
177  */
178 struct nm_bridge *
179 nm_find_bridge(const char *name, int create, struct netmap_bdg_ops *ops)
180 {
181 	int i, namelen;
182 	struct nm_bridge *b = NULL, *bridges;
183 	u_int num_bridges;
184 
185 	NMG_LOCK_ASSERT();
186 
187 	netmap_bns_getbridges(&bridges, &num_bridges);
188 
189 	namelen = nm_bdg_name_validate(name,
190 			(ops != NULL ? strlen(ops->name) : 0));
191 	if (namelen < 0) {
192 		nm_prerr("invalid bridge name %s", name ? name : "(null)");
193 		return NULL;
194 	}
195 
196 	/* lookup the name, remember empty slot if there is one */
197 	for (i = 0; i < num_bridges; i++) {
198 		struct nm_bridge *x = bridges + i;
199 
200 		if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
201 			if (create && b == NULL)
202 				b = x;	/* record empty slot */
203 		} else if (x->bdg_namelen != namelen) {
204 			continue;
205 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
206 			nm_prdis("found '%.*s' at %d", namelen, name, i);
207 			b = x;
208 			break;
209 		}
210 	}
211 	if (i == num_bridges && b) { /* name not found, can create entry */
212 		/* initialize the bridge */
213 		nm_prdis("create new bridge %s with ports %d", b->bdg_basename,
214 			b->bdg_active_ports);
215 		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
216 		if (b->ht == NULL) {
217 			nm_prerr("failed to allocate hash table");
218 			return NULL;
219 		}
220 		strncpy(b->bdg_basename, name, namelen);
221 		b->bdg_namelen = namelen;
222 		b->bdg_active_ports = 0;
223 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
224 			b->bdg_port_index[i] = i;
225 		/* set the default function */
226 		b->bdg_ops = b->bdg_saved_ops = *ops;
227 		b->private_data = b->ht;
228 		b->bdg_flags = 0;
229 		NM_BNS_GET(b);
230 	}
231 	return b;
232 }
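
/*
 * For example, with the default "vale" prefix, both "vale0:eth0" and
 * "vale0:" select (or, when create != 0, allocate) the bridge named
 * "vale0": only the part of the name before the ':' is compared here,
 * the port part is handled by the callers.
 */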
233 
234 
235 int
236 netmap_bdg_free(struct nm_bridge *b)
237 {
238 	if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
239 		return EBUSY;
240 	}
241 
242 	nm_prdis("marking bridge %s as free", b->bdg_basename);
243 	nm_os_free(b->ht);
244 	memset(&b->bdg_ops, 0, sizeof(b->bdg_ops));
245 	memset(&b->bdg_saved_ops, 0, sizeof(b->bdg_saved_ops));
246 	b->bdg_flags = 0;
247 	NM_BNS_PUT(b);
248 	return 0;
249 }
250 
251 /* Called by external kernel modules (e.g., Openvswitch)
252  * to modify the private data previously given to regops().
253  * 'name' may be just the bridge's name (including ':' if it
254  * is not just NM_BDG_NAME).
255  * Called without NMG_LOCK.
256  */
257 int
258 netmap_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
259 	void *callback_data, void *auth_token)
260 {
261 	void *private_data = NULL;
262 	struct nm_bridge *b;
263 	int error = 0;
264 
265 	NMG_LOCK();
266 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
267 	if (!b) {
268 		error = EINVAL;
269 		goto unlock_update_priv;
270 	}
271 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
272 		error = EACCES;
273 		goto unlock_update_priv;
274 	}
275 	BDG_WLOCK(b);
276 	private_data = callback(b->private_data, callback_data, &error);
277 	b->private_data = private_data;
278 	BDG_WUNLOCK(b);
279 
280 unlock_update_priv:
281 	NMG_UNLOCK();
282 	return error;
283 }
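
/*
 * A minimal usage sketch for an external module (the callback and
 * variable names are hypothetical, shown only to illustrate the
 * bdg_update_private_data_fn_t contract):
 *
 *	static void *
 *	my_update_cb(void *cur_private, void *cb_data, int *error)
 *	{
 *		// inspect or replace the bridge private data here,
 *		// setting *error on failure
 *		return cur_private;
 *	}
 *
 *	err = netmap_bdg_update_private_data("vale0:", my_update_cb,
 *			NULL, auth_token);
 */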
284 
285 
286 
287 /* remove from bridge b the ports in slots hw and sw
288  * (sw can be -1 if not needed)
289  */
290 void
291 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
292 {
293 	int s_hw = hw, s_sw = sw;
294 	int i, lim = b->bdg_active_ports;
295 	uint32_t *tmp = b->tmp_bdg_port_index;
296 
297 	/*
298 	 * New algorithm:
299 	 * make a copy of bdg_port_index;
300 	 * lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
301 	 * in the array of bdg_port_index, replacing them with
302 	 * entries from the bottom of the array;
303 	 * decrement bdg_active_ports;
304 	 * acquire BDG_WLOCK() and copy back the array.
305 	 */
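
	/*
	 * Worked example (illustrative numbers): with
	 * bdg_port_index = {5, 2, 7, 3}, bdg_active_ports = 4 and hw = 2,
	 * the entry 2 is found at i = 1, lim drops to 3 and the entry is
	 * swapped with tmp[3]; the active prefix becomes {5, 3, 7} while
	 * the now-inactive bottom slot holds 2.
	 */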
306 
307 	if (netmap_debug & NM_DEBUG_BDG)
308 		nm_prinf("detach %d and %d (lim %d)", hw, sw, lim);
309 	/* make a copy of the list of active ports, update it,
310 	 * and then copy back within BDG_WLOCK().
311 	 */
312 	memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
313 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
314 		if (hw >= 0 && tmp[i] == hw) {
315 			nm_prdis("detach hw %d at %d", hw, i);
316 			lim--; /* point to last active port */
317 			tmp[i] = tmp[lim]; /* swap with i */
318 			tmp[lim] = hw;	/* now this is inactive */
319 			hw = -1;
320 		} else if (sw >= 0 && tmp[i] == sw) {
321 			nm_prdis("detach sw %d at %d", sw, i);
322 			lim--;
323 			tmp[i] = tmp[lim];
324 			tmp[lim] = sw;
325 			sw = -1;
326 		} else {
327 			i++;
328 		}
329 	}
330 	if (hw >= 0 || sw >= 0) {
331 		nm_prerr("delete failed hw %d sw %d, should panic...", hw, sw);
332 	}
333 
334 	BDG_WLOCK(b);
335 	if (b->bdg_ops.dtor)
336 		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
337 	b->bdg_ports[s_hw] = NULL;
338 	if (s_sw >= 0) {
339 		b->bdg_ports[s_sw] = NULL;
340 	}
341 	memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
342 	b->bdg_active_ports = lim;
343 	BDG_WUNLOCK(b);
344 
345 	nm_prdis("now %d active ports", lim);
346 	netmap_bdg_free(b);
347 }
348 
349 
350 /* nm_bdg_ctl callback for VALE ports */
351 int
352 netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
353 {
354 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
355 	struct nm_bridge *b = vpna->na_bdg;
356 
357 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
358 		return 0; /* nothing to do */
359 	}
360 	if (b) {
361 		netmap_set_all_rings(na, 0 /* disable */);
362 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
363 		vpna->na_bdg = NULL;
364 		netmap_set_all_rings(na, 1 /* enable */);
365 	}
366 	/* I took the reference just for the attach */
367 	netmap_adapter_put(na);
368 	return 0;
369 }
370 
371 int
372 netmap_default_bdg_attach(const char *name, struct netmap_adapter *na,
373 		struct nm_bridge *b)
374 {
375 	return NM_NEED_BWRAP;
376 }
377 
378 /* Try to get a reference to a netmap adapter attached to a VALE switch.
379  * If the adapter is found (or is created), this function returns 0, a
380  * non NULL pointer is returned into *na, and the caller holds a
381  * reference to the adapter.
382  * If an adapter is not found, then no reference is grabbed and the
383  * function returns an error code, or 0 if there is just a VALE prefix
384  * mismatch. Therefore the caller holds a reference when
385  * (*na != NULL && return == 0).
386  */
387 int
388 netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
389 	struct netmap_mem_d *nmd, int create, struct netmap_bdg_ops *ops)
390 {
391 	char *nr_name = hdr->nr_name;
392 	const char *ifname;
393 	struct ifnet *ifp = NULL;
394 	int error = 0;
395 	struct netmap_vp_adapter *vpna, *hostna = NULL;
396 	struct nm_bridge *b;
397 	uint32_t i, j;
398 	uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
399 	int needed;
400 
401 	*na = NULL;     /* default return value */
402 
403 	/* first try to see if this is a bridge port. */
404 	NMG_LOCK_ASSERT();
405 	if (strncmp(nr_name, ops->name, strlen(ops->name) - 1)) {
406 		return 0;  /* no error, but no VALE prefix */
407 	}
408 
409 	b = nm_find_bridge(nr_name, create, ops);
410 	if (b == NULL) {
411 		nm_prdis("no bridges available for '%s'", nr_name);
412 		return (create ? ENOMEM : ENXIO);
413 	}
414 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
415 		panic("x");
416 
417 	/* Now we are sure that name starts with the bridge's name,
418 	 * lookup the port in the bridge. We need to scan the entire
419 	 * list. It is not important to hold a WLOCK on the bridge
420 	 * during the search because NMG_LOCK already guarantees
421 	 * that there are no other possible writers.
422 	 */
423 
424 	/* lookup in the local list of ports */
425 	for (j = 0; j < b->bdg_active_ports; j++) {
426 		i = b->bdg_port_index[j];
427 		vpna = b->bdg_ports[i];
428 		nm_prdis("checking %s", vpna->up.name);
429 		if (!strcmp(vpna->up.name, nr_name)) {
430 			netmap_adapter_get(&vpna->up);
431 			nm_prdis("found existing if %s refs %d", nr_name, vpna->up.na_refcount);
432 			*na = &vpna->up;
433 			return 0;
434 		}
435 	}
436 	/* not found, should we create it? */
437 	if (!create)
438 		return ENXIO;
439 	/* yes we should, see if we have space to attach entries */
440 	needed = 2; /* in some cases we only need 1 */
441 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
442 		nm_prerr("bridge full %d, cannot create new port", b->bdg_active_ports);
443 		return ENOMEM;
444 	}
445 	/* record the next two ports available, but do not allocate yet */
446 	cand = b->bdg_port_index[b->bdg_active_ports];
447 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
448 	nm_prdis("+++ bridge %s port %s used %d avail %d %d",
449 		b->bdg_basename, nr_name, b->bdg_active_ports, cand, cand2);
450 
451 	/*
452 	 * try to see if there is a matching NIC with this name
453 	 * (after the bridge's name)
454 	 */
455 	ifname = nr_name + b->bdg_namelen + 1;
456 	ifp = ifunit_ref(ifname);
457 	if (!ifp) {
458 		/* Create an ephemeral virtual port.
459 		 * This block contains all the ephemeral-specific logic.
460 		 */
461 
462 		if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
463 			error = EINVAL;
464 			goto out;
465 		}
466 
467 		/* bdg_netmap_attach creates a struct netmap_adapter */
468 		error = b->bdg_ops.vp_create(hdr, NULL, nmd, &vpna);
469 		if (error) {
470 			if (netmap_debug & NM_DEBUG_BDG)
471 				nm_prerr("error %d", error);
472 			goto out;
473 		}
474 		/* shortcut - we can skip get_hw_na(),
475 		 * ownership check and nm_bdg_attach()
476 		 */
477 
478 	} else {
479 		struct netmap_adapter *hw;
480 
481 		/* the vale:nic syntax is only valid for some commands */
482 		switch (hdr->nr_reqtype) {
483 		case NETMAP_REQ_VALE_ATTACH:
484 		case NETMAP_REQ_VALE_DETACH:
485 		case NETMAP_REQ_VALE_POLLING_ENABLE:
486 		case NETMAP_REQ_VALE_POLLING_DISABLE:
487 			break; /* ok */
488 		default:
489 			error = EINVAL;
490 			goto out;
491 		}
492 
493 		error = netmap_get_hw_na(ifp, nmd, &hw);
494 		if (error || hw == NULL)
495 			goto out;
496 
497 		/* host adapter might not be created */
498 		error = hw->nm_bdg_attach(nr_name, hw, b);
499 		if (error == NM_NEED_BWRAP) {
500 			error = b->bdg_ops.bwrap_attach(nr_name, hw);
501 		}
502 		if (error)
503 			goto out;
504 		vpna = hw->na_vp;
505 		hostna = hw->na_hostvp;
506 		if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
507 			/* Check if we need to skip the host rings. */
508 			struct nmreq_vale_attach *areq =
509 				(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
510 			if (areq->reg.nr_mode != NR_REG_NIC_SW) {
511 				hostna = NULL;
512 			}
513 		}
514 	}
515 
516 	BDG_WLOCK(b);
517 	vpna->bdg_port = cand;
518 	nm_prdis("NIC  %p to bridge port %d", vpna, cand);
519 	/* bind the port to the bridge (virtual ports are not active) */
520 	b->bdg_ports[cand] = vpna;
521 	vpna->na_bdg = b;
522 	b->bdg_active_ports++;
523 	if (hostna != NULL) {
524 		/* also bind the host stack to the bridge */
525 		b->bdg_ports[cand2] = hostna;
526 		hostna->bdg_port = cand2;
527 		hostna->na_bdg = b;
528 		b->bdg_active_ports++;
529 		nm_prdis("host %p to bridge port %d", hostna, cand2);
530 	}
531 	nm_prdis("if %s refs %d", ifname, vpna->up.na_refcount);
532 	BDG_WUNLOCK(b);
533 	*na = &vpna->up;
534 	netmap_adapter_get(*na);
535 
536 out:
537 	if (ifp)
538 		if_rele(ifp);
539 
540 	return error;
541 }
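
/*
 * Typical caller pattern for the reference rules documented above
 * (sketch only, must run under NMG_LOCK(), error handling reduced to
 * the essentials):
 *
 *	struct netmap_adapter *na = NULL;
 *
 *	error = netmap_get_bdg_na(hdr, &na, nmd, create, ops);
 *	if (error)
 *		return error;	// lookup or creation failed
 *	if (na == NULL)
 *		return 0;	// not a VALE name, try other backends
 *	... use na: the caller now owns a reference ...
 *	netmap_adapter_put(na);
 */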
542 
543 
544 int
545 nm_is_bwrap(struct netmap_adapter *na)
546 {
547 	return na->nm_register == netmap_bwrap_reg;
548 }
549 
550 
551 struct nm_bdg_polling_state;
552 struct
553 nm_bdg_kthread {
554 	struct nm_kctx *nmk;
555 	u_int qfirst;
556 	u_int qlast;
557 	struct nm_bdg_polling_state *bps;
558 };
559 
560 struct nm_bdg_polling_state {
561 	bool configured;
562 	bool stopped;
563 	struct netmap_bwrap_adapter *bna;
564 	uint32_t mode;
565 	u_int qfirst;
566 	u_int qlast;
567 	u_int cpu_from;
568 	u_int ncpus;
569 	struct nm_bdg_kthread *kthreads;
570 };
571 
572 static void
573 netmap_bwrap_polling(void *data)
574 {
575 	struct nm_bdg_kthread *nbk = data;
576 	struct netmap_bwrap_adapter *bna;
577 	u_int qfirst, qlast, i;
578 	struct netmap_kring **kring0, *kring;
579 
580 	if (!nbk)
581 		return;
582 	qfirst = nbk->qfirst;
583 	qlast = nbk->qlast;
584 	bna = nbk->bps->bna;
585 	kring0 = NMR(bna->hwna, NR_RX);
586 
587 	for (i = qfirst; i < qlast; i++) {
588 		kring = kring0[i];
589 		kring->nm_notify(kring, 0);
590 	}
591 }
592 
593 static int
594 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
595 {
596 	struct nm_kctx_cfg kcfg;
597 	int i, j;
598 
599 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
600 	if (bps->kthreads == NULL)
601 		return ENOMEM;
602 
603 	bzero(&kcfg, sizeof(kcfg));
604 	kcfg.worker_fn = netmap_bwrap_polling;
605 	for (i = 0; i < bps->ncpus; i++) {
606 		struct nm_bdg_kthread *t = bps->kthreads + i;
607 		int all = (bps->ncpus == 1 &&
608 			bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
609 		int affinity = bps->cpu_from + i;
610 
611 		t->bps = bps;
612 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
613 		t->qlast = all ? bps->qlast : t->qfirst + 1;
614 		if (netmap_verbose)
615 			nm_prinf("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
616 				t->qlast);
617 
618 		kcfg.type = i;
619 		kcfg.worker_private = t;
620 		t->nmk = nm_os_kctx_create(&kcfg, NULL);
621 		if (t->nmk == NULL) {
622 			goto cleanup;
623 		}
624 		nm_os_kctx_worker_setaff(t->nmk, affinity);
625 	}
626 	return 0;
627 
628 cleanup:
629 	for (j = 0; j < i; j++) {
630 		struct nm_bdg_kthread *t = bps->kthreads + j;
631 		nm_os_kctx_destroy(t->nmk);
632 	}
633 	nm_os_free(bps->kthreads);
634 	return EFAULT;
635 }
636 
637 /* A variant of ptnetmap_start_kthreads() */
638 static int
639 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
640 {
641 	int error, i, j;
642 
643 	if (!bps) {
644 		nm_prerr("polling is not configured");
645 		return EFAULT;
646 	}
647 	bps->stopped = false;
648 
649 	for (i = 0; i < bps->ncpus; i++) {
650 		struct nm_bdg_kthread *t = bps->kthreads + i;
651 		error = nm_os_kctx_worker_start(t->nmk);
652 		if (error) {
653 			nm_prerr("error in nm_kthread_start(): %d", error);
654 			goto cleanup;
655 		}
656 	}
657 	return 0;
658 
659 cleanup:
660 	for (j = 0; j < i; j++) {
661 		struct nm_bdg_kthread *t = bps->kthreads + j;
662 		nm_os_kctx_worker_stop(t->nmk);
663 	}
664 	bps->stopped = true;
665 	return error;
666 }
667 
668 static void
669 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
670 {
671 	int i;
672 
673 	if (!bps)
674 		return;
675 
676 	for (i = 0; i < bps->ncpus; i++) {
677 		struct nm_bdg_kthread *t = bps->kthreads + i;
678 		nm_os_kctx_worker_stop(t->nmk);
679 		nm_os_kctx_destroy(t->nmk);
680 	}
681 	bps->stopped = true;
682 }
683 
684 static int
685 get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
686 		struct nm_bdg_polling_state *bps)
687 {
688 	unsigned int avail_cpus, core_from;
689 	unsigned int qfirst, qlast;
690 	uint32_t i = req->nr_first_cpu_id;
691 	uint32_t req_cpus = req->nr_num_polling_cpus;
692 
693 	avail_cpus = nm_os_ncpus();
694 
695 	if (req_cpus == 0) {
696 		nm_prerr("req_cpus must be > 0");
697 		return EINVAL;
698 	} else if (req_cpus >= avail_cpus) {
699 		nm_prerr("Cannot use all the CPUs in the system");
700 		return EINVAL;
701 	}
702 
703 	if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
704 		/* Use a separate core for each ring. If nr_num_polling_cpus > 1,
705 		 * multiple consecutive rings are polled.
706 		 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
707 		 * ring 2 and 3 are polled by core 2 and 3, respectively. */
708 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
709 			nm_prerr("Rings %u-%u not in range (have %d rings)",
710 				i, i + req_cpus, nma_get_nrings(na, NR_RX));
711 			return EINVAL;
712 		}
713 		qfirst = i;
714 		qlast = qfirst + req_cpus;
715 		core_from = qfirst;
716 
717 	} else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
718 		/* Poll all the rings using a core specified by nr_first_cpu_id.
719 		 * the number of cores must be 1. */
720 		if (req_cpus != 1) {
721 			nm_prerr("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
722 				"(was %d)", req_cpus);
723 			return EINVAL;
724 		}
725 		qfirst = 0;
726 		qlast = nma_get_nrings(na, NR_RX);
727 		core_from = i;
728 	} else {
729 		nm_prerr("Invalid polling mode");
730 		return EINVAL;
731 	}
732 
733 	bps->mode = req->nr_mode;
734 	bps->qfirst = qfirst;
735 	bps->qlast = qlast;
736 	bps->cpu_from = core_from;
737 	bps->ncpus = req_cpus;
738 	nm_prinf("%s qfirst %u qlast %u cpu_from %u ncpus %u",
739 		req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
740 		"MULTI" : "SINGLE",
741 		qfirst, qlast, core_from, req_cpus);
742 	return 0;
743 }
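
/*
 * Summary of the two modes with illustrative values, assuming an
 * adapter with 4 rx rings:
 *
 *	MULTI_CPU,  first_cpu_id=2, ncpus=2 -> qfirst=2, qlast=4, cpu_from=2
 *	            (each ring polled by its own core)
 *	SINGLE_CPU, first_cpu_id=3, ncpus=1 -> qfirst=0, qlast=4, cpu_from=3
 *	            (one core polls all the rings)
 */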
744 
745 static int
746 nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na)
747 {
748 	struct nm_bdg_polling_state *bps;
749 	struct netmap_bwrap_adapter *bna;
750 	int error;
751 
752 	bna = (struct netmap_bwrap_adapter *)na;
753 	if (bna->na_polling_state) {
754 		nm_prerr("ERROR adapter already in polling mode");
755 		return EFAULT;
756 	}
757 
758 	bps = nm_os_malloc(sizeof(*bps));
759 	if (!bps)
760 		return ENOMEM;
761 	bps->configured = false;
762 	bps->stopped = true;
763 
764 	if (get_polling_cfg(req, na, bps)) {
765 		nm_os_free(bps);
766 		return EINVAL;
767 	}
768 
769 	if (nm_bdg_create_kthreads(bps)) {
770 		nm_os_free(bps);
771 		return EFAULT;
772 	}
773 
774 	bps->configured = true;
775 	bna->na_polling_state = bps;
776 	bps->bna = bna;
777 
778 	/* disable interrupts if possible */
779 	nma_intr_enable(bna->hwna, 0);
780 	/* start kthread now */
781 	error = nm_bdg_polling_start_kthreads(bps);
782 	if (error) {
783 		nm_prerr("ERROR nm_bdg_polling_start_kthread()");
784 		nm_os_free(bps->kthreads);
785 		nm_os_free(bps);
786 		bna->na_polling_state = NULL;
787 		nma_intr_enable(bna->hwna, 1);
788 	}
789 	return error;
790 }
791 
792 static int
793 nm_bdg_ctl_polling_stop(struct netmap_adapter *na)
794 {
795 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
796 	struct nm_bdg_polling_state *bps;
797 
798 	if (!bna->na_polling_state) {
799 		nm_prerr("ERROR adapter is not in polling mode");
800 		return EFAULT;
801 	}
802 	bps = bna->na_polling_state;
803 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
804 	bps->configured = false;
805 	nm_os_free(bps);
806 	bna->na_polling_state = NULL;
807 	/* reenable interrupts */
808 	nma_intr_enable(bna->hwna, 1);
809 	return 0;
810 }
811 
812 int
813 nm_bdg_polling(struct nmreq_header *hdr)
814 {
815 	struct nmreq_vale_polling *req =
816 		(struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body;
817 	struct netmap_adapter *na = NULL;
818 	int error = 0;
819 
820 	NMG_LOCK();
821 	error = netmap_get_vale_na(hdr, &na, NULL, /*create=*/0);
822 	if (na && !error) {
823 		if (!nm_is_bwrap(na)) {
824 			error = EOPNOTSUPP;
825 		} else if (hdr->nr_reqtype == NETMAP_BDG_POLLING_ON) {
826 			error = nm_bdg_ctl_polling_start(req, na);
827 			if (!error)
828 				netmap_adapter_get(na);
829 		} else {
830 			error = nm_bdg_ctl_polling_stop(na);
831 			if (!error)
832 				netmap_adapter_put(na);
833 		}
834 		netmap_adapter_put(na);
835 	} else if (!na && !error) {
836 		/* Not VALE port. */
837 		error = EINVAL;
838 	}
839 	NMG_UNLOCK();
840 
841 	return error;
842 }
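
/*
 * Sketch of the body a control program might attach to a polling-enable
 * request for a port like "vale0:em0" (illustrative values; the fields
 * are the ones consumed by get_polling_cfg() above):
 *
 *	struct nmreq_vale_polling req = {
 *		.nr_mode = NETMAP_POLLING_MODE_MULTI_CPU,
 *		.nr_first_cpu_id = 2,
 *		.nr_num_polling_cpus = 2,
 *	};
 *
 *	// the request header carries nr_name = "vale0:em0", the
 *	// polling-enable request type and nr_body = (uintptr_t)&req,
 *	// and eventually reaches nm_bdg_polling() above.
 */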
843 
844 /* Called by external kernel modules (e.g., Openvswitch)
845  * to set the configure/lookup/dtor functions of a VALE instance.
846  * Register callbacks to the given bridge. 'name' may be just
847  * bridge's name (including ':' if it is not just NM_BDG_NAME).
848  *
849  * Called without NMG_LOCK.
850  */
851 
852 int
853 netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
854 {
855 	struct nm_bridge *b;
856 	int error = 0;
857 
858 	NMG_LOCK();
859 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
860 	if (!b) {
861 		error = ENXIO;
862 		goto unlock_regops;
863 	}
864 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
865 		error = EACCES;
866 		goto unlock_regops;
867 	}
868 
869 	BDG_WLOCK(b);
870 	if (!bdg_ops) {
871 		/* resetting the bridge */
872 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
873 		b->bdg_ops = b->bdg_saved_ops;
874 		b->private_data = b->ht;
875 	} else {
876 		/* modifying the bridge */
877 		b->private_data = private_data;
878 #define nm_bdg_override(m) if (bdg_ops->m) b->bdg_ops.m = bdg_ops->m
879 		nm_bdg_override(lookup);
880 		nm_bdg_override(config);
881 		nm_bdg_override(dtor);
882 		nm_bdg_override(vp_create);
883 		nm_bdg_override(bwrap_attach);
884 #undef nm_bdg_override
885 
886 	}
887 	BDG_WUNLOCK(b);
888 
889 unlock_regops:
890 	NMG_UNLOCK();
891 	return error;
892 }
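
/*
 * Sketch of how an external module might install its own lookup
 * function and later restore the defaults (hypothetical names, error
 * handling omitted):
 *
 *	static struct netmap_bdg_ops my_ops = {
 *		.lookup = my_lookup,
 *	};
 *
 *	error = netmap_bdg_regops("vale0:", &my_ops, my_private, token);
 *	...
 *	error = netmap_bdg_regops("vale0:", NULL, NULL, token); // reset
 */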
893 
894 
895 int
896 netmap_bdg_config(struct nm_ifreq *nr)
897 {
898 	struct nm_bridge *b;
899 	int error = EINVAL;
900 
901 	NMG_LOCK();
902 	b = nm_find_bridge(nr->nifr_name, 0, NULL);
903 	if (!b) {
904 		NMG_UNLOCK();
905 		return error;
906 	}
907 	NMG_UNLOCK();
908 	/* Don't call config() with NMG_LOCK() held */
909 	BDG_RLOCK(b);
910 	if (b->bdg_ops.config != NULL)
911 		error = b->bdg_ops.config(nr);
912 	BDG_RUNLOCK(b);
913 	return error;
914 }
915 
916 
917 /* nm_register callback for VALE ports */
918 int
919 netmap_vp_reg(struct netmap_adapter *na, int onoff)
920 {
921 	struct netmap_vp_adapter *vpna =
922 		(struct netmap_vp_adapter*)na;
923 
924 	/* persistent ports may be put in netmap mode
925 	 * before being attached to a bridge
926 	 */
927 	if (vpna->na_bdg)
928 		BDG_WLOCK(vpna->na_bdg);
929 	if (onoff) {
930 		netmap_krings_mode_commit(na, onoff);
931 		if (na->active_fds == 0)
932 			na->na_flags |= NAF_NETMAP_ON;
933 		 /* XXX on FreeBSD, persistent VALE ports should also
934 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
935 		 */
936 	} else {
937 		if (na->active_fds == 0)
938 			na->na_flags &= ~NAF_NETMAP_ON;
939 		netmap_krings_mode_commit(na, onoff);
940 	}
941 	if (vpna->na_bdg)
942 		BDG_WUNLOCK(vpna->na_bdg);
943 	return 0;
944 }
945 
946 
947 /* rxsync code used by the VALE ports' nm_rxsync callback and also
948  * internally by the bwrap
949  */
950 static int
951 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
952 {
953 	struct netmap_adapter *na = kring->na;
954 	struct netmap_ring *ring = kring->ring;
955 	u_int nm_i, lim = kring->nkr_num_slots - 1;
956 	u_int head = kring->rhead;
957 	int n;
958 
959 	if (head > lim) {
960 		nm_prerr("ouch dangerous reset!!!");
961 		n = netmap_ring_reinit(kring);
962 		goto done;
963 	}
964 
965 	/* First part, import newly received packets. */
966 	/* actually nothing to do here, they are already in the kring */
967 
968 	/* Second part, skip past packets that userspace has released. */
969 	nm_i = kring->nr_hwcur;
970 	if (nm_i != head) {
971 		/* consistency check, but nothing really important here */
972 		for (n = 0; likely(nm_i != head); n++) {
973 			struct netmap_slot *slot = &ring->slot[nm_i];
974 			void *addr = NMB(na, slot);
975 
976 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
977 				nm_prerr("bad buffer index %d, ignore ?",
978 					slot->buf_idx);
979 			}
980 			slot->flags &= ~NS_BUF_CHANGED;
981 			nm_i = nm_next(nm_i, lim);
982 		}
983 		kring->nr_hwcur = head;
984 	}
985 
986 	n = 0;
987 done:
988 	return n;
989 }
990 
991 /*
992  * nm_rxsync callback for VALE ports
993  * user process reading from a VALE switch.
994  * Already protected against concurrent calls from userspace,
995  * but we must acquire the queue's lock to protect against
996  * writers on the same queue.
997  */
998 int
999 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
1000 {
1001 	int n;
1002 
1003 	mtx_lock(&kring->q_lock);
1004 	n = netmap_vp_rxsync_locked(kring, flags);
1005 	mtx_unlock(&kring->q_lock);
1006 	return n;
1007 }
1008 
1009 int
1010 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna,
1011 		struct netmap_bdg_ops *ops)
1012 {
1013 	return ops->bwrap_attach(nr_name, hwna);
1014 }
1015 
1016 
1017 /* Bridge wrapper code (bwrap).
1018  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
1019  * VALE switch.
1020  * The main task is to swap the meaning of tx and rx rings to match the
1021  * expectations of the VALE switch code (see nm_bdg_flush).
1022  *
1023  * The bwrap works by interposing a netmap_bwrap_adapter between the
1024  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
1025  * a netmap_vp_adapter to the rest the system, but, internally, it
1026  * translates all callbacks to what the hwna expects.
1027  *
1028  * Note that we have to intercept callbacks coming from two sides:
1029  *
1030  *  - callbacks coming from the netmap module are intercepted by
1031  *    passing around the netmap_bwrap_adapter instead of the hwna
1032  *
1033  *  - callbacks coming from outside of the netmap module only know
1034  *    about the hwna. This, however, only happens in interrupt
1035  *    handlers, where only the hwna->nm_notify callback is called.
1036  *    What the bwrap does is to overwrite the hwna->nm_notify callback
1037  *    with its own netmap_bwrap_intr_notify.
1038  *    XXX This assumes that the hwna->nm_notify callback was the
1039  *    standard netmap_notify(), as is the case for NIC adapters.
1040  *    Any additional action performed by hwna->nm_notify will not be
1041  *    performed by netmap_bwrap_intr_notify.
1042  *
1043  * Additionally, the bwrap can optionally attach the host rings pair
1044  * of the wrapped adapter to a different port of the switch.
1045  */
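
/*
 * Roughly, the resulting arrangement is (sketch, host rings omitted):
 *
 *	netmap module / VALE switch         device interrupt handlers
 *	            |                                   |
 *	            v                                   v
 *	   netmap_bwrap_adapter  <---  hwna rx krings, whose nm_notify is
 *	            |                  overwritten with
 *	            v                  netmap_bwrap_intr_notify()
 *	          hwna (the real NIC adapter)
 */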
1046 
1047 
1048 static void
1049 netmap_bwrap_dtor(struct netmap_adapter *na)
1050 {
1051 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1052 	struct netmap_adapter *hwna = bna->hwna;
1053 	struct nm_bridge *b = bna->up.na_bdg,
1054 		*bh = bna->host.na_bdg;
1055 
1056 	if (bna->host.up.nm_mem)
1057 		netmap_mem_put(bna->host.up.nm_mem);
1058 
1059 	if (b) {
1060 		netmap_bdg_detach_common(b, bna->up.bdg_port,
1061 			    (bh ? bna->host.bdg_port : -1));
1062 	}
1063 
1064 	nm_prdis("na %p", na);
1065 	na->ifp = NULL;
1066 	bna->host.up.ifp = NULL;
1067 	hwna->na_vp = bna->saved_na_vp;
1068 	hwna->na_hostvp = NULL;
1069 	hwna->na_private = NULL;
1070 	hwna->na_flags &= ~NAF_BUSY;
1071 	netmap_adapter_put(hwna);
1072 
1073 }
1074 
1075 
1076 /*
1077  * Intr callback for NICs connected to a bridge.
1078  * Simply ignore tx interrupts (maybe we could try to recover space ?)
1079  * and pass received packets from nic to the bridge.
1080  *
1081  * XXX TODO check locking: this is called from the interrupt
1082  * handler so we should make sure that the interface is not
1083  * disconnected while passing down an interrupt.
1084  *
1085  * Note, no user process can access this NIC or the host stack.
1086  * The only part of the ring that is significant are the slots,
1087  * and head/cur/tail are set from the kring as needed
1088  * (part as a receive ring, part as a transmit ring).
1089  *
1090  * callback that overwrites the hwna notify callback.
1091  * Packets come from the outside or from the host stack and are put on an
1092  * hwna rx ring.
1093  * The bridge wrapper then sends the packets through the bridge.
1094  */
1095 static int
1096 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
1097 {
1098 	struct netmap_adapter *na = kring->na;
1099 	struct netmap_bwrap_adapter *bna = na->na_private;
1100 	struct netmap_kring *bkring;
1101 	struct netmap_vp_adapter *vpna = &bna->up;
1102 	u_int ring_nr = kring->ring_id;
1103 	int ret = NM_IRQ_COMPLETED;
1104 	int error;
1105 
1106 	if (netmap_debug & NM_DEBUG_RXINTR)
1107 	    nm_prinf("%s %s 0x%x", na->name, kring->name, flags);
1108 
1109 	bkring = vpna->up.tx_rings[ring_nr];
1110 
1111 	/* make sure the ring is not disabled */
1112 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
1113 		return EIO;
1114 	}
1115 
1116 	if (netmap_debug & NM_DEBUG_RXINTR)
1117 	    nm_prinf("%s head %d cur %d tail %d",  na->name,
1118 		kring->rhead, kring->rcur, kring->rtail);
1119 
1120 	/* simulate a user wakeup on the rx ring
1121 	 * fetch packets that have arrived.
1122 	 */
1123 	error = kring->nm_sync(kring, 0);
1124 	if (error)
1125 		goto put_out;
1126 	if (kring->nr_hwcur == kring->nr_hwtail) {
1127 		if (netmap_verbose)
1128 			nm_prlim(1, "interrupt with no packets on %s",
1129 				kring->name);
1130 		goto put_out;
1131 	}
1132 
1133 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
1134 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
1135 	 * to push all packets out.
1136 	 */
1137 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
1138 
1139 	bkring->nm_sync(bkring, flags);
1140 
1141 	/* mark all buffers as released on this ring */
1142 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
1143 	/* another call to actually release the buffers */
1144 	error = kring->nm_sync(kring, 0);
1145 
1146 	/* The second rxsync may have further advanced hwtail. If this happens,
1147 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
1148 	if (kring->rcur != kring->nr_hwtail) {
1149 		ret = NM_IRQ_RESCHED;
1150 	}
1151 put_out:
1152 	nm_kr_put(kring);
1153 
1154 	return error ? error : ret;
1155 }
1156 
1157 
1158 /* nm_register callback for bwrap */
1159 int
1160 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
1161 {
1162 	struct netmap_bwrap_adapter *bna =
1163 		(struct netmap_bwrap_adapter *)na;
1164 	struct netmap_adapter *hwna = bna->hwna;
1165 	struct netmap_vp_adapter *hostna = &bna->host;
1166 	int error, i;
1167 	enum txrx t;
1168 
1169 	nm_prdis("%s %s", na->name, onoff ? "on" : "off");
1170 
1171 	if (onoff) {
1172 		/* netmap_do_regif has been called on the bwrap na.
1173 		 * We need to pass the information about the
1174 		 * memory allocator down to the hwna before
1175 		 * putting it in netmap mode
1176 		 */
1177 		hwna->na_lut = na->na_lut;
1178 
1179 		if (hostna->na_bdg) {
1180 			/* if the host rings have been attached to switch,
1181 			 * we need to copy the memory allocator information
1182 			 * in the hostna also
1183 			 */
1184 			hostna->up.na_lut = na->na_lut;
1185 		}
1186 
1187 	}
1188 
1189 	/* pass down the pending ring state information */
1190 	for_rx_tx(t) {
1191 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1192 			NMR(hwna, nm_txrx_swap(t))[i]->nr_pending_mode =
1193 				NMR(na, t)[i]->nr_pending_mode;
1194 		}
1195 	}
1196 
1197 	/* forward the request to the hwna */
1198 	error = hwna->nm_register(hwna, onoff);
1199 	if (error)
1200 		return error;
1201 
1202 	/* copy up the current ring state information */
1203 	for_rx_tx(t) {
1204 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1205 			struct netmap_kring *kring = NMR(hwna, nm_txrx_swap(t))[i];
1206 			NMR(na, t)[i]->nr_mode = kring->nr_mode;
1207 		}
1208 	}
1209 
1210 	/* impersonate a netmap_vp_adapter */
1211 	netmap_vp_reg(na, onoff);
1212 	if (hostna->na_bdg)
1213 		netmap_vp_reg(&hostna->up, onoff);
1214 
1215 	if (onoff) {
1216 		u_int i;
1217 		/* intercept the hwna nm_notify callback on the hw rings */
1218 		for (i = 0; i < hwna->num_rx_rings; i++) {
1219 			hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
1220 			hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
1221 		}
1222 		i = hwna->num_rx_rings; /* for safety */
1223 		/* save the host ring notify unconditionally */
1224 		for (; i < netmap_real_rings(hwna, NR_RX); i++) {
1225 			hwna->rx_rings[i]->save_notify =
1226 				hwna->rx_rings[i]->nm_notify;
1227 			if (hostna->na_bdg) {
1228 				/* also intercept the host ring notify */
1229 				hwna->rx_rings[i]->nm_notify =
1230 					netmap_bwrap_intr_notify;
1231 				na->tx_rings[i]->nm_sync = na->nm_txsync;
1232 			}
1233 		}
1234 		if (na->active_fds == 0)
1235 			na->na_flags |= NAF_NETMAP_ON;
1236 	} else {
1237 		u_int i;
1238 
1239 		if (na->active_fds == 0)
1240 			na->na_flags &= ~NAF_NETMAP_ON;
1241 
1242 		/* reset all notify callbacks (including host ring) */
1243 		for (i = 0; i < netmap_all_rings(hwna, NR_RX); i++) {
1244 			hwna->rx_rings[i]->nm_notify =
1245 				hwna->rx_rings[i]->save_notify;
1246 			hwna->rx_rings[i]->save_notify = NULL;
1247 		}
1248 		hwna->na_lut.lut = NULL;
1249 		hwna->na_lut.plut = NULL;
1250 		hwna->na_lut.objtotal = 0;
1251 		hwna->na_lut.objsize = 0;
1252 
1253 		/* pass ownership of the netmap rings to the hwna */
1254 		for_rx_tx(t) {
1255 			for (i = 0; i < netmap_all_rings(na, t); i++) {
1256 				NMR(na, t)[i]->ring = NULL;
1257 			}
1258 		}
1259 		/* reset the number of host rings to default */
1260 		for_rx_tx(t) {
1261 			nma_set_host_nrings(hwna, t, 1);
1262 		}
1263 
1264 	}
1265 
1266 	return 0;
1267 }
1268 
1269 /* nm_config callback for bwrap */
1270 static int
1271 netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
1272 {
1273 	struct netmap_bwrap_adapter *bna =
1274 		(struct netmap_bwrap_adapter *)na;
1275 	struct netmap_adapter *hwna = bna->hwna;
1276 	int error;
1277 
1278 	/* Forward the request to the hwna. It may happen that nobody
1279 	 * registered hwna yet, so netmap_mem_get_lut() may have not
1280 	 * been called yet. */
1281 	error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut);
1282 	if (error)
1283 		return error;
1284 	netmap_update_config(hwna);
1285 	/* swap the results and propagate */
1286 	info->num_tx_rings = hwna->num_rx_rings;
1287 	info->num_tx_descs = hwna->num_rx_desc;
1288 	info->num_rx_rings = hwna->num_tx_rings;
1289 	info->num_rx_descs = hwna->num_tx_desc;
1290 	info->rx_buf_maxsize = hwna->rx_buf_maxsize;
1291 
1292 	return 0;
1293 }
1294 
1295 
1296 /* nm_krings_create callback for bwrap */
1297 int
1298 netmap_bwrap_krings_create_common(struct netmap_adapter *na)
1299 {
1300 	struct netmap_bwrap_adapter *bna =
1301 		(struct netmap_bwrap_adapter *)na;
1302 	struct netmap_adapter *hwna = bna->hwna;
1303 	struct netmap_adapter *hostna = &bna->host.up;
1304 	int i, error = 0;
1305 	enum txrx t;
1306 
1307 	/* also create the hwna krings */
1308 	error = hwna->nm_krings_create(hwna);
1309 	if (error) {
1310 		return error;
1311 	}
1312 
1313 	/* increment the usage counter for all the hwna krings */
1314 	for_rx_tx(t) {
1315 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1316 			NMR(hwna, t)[i]->users++;
1317 		}
1318 	}
1319 
1320 	/* now create the actual rings */
1321 	error = netmap_mem_rings_create(hwna);
1322 	if (error) {
1323 		goto err_dec_users;
1324 	}
1325 
1326 	/* cross-link the netmap rings
1327 	 * The original number of rings comes from hwna,
1328 	 * rx rings on one side equals tx rings on the other.
1329 	 */
1330 	for_rx_tx(t) {
1331 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1332 		for (i = 0; i < netmap_all_rings(hwna, r); i++) {
1333 			NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots;
1334 			NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring;
1335 		}
1336 	}
1337 
1338 	if (na->na_flags & NAF_HOST_RINGS) {
1339 		/* the hostna rings are the host rings of the bwrap.
1340 		 * The corresponding krings must point back to the
1341 		 * hostna
1342 		 */
1343 		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
1344 		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
1345 		for_rx_tx(t) {
1346 			for (i = 0; i < nma_get_nrings(hostna, t); i++) {
1347 				NMR(hostna, t)[i]->na = hostna;
1348 			}
1349 		}
1350 	}
1351 
1352 	return 0;
1353 
1354 err_dec_users:
1355 	for_rx_tx(t) {
1356 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1357 			NMR(hwna, t)[i]->users--;
1358 		}
1359 	}
1360 	hwna->nm_krings_delete(hwna);
1361 	return error;
1362 }
1363 
1364 
1365 void
1366 netmap_bwrap_krings_delete_common(struct netmap_adapter *na)
1367 {
1368 	struct netmap_bwrap_adapter *bna =
1369 		(struct netmap_bwrap_adapter *)na;
1370 	struct netmap_adapter *hwna = bna->hwna;
1371 	enum txrx t;
1372 	int i;
1373 
1374 	nm_prdis("%s", na->name);
1375 
1376 	/* decrement the usage counter for all the hwna krings */
1377 	for_rx_tx(t) {
1378 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1379 			NMR(hwna, t)[i]->users--;
1380 		}
1381 	}
1382 
1383 	/* delete any netmap rings that are no longer needed */
1384 	netmap_mem_rings_delete(hwna);
1385 	hwna->nm_krings_delete(hwna);
1386 }
1387 
1388 
1389 /* notify method for the bridge-->hwna direction */
1390 int
1391 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
1392 {
1393 	struct netmap_adapter *na = kring->na;
1394 	struct netmap_bwrap_adapter *bna = na->na_private;
1395 	struct netmap_adapter *hwna = bna->hwna;
1396 	u_int ring_n = kring->ring_id;
1397 	u_int lim = kring->nkr_num_slots - 1;
1398 	struct netmap_kring *hw_kring;
1399 	int error;
1400 
1401 	nm_prdis("%s: na %s hwna %s",
1402 			(kring ? kring->name : "NULL!"),
1403 			(na ? na->name : "NULL!"),
1404 			(hwna ? hwna->name : "NULL!"));
1405 	hw_kring = hwna->tx_rings[ring_n];
1406 
1407 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
1408 		return ENXIO;
1409 	}
1410 
1411 	/* first step: simulate a user wakeup on the rx ring */
1412 	netmap_vp_rxsync(kring, flags);
1413 	nm_prdis("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1414 		na->name, ring_n,
1415 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1416 		kring->rhead, kring->rcur, kring->rtail,
1417 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1418 	/* second step: the new packets are sent on the tx ring
1419 	 * (which is actually the same ring)
1420 	 */
1421 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
1422 	error = hw_kring->nm_sync(hw_kring, flags);
1423 	if (error)
1424 		goto put_out;
1425 
1426 	/* third step: now we are back on the rx ring */
1427 	/* claim ownership on all hw owned bufs */
1428 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
1429 
1430 	/* fourth step: the user goes to sleep again, causing another rxsync */
1431 	netmap_vp_rxsync(kring, flags);
1432 	nm_prdis("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1433 		na->name, ring_n,
1434 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1435 		kring->rhead, kring->rcur, kring->rtail,
1436 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1437 put_out:
1438 	nm_kr_put(hw_kring);
1439 
1440 	return error ? error : NM_IRQ_COMPLETED;
1441 }
1442 
1443 
1444 /* nm_bdg_ctl callback for the bwrap.
1445  * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
1446  * On attach, it needs to provide a fake netmap_priv_d structure and
1447  * perform a netmap_do_regif() on the bwrap. This will put both the
1448  * bwrap and the hwna in netmap mode, with the netmap rings shared
1449  * and cross-linked. Moreover, it will start intercepting interrupts
1450  * directed to hwna.
1451  */
1452 static int
1453 netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
1454 {
1455 	struct netmap_priv_d *npriv;
1456 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1457 	int error = 0;
1458 
1459 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
1460 		struct nmreq_vale_attach *req =
1461 			(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
1462 		if (req->reg.nr_ringid != 0 ||
1463 			(req->reg.nr_mode != NR_REG_ALL_NIC &&
1464 				req->reg.nr_mode != NR_REG_NIC_SW)) {
1465 			/* We only support attaching all the NIC rings
1466 			 * and/or the host stack. */
1467 			return EINVAL;
1468 		}
1469 		if (NETMAP_OWNED_BY_ANY(na)) {
1470 			return EBUSY;
1471 		}
1472 		if (bna->na_kpriv) {
1473 			/* nothing to do */
1474 			return 0;
1475 		}
1476 		npriv = netmap_priv_new();
1477 		if (npriv == NULL)
1478 			return ENOMEM;
1479 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
1480 		error = netmap_do_regif(npriv, na, req->reg.nr_mode,
1481 					req->reg.nr_ringid, req->reg.nr_flags);
1482 		if (error) {
1483 			netmap_priv_delete(npriv);
1484 			return error;
1485 		}
1486 		bna->na_kpriv = npriv;
1487 		na->na_flags |= NAF_BUSY;
1488 	} else {
1489 		if (na->active_fds == 0) /* not registered */
1490 			return EINVAL;
1491 		netmap_priv_delete(bna->na_kpriv);
1492 		bna->na_kpriv = NULL;
1493 		na->na_flags &= ~NAF_BUSY;
1494 	}
1495 
1496 	return error;
1497 }
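
/*
 * For example, an attach issued as "vale-ctl -a vale0:em0" reaches this
 * callback with NETMAP_REQ_VALE_ATTACH and nr_mode == NR_REG_ALL_NIC;
 * requesting the host stack rings as well corresponds to NR_REG_NIC_SW,
 * and "vale-ctl -d vale0:em0" takes the detach branch.
 */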
1498 
1499 /* attach a bridge wrapper to the 'real' device */
1500 int
1501 netmap_bwrap_attach_common(struct netmap_adapter *na,
1502 		struct netmap_adapter *hwna)
1503 {
1504 	struct netmap_bwrap_adapter *bna;
1505 	struct netmap_adapter *hostna = NULL;
1506 	int error = 0;
1507 	enum txrx t;
1508 
1509 	/* make sure the NIC is not already in use */
1510 	if (NETMAP_OWNED_BY_ANY(hwna)) {
1511 		nm_prerr("NIC %s busy, cannot attach to bridge", hwna->name);
1512 		return EBUSY;
1513 	}
1514 
1515 	bna = (struct netmap_bwrap_adapter *)na;
1516 	/* make bwrap ifp point to the real ifp */
1517 	na->ifp = hwna->ifp;
1518 	if_ref(na->ifp);
1519 	na->na_private = bna;
1520 	/* fill the ring data for the bwrap adapter with rx/tx meanings
1521 	 * swapped. The real cross-linking will be done during register,
1522 	 * when all the krings will have been created.
1523 	 */
1524 	for_rx_tx(t) {
1525 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1526 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
1527 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
1528 	}
1529 	na->nm_dtor = netmap_bwrap_dtor;
1530 	na->nm_config = netmap_bwrap_config;
1531 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
1532 	na->pdev = hwna->pdev;
1533 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
1534 	na->virt_hdr_len = hwna->virt_hdr_len;
1535 	na->rx_buf_maxsize = hwna->rx_buf_maxsize;
1536 
1537 	bna->hwna = hwna;
1538 	netmap_adapter_get(hwna);
1539 	hwna->na_private = bna; /* weak reference */
1540 	bna->saved_na_vp = hwna->na_vp;
1541 	hwna->na_vp = &bna->up;
1542 	bna->up.up.na_vp = &(bna->up);
1543 
1544 	if (hwna->na_flags & NAF_HOST_RINGS) {
1545 		if (hwna->na_flags & NAF_SW_ONLY)
1546 			na->na_flags |= NAF_SW_ONLY;
1547 		na->na_flags |= NAF_HOST_RINGS;
1548 		hostna = &bna->host.up;
1549 
1550 		/* limit the number of host rings to that of hw */
1551 		nm_bound_var(&hostna->num_tx_rings, 1, 1,
1552 				nma_get_nrings(hwna, NR_TX), NULL);
1553 		nm_bound_var(&hostna->num_rx_rings, 1, 1,
1554 				nma_get_nrings(hwna, NR_RX), NULL);
1555 
1556 		snprintf(hostna->name, sizeof(hostna->name), "%s^", na->name);
1557 		hostna->ifp = hwna->ifp;
1558 		for_rx_tx(t) {
1559 			enum txrx r = nm_txrx_swap(t);
1560 			u_int nr = nma_get_nrings(hostna, t);
1561 
1562 			nma_set_nrings(hostna, t, nr);
1563 			nma_set_host_nrings(na, t, nr);
1564 			if (nma_get_host_nrings(hwna, t) < nr) {
1565 				nma_set_host_nrings(hwna, t, nr);
1566 			}
1567 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
1568 		}
1569 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
1570 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
1571 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
1572 		hostna->na_private = bna;
1573 		hostna->na_vp = &bna->up;
1574 		na->na_hostvp = hwna->na_hostvp =
1575 			hostna->na_hostvp = &bna->host;
1576 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
1577 		hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
1578 	}
1579 	if (hwna->na_flags & NAF_MOREFRAG)
1580 		na->na_flags |= NAF_MOREFRAG;
1581 
1582 	nm_prdis("%s<->%s txr %d txd %d rxr %d rxd %d",
1583 		na->name, hwna->ifp->if_xname,
1584 		na->num_tx_rings, na->num_tx_desc,
1585 		na->num_rx_rings, na->num_rx_desc);
1586 
1587 	error = netmap_attach_common(na);
1588 	if (error) {
1589 		goto err_put;
1590 	}
1591 	hwna->na_flags |= NAF_BUSY;
1592 	return 0;
1593 
1594 err_put:
1595 	hwna->na_vp = hwna->na_hostvp = NULL;
1596 	netmap_adapter_put(hwna);
1597 	return error;
1598 
1599 }
1600 
1601 struct nm_bridge *
1602 netmap_init_bridges2(u_int n)
1603 {
1604 	int i;
1605 	struct nm_bridge *b;
1606 
1607 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
1608 	if (b == NULL)
1609 		return NULL;
1610 	for (i = 0; i < n; i++)
1611 		BDG_RWINIT(&b[i]);
1612 	return b;
1613 }
1614 
1615 void
1616 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
1617 {
1618 	int i;
1619 
1620 	if (b == NULL)
1621 		return;
1622 
1623 	for (i = 0; i < n; i++)
1624 		BDG_RWDESTROY(&b[i]);
1625 	nm_os_free(b);
1626 }
1627 
1628 int
1629 netmap_init_bridges(void)
1630 {
1631 #ifdef CONFIG_NET_NS
1632 	return netmap_bns_register();
1633 #else
1634 	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
1635 	if (nm_bridges == NULL)
1636 		return ENOMEM;
1637 	return 0;
1638 #endif
1639 }
1640 
1641 void
1642 netmap_uninit_bridges(void)
1643 {
1644 #ifdef CONFIG_NET_NS
1645 	netmap_bns_unregister();
1646 #else
1647 	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
1648 #endif
1649 }
1650