xref: /freebsd/sys/dev/netmap/netmap.c (revision 0957b409)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2011-2014 Matteo Landi
5  * Copyright (C) 2011-2016 Luigi Rizzo
6  * Copyright (C) 2011-2016 Giuseppe Lettieri
7  * Copyright (C) 2011-2016 Vincenzo Maffione
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *   1. Redistributions of source code must retain the above copyright
14  *      notice, this list of conditions and the following disclaimer.
15  *   2. Redistributions in binary form must reproduce the above copyright
16  *      notice, this list of conditions and the following disclaimer in the
17  *      documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * $FreeBSD$
35  *
36  * This module supports memory mapped access to network devices,
37  * see netmap(4).
38  *
39  * The module uses a large, memory pool allocated by the kernel
40  * and accessible as mmapped memory by multiple userspace threads/processes.
41  * The memory pool contains packet buffers and "netmap rings",
42  * i.e. user-accessible copies of the interface's queues.
43  *
44  * Access to the network card works like this:
45  * 1. a process/thread issues one or more open() on /dev/netmap, to create
46  *    select()able file descriptor on which events are reported.
47  * 2. on each descriptor, the process issues an ioctl() to identify
48  *    the interface that should report events to the file descriptor.
49  * 3. on each descriptor, the process issues an mmap() request to
50  *    map the shared memory region within the process' address space.
51  *    The list of interesting queues is indicated by a location in
52  *    the shared memory region.
53  * 4. using the functions in the netmap(4) userspace API, a process
54  *    can look up the occupation state of a queue, access memory buffers,
55  *    and retrieve received packets or enqueue packets to transmit.
56  * 5. using some ioctl()s the process can synchronize the userspace view
57  *    of the queue with the actual status in the kernel. This includes both
58  *    receiving the notification of new packets, and transmitting new
59  *    packets on the output interface.
60  * 6. select() or poll() can be used to wait for events on individual
61  *    transmit or receive queues (or all queues for a given interface).
62  *
63 
64 		SYNCHRONIZATION (USER)
65 
66 The netmap rings and data structures may be shared among multiple
67 user threads or even independent processes.
68 Any synchronization among those threads/processes is delegated
69 to the threads themselves. Only one thread at a time can be in
70 a system call on the same netmap ring. The OS does not enforce
71 this and only guarantees against system crashes in case of
72 invalid usage.
73 
74 		LOCKING (INTERNAL)
75 
76 Within the kernel, access to the netmap rings is protected as follows:
77 
78 - a spinlock on each ring, to handle producer/consumer races on
79   RX rings attached to the host stack (against multiple host
80   threads writing from the host stack to the same ring),
81   and on 'destination' rings attached to a VALE switch
82   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
83   protecting multiple active senders for the same destination)
84 
85 - an atomic variable to guarantee that there is at most one
86   instance of *_*xsync() on the ring at any time.
87   For rings connected to user file
88   descriptors, an atomic_test_and_set() protects this, and the
89   lock on the ring is not actually used.
90   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91   is also used to prevent multiple executions (the driver might indeed
92   already guarantee this).
93   For NIC TX rings connected to a VALE switch, the lock arbitrates
94   access to the queue (both when allocating buffers and when pushing
95   them out).
96 
97 - *xsync() should be protected against initializations of the card.
98   On FreeBSD most devices have the reset routine protected by
99   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
100   the RING protection on rx_reset(), this should be added.
101 
102   On linux there is an external lock on the tx path, which probably
103   also arbitrates access to the reset routine. XXX to be revised
104 
105 - a per-interface core_lock protecting access from the host stack
106   while interfaces may be detached from netmap mode.
107   XXX there should be no need for this lock if we detach the interfaces
108   only while they are down.
109 
110 
111 --- VALE SWITCH ---
112 
113 NMG_LOCK() serializes all modifications to switches and ports.
114 A switch cannot be deleted until all ports are gone.
115 
116 For each switch, an SX lock (RWlock on linux) protects
117 deletion of ports. When configuring or deleting a new port, the
118 lock is acquired in exclusive mode (after holding NMG_LOCK).
119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
120 The lock is held throughout the entire forwarding cycle,
121 during which the thread may incur in a page fault.
122 Hence it is important that sleepable shared locks are used.
123 
124 On the rx ring, the per-port lock is grabbed initially to reserve
125 a number of slot in the ring, then the lock is released,
126 packets are copied from source to destination, and then
127 the lock is acquired again and the receive ring is updated.
128 (A similar thing is done on the tx ring for NIC and host stack
129 ports attached to the switch)
130 
131  */
132 
133 
134 /* --- internals ----
135  *
136  * Roadmap to the code that implements the above.
137  *
138  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
139  * >    select()able file descriptor on which events are reported.
140  *
141  *  	Internally, we allocate a netmap_priv_d structure, that will be
142  *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143  *  	structure for each open().
144  *
145  *      os-specific:
146  *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
147  *  	    linux:   see linux_netmap_open() (netmap_linux.c)
148  *
149  * > 2. on each descriptor, the process issues an ioctl() to identify
150  * >    the interface that should report events to the file descriptor.
151  *
152  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153  * 	Most important things happen in netmap_get_na() and
154  * 	netmap_do_regif(), called from there. Additional details can be
155  * 	found in the comments above those functions.
156  *
157  * 	In all cases, this action creates/takes-a-reference-to a
158  * 	netmap_*_adapter describing the port, and allocates a netmap_if
159  * 	and all necessary netmap rings, filling them with netmap buffers.
160  *
161  *      In this phase, the sync callbacks for each ring are set (these are used
162  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
163  *      The adapter creation/initialization code puts them in the
164  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
165  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
166  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
167  * 	actually call netmap_krings_create() to perform this and the other
168  * 	common stuff. netmap_krings_create() also takes care of the host rings,
169  * 	if needed, by setting their sync callbacks appropriately.
170  *
171  * 	Additional actions depend on the kind of netmap_adapter that has been
172  * 	registered:
173  *
174  * 	- netmap_hw_adapter:  	     [netmap.c]
175  * 	     This is a system netdev/ifp with native netmap support.
176  * 	     The ifp is detached from the host stack by redirecting:
177  * 	       - transmissions (from the network stack) to netmap_transmit()
178  * 	       - receive notifications to the nm_notify() callback for
179  * 	         this adapter. The callback is normally netmap_notify(), unless
180  * 	         the ifp is attached to a bridge using bwrap, in which case it
181  * 	         is netmap_bwrap_intr_notify().
182  *
183  * 	- netmap_generic_adapter:      [netmap_generic.c]
184  * 	      A system netdev/ifp without native netmap support.
185  *
186  * 	(the decision about native/non native support is taken in
187  * 	 netmap_get_hw_na(), called by netmap_get_na())
188  *
189  * 	- netmap_vp_adapter 		[netmap_vale.c]
190  * 	      Returned by netmap_get_bdg_na().
191  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
192  * 	      are created on the fly if they don't already exist, and are
193  * 	      always attached to a bridge.
194  * 	      Persistent VALE ports must must be created separately, and i
195  * 	      then attached like normal NICs. The NIOCREGIF we are examining
196  * 	      will find them only if they had previosly been created and
197  * 	      attached (see VALE_CTL below).
198  *
199  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
200  * 	      Returned by netmap_get_pipe_na().
201  * 	      Both pipe ends are created, if they didn't already exist.
202  *
203  * 	- netmap_monitor_adapter      [netmap_monitor.c]
204  * 	      Returned by netmap_get_monitor_na().
205  * 	      If successful, the nm_sync callbacks of the monitored adapter
206  * 	      will be intercepted by the returned monitor.
207  *
208  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
209  * 	      Cannot be obtained in this way, see VALE_CTL below
210  *
211  *
212  * 	os-specific:
213  * 	    linux: we first go through linux_netmap_ioctl() to
214  * 	           adapt the FreeBSD interface to the linux one.
215  *
216  *
217  * > 3. on each descriptor, the process issues an mmap() request to
218  * >    map the shared memory region within the process' address space.
219  * >    The list of interesting queues is indicated by a location in
220  * >    the shared memory region.
221  *
222  *      os-specific:
223  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
225  *
226  * > 4. using the functions in the netmap(4) userspace API, a process
227  * >    can look up the occupation state of a queue, access memory buffers,
228  * >    and retrieve received packets or enqueue packets to transmit.
229  *
230  * 	these actions do not involve the kernel.
231  *
232  * > 5. using some ioctl()s the process can synchronize the userspace view
233  * >    of the queue with the actual status in the kernel. This includes both
234  * >    receiving the notification of new packets, and transmitting new
235  * >    packets on the output interface.
236  *
237  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
239  * 	structures, as initialized in step 2 and maybe later modified
240  * 	by a monitor. Monitors, however, will always call the original
241  * 	callback before doing anything else.
242  *
243  *
244  * > 6. select() or poll() can be used to wait for events on individual
245  * >    transmit or receive queues (or all queues for a given interface).
246  *
247  * 	Implemented in netmap_poll(). This will call the same nm_sync()
248  * 	callbacks as in step 5 above.
249  *
250  * 	os-specific:
251  * 		linux: we first go through linux_netmap_poll() to adapt
252  * 		       the FreeBSD interface to the linux one.
253  *
254  *
255  *  ----  VALE_CTL -----
256  *
257  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
258  *  nr_cmd in the nmreq structure. These subcommands are handled by
259  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261  *  subcommands, respectively.
262  *
263  *  Any network interface known to the system (including a persistent VALE
264  *  port) can be attached to a VALE switch by issuing the
265  *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
266  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
267  *  attachment of other interfaces, instead, requires the creation of a
268  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
269  *  netmap mode. This may require the creation of a netmap_generic_adapter if
270  *  we have no native support for the interface, or if generic adapters have
271  *  been forced by sysctl.
272  *
273  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275  *  callback.  In the case of the bwrap, the callback creates the
276  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
277  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279  *  A generic adapter for the wrapped ifp will be created if needed, when
280  *  netmap_get_bdg_na() calls netmap_get_hw_na().
281  *
282  *
283  *  ---- DATAPATHS -----
284  *
285  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286  *
287  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288  *
289  *    - tx from netmap userspace:
290  *	 concurrently:
291  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292  *                kring->nm_sync() == DEVICE_netmap_txsync()
293  *           2) device interrupt handler
294  *                na->nm_notify()  == netmap_notify()
295  *    - rx from netmap userspace:
296  *       concurrently:
297  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298  *                kring->nm_sync() == DEVICE_netmap_rxsync()
299  *           2) device interrupt handler
300  *                na->nm_notify()  == netmap_notify()
301  *    - rx from host stack
302  *       concurrently:
303  *           1) host stack
304  *                netmap_transmit()
305  *                  na->nm_notify  == netmap_notify()
306  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307  *                kring->nm_sync() == netmap_rxsync_from_host
308  *                  netmap_rxsync_from_host(na, NULL, NULL)
309  *    - tx to host stack
310  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
311  *             kring->nm_sync() == netmap_txsync_to_host
312  *               netmap_txsync_to_host(na)
313  *                 nm_os_send_up()
314  *                   FreeBSD: na->if_input() == ether_input()
315  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316  *
317  *
318  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319  *
320  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321  *
322  *    - tx from netmap userspace:
323  *       concurrently:
324  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325  *               kring->nm_sync() == generic_netmap_txsync()
326  *                   nm_os_generic_xmit_frame()
327  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
329  *                               gna->save_start_xmit == orig. dev. start_xmit
330  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
331  *           2) generic_mbuf_destructor()
332  *                   na->nm_notify() == netmap_notify()
333  *    - rx from netmap userspace:
334  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335  *               kring->nm_sync() == generic_netmap_rxsync()
336  *                   mbq_safe_dequeue()
337  *           2) device driver
338  *               generic_rx_handler()
339  *                   mbq_safe_enqueue()
340  *                   na->nm_notify() == netmap_notify()
341  *    - rx from host stack
342  *        FreeBSD: same as native
343  *        Linux: same as native except:
344  *           1) host stack
345  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
347  *                       netmap_transmit()
348  *                           na->nm_notify() == netmap_notify()
349  *    - tx to host stack (same as native):
350  *
351  *
352  *                           -= VALE =-
353  *
354  *   INCOMING:
355  *
356  *      - VALE ports:
357  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
358  *              kring->nm_sync() == netmap_vp_txsync()
359  *
360  *      - system device with native support:
361  *         from cable:
362  *             interrupt
363  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365  *                     netmap_vp_txsync()
366  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
367  *         from host stack:
368  *             netmap_transmit()
369  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370  *                     kring->nm_sync() == netmap_rxsync_from_host()
371  *                     netmap_vp_txsync()
372  *
373  *      - system device with generic support:
374  *         from device driver:
375  *            generic_rx_handler()
376  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377  *                     kring->nm_sync() == generic_netmap_rxsync()
378  *                     netmap_vp_txsync()
379  *                     kring->nm_sync() == generic_netmap_rxsync()
380  *         from host stack:
381  *            netmap_transmit()
382  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383  *                     kring->nm_sync() == netmap_rxsync_from_host()
384  *                     netmap_vp_txsync()
385  *
386  *   (all cases) --> nm_bdg_flush()
387  *                      dest_na->nm_notify() == (see below)
388  *
389  *   OUTGOING:
390  *
391  *      - VALE ports:
392  *         concurrently:
393  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394  *                    kring->nm_sync() == netmap_vp_rxsync()
395  *             2) from nm_bdg_flush()
396  *                    na->nm_notify() == netmap_notify()
397  *
398  *      - system device with native support:
399  *          to cable:
400  *             na->nm_notify() == netmap_bwrap_notify()
401  *                 netmap_vp_rxsync()
402  *                 kring->nm_sync() == DEVICE_netmap_txsync()
403  *                 netmap_vp_rxsync()
404  *          to host stack:
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == netmap_txsync_to_host
407  *                 netmap_vp_rxsync_locked()
408  *
409  *      - system device with generic adapter:
410  *          to device driver:
411  *             na->nm_notify() == netmap_bwrap_notify()
412  *                 netmap_vp_rxsync()
413  *                 kring->nm_sync() == generic_netmap_txsync()
414  *                 netmap_vp_rxsync()
415  *          to host stack:
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == netmap_txsync_to_host
418  *                 netmap_vp_rxsync()
419  *
420  */
421 
422 /*
423  * OS-specific code that is used only within this file.
424  * Other OS-specific code that must be accessed by drivers
425  * is present in netmap_kern.h
426  */
427 
428 #if defined(__FreeBSD__)
429 #include <sys/cdefs.h> /* prerequisite */
430 #include <sys/types.h>
431 #include <sys/errno.h>
432 #include <sys/param.h>	/* defines used in kernel.h */
433 #include <sys/kernel.h>	/* types used in module initialization */
434 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
435 #include <sys/filio.h>	/* FIONBIO */
436 #include <sys/sockio.h>
437 #include <sys/socketvar.h>	/* struct socket */
438 #include <sys/malloc.h>
439 #include <sys/poll.h>
440 #include <sys/rwlock.h>
441 #include <sys/socket.h> /* sockaddrs */
442 #include <sys/selinfo.h>
443 #include <sys/sysctl.h>
444 #include <sys/jail.h>
445 #include <net/vnet.h>
446 #include <net/if.h>
447 #include <net/if_var.h>
448 #include <net/bpf.h>		/* BIOCIMMEDIATE */
449 #include <machine/bus.h>	/* bus_dmamap_* */
450 #include <sys/endian.h>
451 #include <sys/refcount.h>
452 #include <net/ethernet.h>	/* ETHER_BPF_MTAP */
453 
454 
455 #elif defined(linux)
456 
457 #include "bsd_glue.h"
458 
459 #elif defined(__APPLE__)
460 
461 #warning OSX support is only partial
462 #include "osx_glue.h"
463 
464 #elif defined (_WIN32)
465 
466 #include "win_glue.h"
467 
468 #else
469 
470 #error	Unsupported platform
471 
472 #endif /* unsupported */
473 
474 /*
475  * common headers
476  */
477 #include <net/netmap.h>
478 #include <dev/netmap/netmap_kern.h>
479 #include <dev/netmap/netmap_mem2.h>
480 
481 
482 /* user-controlled variables */
483 int netmap_verbose;
484 #ifdef CONFIG_NETMAP_DEBUG
485 int netmap_debug;
486 #endif /* CONFIG_NETMAP_DEBUG */
487 
488 static int netmap_no_timestamp; /* don't timestamp on rxsync */
489 int netmap_no_pendintr = 1;
490 int netmap_txsync_retry = 2;
491 static int netmap_fwd = 0;	/* force transparent forwarding */
492 
493 /*
494  * netmap_admode selects the netmap mode to use.
495  * Invalid values are reset to NETMAP_ADMODE_BEST
496  */
497 enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
498 	NETMAP_ADMODE_NATIVE,	/* either native or none */
499 	NETMAP_ADMODE_GENERIC,	/* force generic */
500 	NETMAP_ADMODE_LAST };
501 static int netmap_admode = NETMAP_ADMODE_BEST;
502 
503 /* netmap_generic_mit controls mitigation of RX notifications for
504  * the generic netmap adapter. The value is a time interval in
505  * nanoseconds. */
506 int netmap_generic_mit = 100*1000;
507 
508 /* We use by default netmap-aware qdiscs with generic netmap adapters,
509  * even if there can be a little performance hit with hardware NICs.
510  * However, using the qdisc is the safer approach, for two reasons:
511  * 1) it prevents non-fifo qdiscs to break the TX notification
512  *    scheme, which is based on mbuf destructors when txqdisc is
513  *    not used.
514  * 2) it makes it possible to transmit over software devices that
515  *    change skb->dev, like bridge, veth, ...
516  *
517  * Anyway users looking for the best performance should
518  * use native adapters.
519  */
520 #ifdef linux
521 int netmap_generic_txqdisc = 1;
522 #endif
523 
524 /* Default number of slots and queues for generic adapters. */
525 int netmap_generic_ringsize = 1024;
526 int netmap_generic_rings = 1;
527 
528 /* Non-zero to enable checksum offloading in NIC drivers */
529 int netmap_generic_hwcsum = 0;
530 
531 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
532 int ptnet_vnet_hdr = 1;
533 
534 /*
535  * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
536  * in some other operating systems
537  */
538 SYSBEGIN(main_init);
539 
540 SYSCTL_DECL(_dev_netmap);
541 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
542 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
543 		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
544 #ifdef CONFIG_NETMAP_DEBUG
545 SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
546 		CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
547 #endif /* CONFIG_NETMAP_DEBUG */
548 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
549 		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
550 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
551 		0, "Always look for new received packets.");
552 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
553 		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
554 
555 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
556 		"Force NR_FORWARD mode");
557 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
558 		"Adapter mode. 0 selects the best option available,"
559 		"1 forces native adapter, 2 forces emulated adapter");
560 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
561 		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default),"
562 		"1 to enable checksum generation by the NIC");
563 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
564 		0, "RX notification interval in nanoseconds");
565 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
566 		&netmap_generic_ringsize, 0,
567 		"Number of per-ring slots for emulated netmap mode");
568 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
569 		&netmap_generic_rings, 0,
570 		"Number of TX/RX queues for emulated netmap adapters");
571 #ifdef linux
572 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
573 		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
574 #endif
575 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
576 		0, "Allow ptnet devices to use virtio-net headers");
577 
578 SYSEND;
579 
580 NMG_LOCK_T	netmap_global_lock;
581 
582 /*
583  * mark the ring as stopped, and run through the locks
584  * to make sure other users get to see it.
585  * stopped must be either NR_KR_STOPPED (for unbounded stop)
586  * of NR_KR_LOCKED (brief stop for mutual exclusion purposes)
587  */
588 static void
589 netmap_disable_ring(struct netmap_kring *kr, int stopped)
590 {
591 	nm_kr_stop(kr, stopped);
592 	// XXX check if nm_kr_stop is sufficient
593 	mtx_lock(&kr->q_lock);
594 	mtx_unlock(&kr->q_lock);
595 	nm_kr_put(kr);
596 }
597 
598 /* stop or enable a single ring */
599 void
600 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
601 {
602 	if (stopped)
603 		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
604 	else
605 		NMR(na, t)[ring_id]->nkr_stopped = 0;
606 }
607 
608 
609 /* stop or enable all the rings of na */
610 void
611 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
612 {
613 	int i;
614 	enum txrx t;
615 
616 	if (!nm_netmap_on(na))
617 		return;
618 
619 	for_rx_tx(t) {
620 		for (i = 0; i < netmap_real_rings(na, t); i++) {
621 			netmap_set_ring(na, i, t, stopped);
622 		}
623 	}
624 }
625 
626 /*
627  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
628  * to finish and prevents any new one from starting.  Call this before turning
629  * netmap mode off, or before removing the hardware rings (e.g., on module
630  * onload).
631  */
632 void
633 netmap_disable_all_rings(struct ifnet *ifp)
634 {
635 	if (NM_NA_VALID(ifp)) {
636 		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
637 	}
638 }
639 
640 /*
641  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
642  * adapter's rings In linux drivers, this should be placed near each
643  * napi_enable().
644  */
645 void
646 netmap_enable_all_rings(struct ifnet *ifp)
647 {
648 	if (NM_NA_VALID(ifp)) {
649 		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
650 	}
651 }
652 
653 void
654 netmap_make_zombie(struct ifnet *ifp)
655 {
656 	if (NM_NA_VALID(ifp)) {
657 		struct netmap_adapter *na = NA(ifp);
658 		netmap_set_all_rings(na, NM_KR_LOCKED);
659 		na->na_flags |= NAF_ZOMBIE;
660 		netmap_set_all_rings(na, 0);
661 	}
662 }
663 
664 void
665 netmap_undo_zombie(struct ifnet *ifp)
666 {
667 	if (NM_NA_VALID(ifp)) {
668 		struct netmap_adapter *na = NA(ifp);
669 		if (na->na_flags & NAF_ZOMBIE) {
670 			netmap_set_all_rings(na, NM_KR_LOCKED);
671 			na->na_flags &= ~NAF_ZOMBIE;
672 			netmap_set_all_rings(na, 0);
673 		}
674 	}
675 }
676 
677 /*
678  * generic bound_checking function
679  */
680 u_int
681 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
682 {
683 	u_int oldv = *v;
684 	const char *op = NULL;
685 
686 	if (dflt < lo)
687 		dflt = lo;
688 	if (dflt > hi)
689 		dflt = hi;
690 	if (oldv < lo) {
691 		*v = dflt;
692 		op = "Bump";
693 	} else if (oldv > hi) {
694 		*v = hi;
695 		op = "Clamp";
696 	}
697 	if (op && msg)
698 		nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
699 	return *v;
700 }
701 
702 
703 /*
704  * packet-dump function, user-supplied or static buffer.
705  * The destination buffer must be at least 30+4*len
706  */
707 const char *
708 nm_dump_buf(char *p, int len, int lim, char *dst)
709 {
710 	static char _dst[8192];
711 	int i, j, i0;
712 	static char hex[] ="0123456789abcdef";
713 	char *o;	/* output position */
714 
715 #define P_HI(x)	hex[((x) & 0xf0)>>4]
716 #define P_LO(x)	hex[((x) & 0xf)]
717 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
718 	if (!dst)
719 		dst = _dst;
720 	if (lim <= 0 || lim > len)
721 		lim = len;
722 	o = dst;
723 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
724 	o += strlen(o);
725 	/* hexdump routine */
726 	for (i = 0; i < lim; ) {
727 		sprintf(o, "%5d: ", i);
728 		o += strlen(o);
729 		memset(o, ' ', 48);
730 		i0 = i;
731 		for (j=0; j < 16 && i < lim; i++, j++) {
732 			o[j*3] = P_HI(p[i]);
733 			o[j*3+1] = P_LO(p[i]);
734 		}
735 		i = i0;
736 		for (j=0; j < 16 && i < lim; i++, j++)
737 			o[j + 48] = P_C(p[i]);
738 		o[j+48] = '\n';
739 		o += j+49;
740 	}
741 	*o = '\0';
742 #undef P_HI
743 #undef P_LO
744 #undef P_C
745 	return dst;
746 }
747 
748 
749 /*
750  * Fetch configuration from the device, to cope with dynamic
751  * reconfigurations after loading the module.
752  */
753 /* call with NMG_LOCK held */
754 int
755 netmap_update_config(struct netmap_adapter *na)
756 {
757 	struct nm_config_info info;
758 
759 	bzero(&info, sizeof(info));
760 	if (na->nm_config == NULL ||
761 	    na->nm_config(na, &info)) {
762 		/* take whatever we had at init time */
763 		info.num_tx_rings = na->num_tx_rings;
764 		info.num_tx_descs = na->num_tx_desc;
765 		info.num_rx_rings = na->num_rx_rings;
766 		info.num_rx_descs = na->num_rx_desc;
767 		info.rx_buf_maxsize = na->rx_buf_maxsize;
768 	}
769 
770 	if (na->num_tx_rings == info.num_tx_rings &&
771 	    na->num_tx_desc == info.num_tx_descs &&
772 	    na->num_rx_rings == info.num_rx_rings &&
773 	    na->num_rx_desc == info.num_rx_descs &&
774 	    na->rx_buf_maxsize == info.rx_buf_maxsize)
775 		return 0; /* nothing changed */
776 	if (na->active_fds == 0) {
777 		na->num_tx_rings = info.num_tx_rings;
778 		na->num_tx_desc = info.num_tx_descs;
779 		na->num_rx_rings = info.num_rx_rings;
780 		na->num_rx_desc = info.num_rx_descs;
781 		na->rx_buf_maxsize = info.rx_buf_maxsize;
782 		if (netmap_verbose)
783 			nm_prinf("configuration changed for %s: txring %d x %d, "
784 				"rxring %d x %d, rxbufsz %d",
785 				na->name, na->num_tx_rings, na->num_tx_desc,
786 				na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
787 		return 0;
788 	}
789 	nm_prerr("WARNING: configuration changed for %s while active: "
790 		"txring %d x %d, rxring %d x %d, rxbufsz %d",
791 		na->name, info.num_tx_rings, info.num_tx_descs,
792 		info.num_rx_rings, info.num_rx_descs,
793 		info.rx_buf_maxsize);
794 	return 1;
795 }
796 
797 /* nm_sync callbacks for the host rings */
798 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
799 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
800 
801 /* create the krings array and initialize the fields common to all adapters.
802  * The array layout is this:
803  *
804  *                    +----------+
805  * na->tx_rings ----->|          | \
806  *                    |          |  } na->num_tx_ring
807  *                    |          | /
808  *                    +----------+
809  *                    |          |    host tx kring
810  * na->rx_rings ----> +----------+
811  *                    |          | \
812  *                    |          |  } na->num_rx_rings
813  *                    |          | /
814  *                    +----------+
815  *                    |          |    host rx kring
816  *                    +----------+
817  * na->tailroom ----->|          | \
818  *                    |          |  } tailroom bytes
819  *                    |          | /
820  *                    +----------+
821  *
822  * Note: for compatibility, host krings are created even when not needed.
823  * The tailroom space is currently used by vale ports for allocating leases.
824  */
825 /* call with NMG_LOCK held */
826 int
827 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
828 {
829 	u_int i, len, ndesc;
830 	struct netmap_kring *kring;
831 	u_int n[NR_TXRX];
832 	enum txrx t;
833 	int err = 0;
834 
835 	if (na->tx_rings != NULL) {
836 		if (netmap_debug & NM_DEBUG_ON)
837 			nm_prerr("warning: krings were already created");
838 		return 0;
839 	}
840 
841 	/* account for the (possibly fake) host rings */
842 	n[NR_TX] = netmap_all_rings(na, NR_TX);
843 	n[NR_RX] = netmap_all_rings(na, NR_RX);
844 
845 	len = (n[NR_TX] + n[NR_RX]) *
846 		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
847 		+ tailroom;
848 
849 	na->tx_rings = nm_os_malloc((size_t)len);
850 	if (na->tx_rings == NULL) {
851 		nm_prerr("Cannot allocate krings");
852 		return ENOMEM;
853 	}
854 	na->rx_rings = na->tx_rings + n[NR_TX];
855 	na->tailroom = na->rx_rings + n[NR_RX];
856 
857 	/* link the krings in the krings array */
858 	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
859 	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
860 		na->tx_rings[i] = kring;
861 		kring++;
862 	}
863 
864 	/*
865 	 * All fields in krings are 0 except the one initialized below.
866 	 * but better be explicit on important kring fields.
867 	 */
868 	for_rx_tx(t) {
869 		ndesc = nma_get_ndesc(na, t);
870 		for (i = 0; i < n[t]; i++) {
871 			kring = NMR(na, t)[i];
872 			bzero(kring, sizeof(*kring));
873 			kring->notify_na = na;
874 			kring->ring_id = i;
875 			kring->tx = t;
876 			kring->nkr_num_slots = ndesc;
877 			kring->nr_mode = NKR_NETMAP_OFF;
878 			kring->nr_pending_mode = NKR_NETMAP_OFF;
879 			if (i < nma_get_nrings(na, t)) {
880 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
881 			} else {
882 				if (!(na->na_flags & NAF_HOST_RINGS))
883 					kring->nr_kflags |= NKR_FAKERING;
884 				kring->nm_sync = (t == NR_TX ?
885 						netmap_txsync_to_host:
886 						netmap_rxsync_from_host);
887 			}
888 			kring->nm_notify = na->nm_notify;
889 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
890 			/*
891 			 * IMPORTANT: Always keep one slot empty.
892 			 */
893 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
894 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
895 					nm_txrx2str(t), i);
896 			nm_prdis("ktx %s h %d c %d t %d",
897 				kring->name, kring->rhead, kring->rcur, kring->rtail);
898 			err = nm_os_selinfo_init(&kring->si, kring->name);
899 			if (err) {
900 				netmap_krings_delete(na);
901 				return err;
902 			}
903 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
904 			kring->na = na;	/* setting this field marks the mutex as initialized */
905 		}
906 		err = nm_os_selinfo_init(&na->si[t], na->name);
907 		if (err) {
908 			netmap_krings_delete(na);
909 			return err;
910 		}
911 	}
912 
913 	return 0;
914 }
915 
916 
917 /* undo the actions performed by netmap_krings_create */
918 /* call with NMG_LOCK held */
919 void
920 netmap_krings_delete(struct netmap_adapter *na)
921 {
922 	struct netmap_kring **kring = na->tx_rings;
923 	enum txrx t;
924 
925 	if (na->tx_rings == NULL) {
926 		if (netmap_debug & NM_DEBUG_ON)
927 			nm_prerr("warning: krings were already deleted");
928 		return;
929 	}
930 
931 	for_rx_tx(t)
932 		nm_os_selinfo_uninit(&na->si[t]);
933 
934 	/* we rely on the krings layout described above */
935 	for ( ; kring != na->tailroom; kring++) {
936 		if ((*kring)->na != NULL)
937 			mtx_destroy(&(*kring)->q_lock);
938 		nm_os_selinfo_uninit(&(*kring)->si);
939 	}
940 	nm_os_free(na->tx_rings);
941 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
942 }
943 
944 
945 /*
946  * Destructor for NIC ports. They also have an mbuf queue
947  * on the rings connected to the host so we need to purge
948  * them first.
949  */
950 /* call with NMG_LOCK held */
951 void
952 netmap_hw_krings_delete(struct netmap_adapter *na)
953 {
954 	u_int lim = netmap_real_rings(na, NR_RX), i;
955 
956 	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
957 		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
958 		nm_prdis("destroy sw mbq with len %d", mbq_len(q));
959 		mbq_purge(q);
960 		mbq_safe_fini(q);
961 	}
962 	netmap_krings_delete(na);
963 }
964 
965 static void
966 netmap_mem_drop(struct netmap_adapter *na)
967 {
968 	int last = netmap_mem_deref(na->nm_mem, na);
969 	/* if the native allocator had been overrided on regif,
970 	 * restore it now and drop the temporary one
971 	 */
972 	if (last && na->nm_mem_prev) {
973 		netmap_mem_put(na->nm_mem);
974 		na->nm_mem = na->nm_mem_prev;
975 		na->nm_mem_prev = NULL;
976 	}
977 }
978 
979 /*
980  * Undo everything that was done in netmap_do_regif(). In particular,
981  * call nm_register(ifp,0) to stop netmap mode on the interface and
982  * revert to normal operation.
983  */
984 /* call with NMG_LOCK held */
985 static void netmap_unset_ringid(struct netmap_priv_d *);
986 static void netmap_krings_put(struct netmap_priv_d *);
987 void
988 netmap_do_unregif(struct netmap_priv_d *priv)
989 {
990 	struct netmap_adapter *na = priv->np_na;
991 
992 	NMG_LOCK_ASSERT();
993 	na->active_fds--;
994 	/* unset nr_pending_mode and possibly release exclusive mode */
995 	netmap_krings_put(priv);
996 
997 #ifdef	WITH_MONITOR
998 	/* XXX check whether we have to do something with monitor
999 	 * when rings change nr_mode. */
1000 	if (na->active_fds <= 0) {
1001 		/* walk through all the rings and tell any monitor
1002 		 * that the port is going to exit netmap mode
1003 		 */
1004 		netmap_monitor_stop(na);
1005 	}
1006 #endif
1007 
1008 	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
1009 		na->nm_register(na, 0);
1010 	}
1011 
1012 	/* delete rings and buffers that are no longer needed */
1013 	netmap_mem_rings_delete(na);
1014 
1015 	if (na->active_fds <= 0) {	/* last instance */
1016 		/*
1017 		 * (TO CHECK) We enter here
1018 		 * when the last reference to this file descriptor goes
1019 		 * away. This means we cannot have any pending poll()
1020 		 * or interrupt routine operating on the structure.
1021 		 * XXX The file may be closed in a thread while
1022 		 * another thread is using it.
1023 		 * Linux keeps the file opened until the last reference
1024 		 * by any outstanding ioctl/poll or mmap is gone.
1025 		 * FreeBSD does not track mmap()s (but we do) and
1026 		 * wakes up any sleeping poll(). Need to check what
1027 		 * happens if the close() occurs while a concurrent
1028 		 * syscall is running.
1029 		 */
1030 		if (netmap_debug & NM_DEBUG_ON)
1031 			nm_prinf("deleting last instance for %s", na->name);
1032 
1033 		if (nm_netmap_on(na)) {
1034 			nm_prerr("BUG: netmap on while going to delete the krings");
1035 		}
1036 
1037 		na->nm_krings_delete(na);
1038 	}
1039 
1040 	/* possibily decrement counter of tx_si/rx_si users */
1041 	netmap_unset_ringid(priv);
1042 	/* delete the nifp */
1043 	netmap_mem_if_delete(na, priv->np_nifp);
1044 	/* drop the allocator */
1045 	netmap_mem_drop(na);
1046 	/* mark the priv as unregistered */
1047 	priv->np_na = NULL;
1048 	priv->np_nifp = NULL;
1049 }
1050 
1051 struct netmap_priv_d*
1052 netmap_priv_new(void)
1053 {
1054 	struct netmap_priv_d *priv;
1055 
1056 	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1057 	if (priv == NULL)
1058 		return NULL;
1059 	priv->np_refs = 1;
1060 	nm_os_get_module();
1061 	return priv;
1062 }
1063 
1064 /*
1065  * Destructor of the netmap_priv_d, called when the fd is closed
1066  * Action: undo all the things done by NIOCREGIF,
1067  * On FreeBSD we need to track whether there are active mmap()s,
1068  * and we use np_active_mmaps for that. On linux, the field is always 0.
1069  * Return: 1 if we can free priv, 0 otherwise.
1070  *
1071  */
1072 /* call with NMG_LOCK held */
1073 void
1074 netmap_priv_delete(struct netmap_priv_d *priv)
1075 {
1076 	struct netmap_adapter *na = priv->np_na;
1077 
1078 	/* number of active references to this fd */
1079 	if (--priv->np_refs > 0) {
1080 		return;
1081 	}
1082 	nm_os_put_module();
1083 	if (na) {
1084 		netmap_do_unregif(priv);
1085 	}
1086 	netmap_unget_na(na, priv->np_ifp);
1087 	bzero(priv, sizeof(*priv));	/* for safety */
1088 	nm_os_free(priv);
1089 }
1090 
1091 
1092 /* call with NMG_LOCK *not* held */
1093 void
1094 netmap_dtor(void *data)
1095 {
1096 	struct netmap_priv_d *priv = data;
1097 
1098 	NMG_LOCK();
1099 	netmap_priv_delete(priv);
1100 	NMG_UNLOCK();
1101 }
1102 
1103 
1104 /*
1105  * Handlers for synchronization of the rings from/to the host stack.
1106  * These are associated to a network interface and are just another
1107  * ring pair managed by userspace.
1108  *
1109  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1110  * flags):
1111  *
1112  * - Before releasing buffers on hw RX rings, the application can mark
1113  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1114  *   will be forwarded to the host stack, similarly to what happened if
1115  *   the application moved them to the host TX ring.
1116  *
1117  * - Before releasing buffers on the host RX ring, the application can
1118  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1119  *   they will be forwarded to the hw TX rings, saving the application
1120  *   from doing the same task in user-space.
1121  *
1122  * Transparent fowarding can be enabled per-ring, by setting the NR_FORWARD
1123  * flag, or globally with the netmap_fwd sysctl.
1124  *
1125  * The transfer NIC --> host is relatively easy, just encapsulate
1126  * into mbufs and we are done. The host --> NIC side is slightly
1127  * harder because there might not be room in the tx ring so it
1128  * might take a while before releasing the buffer.
1129  */
1130 
1131 
1132 /*
1133  * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1134  * We do not need to lock because the queue is private.
1135  * After this call the queue is empty.
1136  */
1137 static void
1138 netmap_send_up(struct ifnet *dst, struct mbq *q)
1139 {
1140 	struct mbuf *m;
1141 	struct mbuf *head = NULL, *prev = NULL;
1142 
1143 	/* Send packets up, outside the lock; head/prev machinery
1144 	 * is only useful for Windows. */
1145 	while ((m = mbq_dequeue(q)) != NULL) {
1146 		if (netmap_debug & NM_DEBUG_HOST)
1147 			nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
1148 		prev = nm_os_send_up(dst, m, prev);
1149 		if (head == NULL)
1150 			head = prev;
1151 	}
1152 	if (head)
1153 		nm_os_send_up(dst, NULL, head);
1154 	mbq_fini(q);
1155 }
1156 
1157 
1158 /*
1159  * Scan the buffers from hwcur to ring->head, and put a copy of those
1160  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1161  * Drop remaining packets in the unlikely event
1162  * of an mbuf shortage.
1163  */
1164 static void
1165 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1166 {
1167 	u_int const lim = kring->nkr_num_slots - 1;
1168 	u_int const head = kring->rhead;
1169 	u_int n;
1170 	struct netmap_adapter *na = kring->na;
1171 
1172 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1173 		struct mbuf *m;
1174 		struct netmap_slot *slot = &kring->ring->slot[n];
1175 
1176 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1177 			continue;
1178 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1179 			nm_prlim(5, "bad pkt at %d len %d", n, slot->len);
1180 			continue;
1181 		}
1182 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1183 		/* XXX TODO: adapt to the case of a multisegment packet */
1184 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1185 
1186 		if (m == NULL)
1187 			break;
1188 		mbq_enqueue(q, m);
1189 	}
1190 }
1191 
1192 static inline int
1193 _nm_may_forward(struct netmap_kring *kring)
1194 {
1195 	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1196 		 kring->na->na_flags & NAF_HOST_RINGS &&
1197 		 kring->tx == NR_RX);
1198 }
1199 
1200 static inline int
1201 nm_may_forward_up(struct netmap_kring *kring)
1202 {
1203 	return	_nm_may_forward(kring) &&
1204 		 kring->ring_id != kring->na->num_rx_rings;
1205 }
1206 
1207 static inline int
1208 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1209 {
1210 	return	_nm_may_forward(kring) &&
1211 		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1212 		 kring->ring_id == kring->na->num_rx_rings;
1213 }
1214 
1215 /*
1216  * Send to the NIC rings packets marked NS_FORWARD between
1217  * kring->nr_hwcur and kring->rhead.
1218  * Called under kring->rx_queue.lock on the sw rx ring.
1219  *
1220  * It can only be called if the user opened all the TX hw rings,
1221  * see NAF_CAN_FORWARD_DOWN flag.
1222  * We can touch the TX netmap rings (slots, head and cur) since
1223  * we are in poll/ioctl system call context, and the application
1224  * is not supposed to touch the ring (using a different thread)
1225  * during the execution of the system call.
1226  */
1227 static u_int
1228 netmap_sw_to_nic(struct netmap_adapter *na)
1229 {
1230 	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1231 	struct netmap_slot *rxslot = kring->ring->slot;
1232 	u_int i, rxcur = kring->nr_hwcur;
1233 	u_int const head = kring->rhead;
1234 	u_int const src_lim = kring->nkr_num_slots - 1;
1235 	u_int sent = 0;
1236 
1237 	/* scan rings to find space, then fill as much as possible */
1238 	for (i = 0; i < na->num_tx_rings; i++) {
1239 		struct netmap_kring *kdst = na->tx_rings[i];
1240 		struct netmap_ring *rdst = kdst->ring;
1241 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1242 
1243 		/* XXX do we trust ring or kring->rcur,rtail ? */
1244 		for (; rxcur != head && !nm_ring_empty(rdst);
1245 		     rxcur = nm_next(rxcur, src_lim) ) {
1246 			struct netmap_slot *src, *dst, tmp;
1247 			u_int dst_head = rdst->head;
1248 
1249 			src = &rxslot[rxcur];
1250 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1251 				continue;
1252 
1253 			sent++;
1254 
1255 			dst = &rdst->slot[dst_head];
1256 
1257 			tmp = *src;
1258 
1259 			src->buf_idx = dst->buf_idx;
1260 			src->flags = NS_BUF_CHANGED;
1261 
1262 			dst->buf_idx = tmp.buf_idx;
1263 			dst->len = tmp.len;
1264 			dst->flags = NS_BUF_CHANGED;
1265 
1266 			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1267 		}
1268 		/* if (sent) XXX txsync ? it would be just an optimization */
1269 	}
1270 	return sent;
1271 }
1272 
1273 
1274 /*
1275  * netmap_txsync_to_host() passes packets up. We are called from a
1276  * system call in user process context, and the only contention
1277  * can be among multiple user threads erroneously calling
1278  * this routine concurrently.
1279  */
1280 static int
1281 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1282 {
1283 	struct netmap_adapter *na = kring->na;
1284 	u_int const lim = kring->nkr_num_slots - 1;
1285 	u_int const head = kring->rhead;
1286 	struct mbq q;
1287 
1288 	/* Take packets from hwcur to head and pass them up.
1289 	 * Force hwcur = head since netmap_grab_packets() stops at head
1290 	 */
1291 	mbq_init(&q);
1292 	netmap_grab_packets(kring, &q, 1 /* force */);
1293 	nm_prdis("have %d pkts in queue", mbq_len(&q));
1294 	kring->nr_hwcur = head;
1295 	kring->nr_hwtail = head + lim;
1296 	if (kring->nr_hwtail > lim)
1297 		kring->nr_hwtail -= lim + 1;
1298 
1299 	netmap_send_up(na->ifp, &q);
1300 	return 0;
1301 }
1302 
1303 
1304 /*
1305  * rxsync backend for packets coming from the host stack.
1306  * They have been put in kring->rx_queue by netmap_transmit().
1307  * We protect access to the kring using kring->rx_queue.lock
1308  *
1309  * also moves to the nic hw rings any packet the user has marked
1310  * for transparent-mode forwarding, then sets the NR_FORWARD
1311  * flag in the kring to let the caller push them out
1312  */
1313 static int
1314 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1315 {
1316 	struct netmap_adapter *na = kring->na;
1317 	struct netmap_ring *ring = kring->ring;
1318 	u_int nm_i, n;
1319 	u_int const lim = kring->nkr_num_slots - 1;
1320 	u_int const head = kring->rhead;
1321 	int ret = 0;
1322 	struct mbq *q = &kring->rx_queue, fq;
1323 
1324 	mbq_init(&fq); /* fq holds packets to be freed */
1325 
1326 	mbq_lock(q);
1327 
1328 	/* First part: import newly received packets */
1329 	n = mbq_len(q);
1330 	if (n) { /* grab packets from the queue */
1331 		struct mbuf *m;
1332 		uint32_t stop_i;
1333 
1334 		nm_i = kring->nr_hwtail;
1335 		stop_i = nm_prev(kring->nr_hwcur, lim);
1336 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1337 			int len = MBUF_LEN(m);
1338 			struct netmap_slot *slot = &ring->slot[nm_i];
1339 
1340 			m_copydata(m, 0, len, NMB(na, slot));
1341 			nm_prdis("nm %d len %d", nm_i, len);
1342 			if (netmap_debug & NM_DEBUG_HOST)
1343 				nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1344 
1345 			slot->len = len;
1346 			slot->flags = 0;
1347 			nm_i = nm_next(nm_i, lim);
1348 			mbq_enqueue(&fq, m);
1349 		}
1350 		kring->nr_hwtail = nm_i;
1351 	}
1352 
1353 	/*
1354 	 * Second part: skip past packets that userspace has released.
1355 	 */
1356 	nm_i = kring->nr_hwcur;
1357 	if (nm_i != head) { /* something was released */
1358 		if (nm_may_forward_down(kring, flags)) {
1359 			ret = netmap_sw_to_nic(na);
1360 			if (ret > 0) {
1361 				kring->nr_kflags |= NR_FORWARD;
1362 				ret = 0;
1363 			}
1364 		}
1365 		kring->nr_hwcur = head;
1366 	}
1367 
1368 	mbq_unlock(q);
1369 
1370 	mbq_purge(&fq);
1371 	mbq_fini(&fq);
1372 
1373 	return ret;
1374 }
1375 
1376 
1377 /* Get a netmap adapter for the port.
1378  *
1379  * If it is possible to satisfy the request, return 0
1380  * with *na containing the netmap adapter found.
1381  * Otherwise return an error code, with *na containing NULL.
1382  *
1383  * When the port is attached to a bridge, we always return
1384  * EBUSY.
1385  * Otherwise, if the port is already bound to a file descriptor,
1386  * then we unconditionally return the existing adapter into *na.
1387  * In all the other cases, we return (into *na) either native,
1388  * generic or NULL, according to the following table:
1389  *
1390  *					native_support
1391  * active_fds   dev.netmap.admode         YES     NO
1392  * -------------------------------------------------------
1393  *    >0              *                 NA(ifp) NA(ifp)
1394  *
1395  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1396  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1397  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1398  *
1399  */
1400 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1401 int
1402 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1403 {
1404 	/* generic support */
1405 	int i = netmap_admode;	/* Take a snapshot. */
1406 	struct netmap_adapter *prev_na;
1407 	int error = 0;
1408 
1409 	*na = NULL; /* default */
1410 
1411 	/* reset in case of invalid value */
1412 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1413 		i = netmap_admode = NETMAP_ADMODE_BEST;
1414 
1415 	if (NM_NA_VALID(ifp)) {
1416 		prev_na = NA(ifp);
1417 		/* If an adapter already exists, return it if
1418 		 * there are active file descriptors or if
1419 		 * netmap is not forced to use generic
1420 		 * adapters.
1421 		 */
1422 		if (NETMAP_OWNED_BY_ANY(prev_na)
1423 			|| i != NETMAP_ADMODE_GENERIC
1424 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1425 #ifdef WITH_PIPES
1426 			/* ugly, but we cannot allow an adapter switch
1427 			 * if some pipe is referring to this one
1428 			 */
1429 			|| prev_na->na_next_pipe > 0
1430 #endif
1431 		) {
1432 			*na = prev_na;
1433 			goto assign_mem;
1434 		}
1435 	}
1436 
1437 	/* If there isn't native support and netmap is not allowed
1438 	 * to use generic adapters, we cannot satisfy the request.
1439 	 */
1440 	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1441 		return EOPNOTSUPP;
1442 
1443 	/* Otherwise, create a generic adapter and return it,
1444 	 * saving the previously used netmap adapter, if any.
1445 	 *
1446 	 * Note that here 'prev_na', if not NULL, MUST be a
1447 	 * native adapter, and CANNOT be a generic one. This is
1448 	 * true because generic adapters are created on demand, and
1449 	 * destroyed when not used anymore. Therefore, if the adapter
1450 	 * currently attached to an interface 'ifp' is generic, it
1451 	 * must be that
1452 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1453 	 * Consequently, if NA(ifp) is generic, we will enter one of
1454 	 * the branches above. This ensures that we never override
1455 	 * a generic adapter with another generic adapter.
1456 	 */
1457 	error = generic_netmap_attach(ifp);
1458 	if (error)
1459 		return error;
1460 
1461 	*na = NA(ifp);
1462 
1463 assign_mem:
1464 	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1465 	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1466 		(*na)->nm_mem_prev = (*na)->nm_mem;
1467 		(*na)->nm_mem = netmap_mem_get(nmd);
1468 	}
1469 
1470 	return 0;
1471 }
1472 
1473 /*
1474  * MUST BE CALLED UNDER NMG_LOCK()
1475  *
1476  * Get a refcounted reference to a netmap adapter attached
1477  * to the interface specified by req.
1478  * This is always called in the execution of an ioctl().
1479  *
1480  * Return ENXIO if the interface specified by the request does
1481  * not exist, ENOTSUP if netmap is not supported by the interface,
1482  * EBUSY if the interface is already attached to a bridge,
1483  * EINVAL if parameters are invalid, ENOMEM if needed resources
1484  * could not be allocated.
1485  * If successful, hold a reference to the netmap adapter.
1486  *
1487  * If the interface specified by req is a system one, also keep
1488  * a reference to it and return a valid *ifp.
1489  */
1490 int
1491 netmap_get_na(struct nmreq_header *hdr,
1492 	      struct netmap_adapter **na, struct ifnet **ifp,
1493 	      struct netmap_mem_d *nmd, int create)
1494 {
1495 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1496 	int error = 0;
1497 	struct netmap_adapter *ret = NULL;
1498 	int nmd_ref = 0;
1499 
1500 	*na = NULL;     /* default return value */
1501 	*ifp = NULL;
1502 
1503 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1504 		return EINVAL;
1505 	}
1506 
1507 	if (req->nr_mode == NR_REG_PIPE_MASTER ||
1508 			req->nr_mode == NR_REG_PIPE_SLAVE) {
1509 		/* Do not accept deprecated pipe modes. */
1510 		nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1511 		return EINVAL;
1512 	}
1513 
1514 	NMG_LOCK_ASSERT();
1515 
1516 	/* if the request contain a memid, try to find the
1517 	 * corresponding memory region
1518 	 */
1519 	if (nmd == NULL && req->nr_mem_id) {
1520 		nmd = netmap_mem_find(req->nr_mem_id);
1521 		if (nmd == NULL)
1522 			return EINVAL;
1523 		/* keep the rereference */
1524 		nmd_ref = 1;
1525 	}
1526 
1527 	/* We cascade through all possible types of netmap adapter.
1528 	 * All netmap_get_*_na() functions return an error and an na,
1529 	 * with the following combinations:
1530 	 *
1531 	 * error    na
1532 	 *   0	   NULL		type doesn't match
1533 	 *  !0	   NULL		type matches, but na creation/lookup failed
1534 	 *   0	  !NULL		type matches and na created/found
1535 	 *  !0    !NULL		impossible
1536 	 */
1537 	error = netmap_get_null_na(hdr, na, nmd, create);
1538 	if (error || *na != NULL)
1539 		goto out;
1540 
1541 	/* try to see if this is a monitor port */
1542 	error = netmap_get_monitor_na(hdr, na, nmd, create);
1543 	if (error || *na != NULL)
1544 		goto out;
1545 
1546 	/* try to see if this is a pipe port */
1547 	error = netmap_get_pipe_na(hdr, na, nmd, create);
1548 	if (error || *na != NULL)
1549 		goto out;
1550 
1551 	/* try to see if this is a bridge port */
1552 	error = netmap_get_vale_na(hdr, na, nmd, create);
1553 	if (error)
1554 		goto out;
1555 
1556 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1557 		goto out;
1558 
1559 	/*
1560 	 * This must be a hardware na, lookup the name in the system.
1561 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1562 	 * This may still be a tap, a veth/epair, or even a
1563 	 * persistent VALE port.
1564 	 */
1565 	*ifp = ifunit_ref(hdr->nr_name);
1566 	if (*ifp == NULL) {
1567 		error = ENXIO;
1568 		goto out;
1569 	}
1570 
1571 	error = netmap_get_hw_na(*ifp, nmd, &ret);
1572 	if (error)
1573 		goto out;
1574 
1575 	*na = ret;
1576 	netmap_adapter_get(ret);
1577 
1578 out:
1579 	if (error) {
1580 		if (ret)
1581 			netmap_adapter_put(ret);
1582 		if (*ifp) {
1583 			if_rele(*ifp);
1584 			*ifp = NULL;
1585 		}
1586 	}
1587 	if (nmd_ref)
1588 		netmap_mem_put(nmd);
1589 
1590 	return error;
1591 }
1592 
1593 /* undo netmap_get_na() */
1594 void
1595 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1596 {
1597 	if (ifp)
1598 		if_rele(ifp);
1599 	if (na)
1600 		netmap_adapter_put(na);
1601 }
1602 
1603 
1604 #define NM_FAIL_ON(t) do {						\
1605 	if (unlikely(t)) {						\
1606 		nm_prlim(5, "%s: fail '" #t "' "				\
1607 			"h %d c %d t %d "				\
1608 			"rh %d rc %d rt %d "				\
1609 			"hc %d ht %d",					\
1610 			kring->name,					\
1611 			head, cur, ring->tail,				\
1612 			kring->rhead, kring->rcur, kring->rtail,	\
1613 			kring->nr_hwcur, kring->nr_hwtail);		\
1614 		return kring->nkr_num_slots;				\
1615 	}								\
1616 } while (0)
1617 
1618 /*
1619  * validate parameters on entry for *_txsync()
1620  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1621  * in case of error.
1622  *
1623  * rhead, rcur and rtail=hwtail are stored from previous round.
1624  * hwcur is the next packet to send to the ring.
1625  *
1626  * We want
1627  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1628  *
1629  * hwcur, rhead, rtail and hwtail are reliable
1630  */
1631 u_int
1632 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1633 {
1634 	u_int head = ring->head; /* read only once */
1635 	u_int cur = ring->cur; /* read only once */
1636 	u_int n = kring->nkr_num_slots;
1637 
1638 	nm_prdis(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1639 		kring->name,
1640 		kring->nr_hwcur, kring->nr_hwtail,
1641 		ring->head, ring->cur, ring->tail);
1642 #if 1 /* kernel sanity checks; but we can trust the kring. */
1643 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1644 	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1645 #endif /* kernel sanity checks */
1646 	/*
1647 	 * user sanity checks. We only use head,
1648 	 * A, B, ... are possible positions for head:
1649 	 *
1650 	 *  0    A  rhead   B  rtail   C  n-1
1651 	 *  0    D  rtail   E  rhead   F  n-1
1652 	 *
1653 	 * B, F, D are valid. A, C, E are wrong
1654 	 */
1655 	if (kring->rtail >= kring->rhead) {
1656 		/* want rhead <= head <= rtail */
1657 		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1658 		/* and also head <= cur <= rtail */
1659 		NM_FAIL_ON(cur < head || cur > kring->rtail);
1660 	} else { /* here rtail < rhead */
1661 		/* we need head outside rtail .. rhead */
1662 		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1663 
1664 		/* two cases now: head <= rtail or head >= rhead  */
1665 		if (head <= kring->rtail) {
1666 			/* want head <= cur <= rtail */
1667 			NM_FAIL_ON(cur < head || cur > kring->rtail);
1668 		} else { /* head >= rhead */
1669 			/* cur must be outside rtail..head */
1670 			NM_FAIL_ON(cur > kring->rtail && cur < head);
1671 		}
1672 	}
1673 	if (ring->tail != kring->rtail) {
1674 		nm_prlim(5, "%s tail overwritten was %d need %d", kring->name,
1675 			ring->tail, kring->rtail);
1676 		ring->tail = kring->rtail;
1677 	}
1678 	kring->rhead = head;
1679 	kring->rcur = cur;
1680 	return head;
1681 }
1682 
1683 
1684 /*
1685  * validate parameters on entry for *_rxsync()
1686  * Returns ring->head if ok, kring->nkr_num_slots on error.
1687  *
1688  * For a valid configuration,
1689  * hwcur <= head <= cur <= tail <= hwtail
1690  *
1691  * We only consider head and cur.
1692  * hwcur and hwtail are reliable.
1693  *
1694  */
1695 u_int
1696 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1697 {
1698 	uint32_t const n = kring->nkr_num_slots;
1699 	uint32_t head, cur;
1700 
1701 	nm_prdis(5,"%s kc %d kt %d h %d c %d t %d",
1702 		kring->name,
1703 		kring->nr_hwcur, kring->nr_hwtail,
1704 		ring->head, ring->cur, ring->tail);
1705 	/*
1706 	 * Before storing the new values, we should check they do not
1707 	 * move backwards. However:
1708 	 * - head is not an issue because the previous value is hwcur;
1709 	 * - cur could in principle go back, however it does not matter
1710 	 *   because we are processing a brand new rxsync()
1711 	 */
1712 	cur = kring->rcur = ring->cur;	/* read only once */
1713 	head = kring->rhead = ring->head;	/* read only once */
1714 #if 1 /* kernel sanity checks */
1715 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1716 #endif /* kernel sanity checks */
1717 	/* user sanity checks */
1718 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1719 		/* want hwcur <= rhead <= hwtail */
1720 		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1721 		/* and also rhead <= rcur <= hwtail */
1722 		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1723 	} else {
1724 		/* we need rhead outside hwtail..hwcur */
1725 		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1726 		/* two cases now: head <= hwtail or head >= hwcur  */
1727 		if (head <= kring->nr_hwtail) {
1728 			/* want head <= cur <= hwtail */
1729 			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1730 		} else {
1731 			/* cur must be outside hwtail..head */
1732 			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1733 		}
1734 	}
1735 	if (ring->tail != kring->rtail) {
1736 		nm_prlim(5, "%s tail overwritten was %d need %d",
1737 			kring->name,
1738 			ring->tail, kring->rtail);
1739 		ring->tail = kring->rtail;
1740 	}
1741 	return head;
1742 }
1743 
1744 
1745 /*
1746  * Error routine called when txsync/rxsync detects an error.
1747  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1748  * Return 1 on reinit.
1749  *
1750  * This routine is only called by the upper half of the kernel.
1751  * It only reads hwcur (which is changed only by the upper half, too)
1752  * and hwtail (which may be changed by the lower half, but only on
1753  * a tx ring and only to increase it, so any error will be recovered
1754  * on the next call). For the above, we don't strictly need to call
1755  * it under lock.
1756  */
1757 int
1758 netmap_ring_reinit(struct netmap_kring *kring)
1759 {
1760 	struct netmap_ring *ring = kring->ring;
1761 	u_int i, lim = kring->nkr_num_slots - 1;
1762 	int errors = 0;
1763 
1764 	// XXX KASSERT nm_kr_tryget
1765 	nm_prlim(10, "called for %s", kring->name);
1766 	// XXX probably wrong to trust userspace
1767 	kring->rhead = ring->head;
1768 	kring->rcur  = ring->cur;
1769 	kring->rtail = ring->tail;
1770 
1771 	if (ring->cur > lim)
1772 		errors++;
1773 	if (ring->head > lim)
1774 		errors++;
1775 	if (ring->tail > lim)
1776 		errors++;
1777 	for (i = 0; i <= lim; i++) {
1778 		u_int idx = ring->slot[i].buf_idx;
1779 		u_int len = ring->slot[i].len;
1780 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1781 			nm_prlim(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1782 			ring->slot[i].buf_idx = 0;
1783 			ring->slot[i].len = 0;
1784 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1785 			ring->slot[i].len = 0;
1786 			nm_prlim(5, "bad len at slot %d idx %d len %d", i, idx, len);
1787 		}
1788 	}
1789 	if (errors) {
1790 		nm_prlim(10, "total %d errors", errors);
1791 		nm_prlim(10, "%s reinit, cur %d -> %d tail %d -> %d",
1792 			kring->name,
1793 			ring->cur, kring->nr_hwcur,
1794 			ring->tail, kring->nr_hwtail);
1795 		ring->head = kring->rhead = kring->nr_hwcur;
1796 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1797 		ring->tail = kring->rtail = kring->nr_hwtail;
1798 	}
1799 	return (errors ? 1 : 0);
1800 }
1801 
1802 /* interpret the ringid and flags fields of an nmreq, by translating them
1803  * into a pair of intervals of ring indices:
1804  *
1805  * [priv->np_txqfirst, priv->np_txqlast) and
1806  * [priv->np_rxqfirst, priv->np_rxqlast)
1807  *
1808  */
1809 int
1810 netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1811 			uint16_t nr_ringid, uint64_t nr_flags)
1812 {
1813 	struct netmap_adapter *na = priv->np_na;
1814 	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1815 	enum txrx t;
1816 	u_int j;
1817 
1818 	for_rx_tx(t) {
1819 		if (nr_flags & excluded_direction[t]) {
1820 			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1821 			continue;
1822 		}
1823 		switch (nr_mode) {
1824 		case NR_REG_ALL_NIC:
1825 		case NR_REG_NULL:
1826 			priv->np_qfirst[t] = 0;
1827 			priv->np_qlast[t] = nma_get_nrings(na, t);
1828 			nm_prdis("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1829 				priv->np_qfirst[t], priv->np_qlast[t]);
1830 			break;
1831 		case NR_REG_SW:
1832 		case NR_REG_NIC_SW:
1833 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1834 				nm_prerr("host rings not supported");
1835 				return EINVAL;
1836 			}
1837 			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1838 				nma_get_nrings(na, t) : 0);
1839 			priv->np_qlast[t] = netmap_all_rings(na, t);
1840 			nm_prdis("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1841 				nm_txrx2str(t),
1842 				priv->np_qfirst[t], priv->np_qlast[t]);
1843 			break;
1844 		case NR_REG_ONE_NIC:
1845 			if (nr_ringid >= na->num_tx_rings &&
1846 					nr_ringid >= na->num_rx_rings) {
1847 				nm_prerr("invalid ring id %d", nr_ringid);
1848 				return EINVAL;
1849 			}
1850 			/* if not enough rings, use the first one */
1851 			j = nr_ringid;
1852 			if (j >= nma_get_nrings(na, t))
1853 				j = 0;
1854 			priv->np_qfirst[t] = j;
1855 			priv->np_qlast[t] = j + 1;
1856 			nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t),
1857 				priv->np_qfirst[t], priv->np_qlast[t]);
1858 			break;
1859 		default:
1860 			nm_prerr("invalid regif type %d", nr_mode);
1861 			return EINVAL;
1862 		}
1863 	}
1864 	priv->np_flags = nr_flags;
1865 
1866 	/* Allow transparent forwarding mode in the host --> nic
1867 	 * direction only if all the TX hw rings have been opened. */
1868 	if (priv->np_qfirst[NR_TX] == 0 &&
1869 			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1870 		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1871 	}
1872 
1873 	if (netmap_verbose) {
1874 		nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
1875 			na->name,
1876 			priv->np_qfirst[NR_TX],
1877 			priv->np_qlast[NR_TX],
1878 			priv->np_qfirst[NR_RX],
1879 			priv->np_qlast[NR_RX],
1880 			nr_ringid);
1881 	}
1882 	return 0;
1883 }
1884 
1885 
1886 /*
1887  * Set the ring ID. For devices with a single queue, a request
1888  * for all rings is the same as a single ring.
1889  */
1890 static int
1891 netmap_set_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1892 		uint16_t nr_ringid, uint64_t nr_flags)
1893 {
1894 	struct netmap_adapter *na = priv->np_na;
1895 	int error;
1896 	enum txrx t;
1897 
1898 	error = netmap_interp_ringid(priv, nr_mode, nr_ringid, nr_flags);
1899 	if (error) {
1900 		return error;
1901 	}
1902 
1903 	priv->np_txpoll = (nr_flags & NR_NO_TX_POLL) ? 0 : 1;
1904 
1905 	/* optimization: count the users registered for more than
1906 	 * one ring, which are the ones sleeping on the global queue.
1907 	 * The default netmap_notify() callback will then
1908 	 * avoid signaling the global queue if nobody is using it
1909 	 */
1910 	for_rx_tx(t) {
1911 		if (nm_si_user(priv, t))
1912 			na->si_users[t]++;
1913 	}
1914 	return 0;
1915 }
1916 
1917 static void
1918 netmap_unset_ringid(struct netmap_priv_d *priv)
1919 {
1920 	struct netmap_adapter *na = priv->np_na;
1921 	enum txrx t;
1922 
1923 	for_rx_tx(t) {
1924 		if (nm_si_user(priv, t))
1925 			na->si_users[t]--;
1926 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1927 	}
1928 	priv->np_flags = 0;
1929 	priv->np_txpoll = 0;
1930 	priv->np_kloop_state = 0;
1931 }
1932 
1933 
1934 /* Set the nr_pending_mode for the requested rings.
1935  * If requested, also try to get exclusive access to the rings, provided
1936  * the rings we want to bind are not exclusively owned by a previous bind.
1937  */
1938 static int
1939 netmap_krings_get(struct netmap_priv_d *priv)
1940 {
1941 	struct netmap_adapter *na = priv->np_na;
1942 	u_int i;
1943 	struct netmap_kring *kring;
1944 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1945 	enum txrx t;
1946 
1947 	if (netmap_debug & NM_DEBUG_ON)
1948 		nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
1949 			na->name,
1950 			priv->np_qfirst[NR_TX],
1951 			priv->np_qlast[NR_TX],
1952 			priv->np_qfirst[NR_RX],
1953 			priv->np_qlast[NR_RX]);
1954 
1955 	/* first round: check that all the requested rings
1956 	 * are neither alread exclusively owned, nor we
1957 	 * want exclusive ownership when they are already in use
1958 	 */
1959 	for_rx_tx(t) {
1960 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1961 			kring = NMR(na, t)[i];
1962 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1963 			    (kring->users && excl))
1964 			{
1965 				nm_prdis("ring %s busy", kring->name);
1966 				return EBUSY;
1967 			}
1968 		}
1969 	}
1970 
1971 	/* second round: increment usage count (possibly marking them
1972 	 * as exclusive) and set the nr_pending_mode
1973 	 */
1974 	for_rx_tx(t) {
1975 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1976 			kring = NMR(na, t)[i];
1977 			kring->users++;
1978 			if (excl)
1979 				kring->nr_kflags |= NKR_EXCLUSIVE;
1980 	                kring->nr_pending_mode = NKR_NETMAP_ON;
1981 		}
1982 	}
1983 
1984 	return 0;
1985 
1986 }
1987 
1988 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1989  * if was asked on regif, and unset the nr_pending_mode if we are the
1990  * last users of the involved rings. */
1991 static void
1992 netmap_krings_put(struct netmap_priv_d *priv)
1993 {
1994 	struct netmap_adapter *na = priv->np_na;
1995 	u_int i;
1996 	struct netmap_kring *kring;
1997 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1998 	enum txrx t;
1999 
2000 	nm_prdis("%s: releasing tx [%d, %d) rx [%d, %d)",
2001 			na->name,
2002 			priv->np_qfirst[NR_TX],
2003 			priv->np_qlast[NR_TX],
2004 			priv->np_qfirst[NR_RX],
2005 			priv->np_qlast[MR_RX]);
2006 
2007 	for_rx_tx(t) {
2008 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2009 			kring = NMR(na, t)[i];
2010 			if (excl)
2011 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
2012 			kring->users--;
2013 			if (kring->users == 0)
2014 				kring->nr_pending_mode = NKR_NETMAP_OFF;
2015 		}
2016 	}
2017 }
2018 
2019 static int
2020 nm_priv_rx_enabled(struct netmap_priv_d *priv)
2021 {
2022 	return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2023 }
2024 
2025 /* Validate the CSB entries for both directions (atok and ktoa).
2026  * To be called under NMG_LOCK(). */
2027 static int
2028 netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2029 {
2030 	struct nm_csb_atok *csb_atok_base =
2031 		(struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2032 	struct nm_csb_ktoa *csb_ktoa_base =
2033 		(struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2034 	enum txrx t;
2035 	int num_rings[NR_TXRX], tot_rings;
2036 	size_t entry_size[2];
2037 	void *csb_start[2];
2038 	int i;
2039 
2040 	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2041 		nm_prerr("Cannot update CSB while kloop is running");
2042 		return EBUSY;
2043 	}
2044 
2045 	tot_rings = 0;
2046 	for_rx_tx(t) {
2047 		num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2048 		tot_rings += num_rings[t];
2049 	}
2050 	if (tot_rings <= 0)
2051 		return 0;
2052 
2053 	if (!(priv->np_flags & NR_EXCLUSIVE)) {
2054 		nm_prerr("CSB mode requires NR_EXCLUSIVE");
2055 		return EINVAL;
2056 	}
2057 
2058 	entry_size[0] = sizeof(*csb_atok_base);
2059 	entry_size[1] = sizeof(*csb_ktoa_base);
2060 	csb_start[0] = (void *)csb_atok_base;
2061 	csb_start[1] = (void *)csb_ktoa_base;
2062 
2063 	for (i = 0; i < 2; i++) {
2064 		/* On Linux we could use access_ok() to simplify
2065 		 * the validation. However, the advantage of
2066 		 * this approach is that it works also on
2067 		 * FreeBSD. */
2068 		size_t csb_size = tot_rings * entry_size[i];
2069 		void *tmp;
2070 		int err;
2071 
2072 		if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2073 			nm_prerr("Unaligned CSB address");
2074 			return EINVAL;
2075 		}
2076 
2077 		tmp = nm_os_malloc(csb_size);
2078 		if (!tmp)
2079 			return ENOMEM;
2080 		if (i == 0) {
2081 			/* Application --> kernel direction. */
2082 			err = copyin(csb_start[i], tmp, csb_size);
2083 		} else {
2084 			/* Kernel --> application direction. */
2085 			memset(tmp, 0, csb_size);
2086 			err = copyout(tmp, csb_start[i], csb_size);
2087 		}
2088 		nm_os_free(tmp);
2089 		if (err) {
2090 			nm_prerr("Invalid CSB address");
2091 			return err;
2092 		}
2093 	}
2094 
2095 	priv->np_csb_atok_base = csb_atok_base;
2096 	priv->np_csb_ktoa_base = csb_ktoa_base;
2097 
2098 	/* Initialize the CSB. */
2099 	for_rx_tx(t) {
2100 		for (i = 0; i < num_rings[t]; i++) {
2101 			struct netmap_kring *kring =
2102 				NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2103 			struct nm_csb_atok *csb_atok = csb_atok_base + i;
2104 			struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2105 
2106 			if (t == NR_RX) {
2107 				csb_atok += num_rings[NR_TX];
2108 				csb_ktoa += num_rings[NR_TX];
2109 			}
2110 
2111 			CSB_WRITE(csb_atok, head, kring->rhead);
2112 			CSB_WRITE(csb_atok, cur, kring->rcur);
2113 			CSB_WRITE(csb_atok, appl_need_kick, 1);
2114 			CSB_WRITE(csb_atok, sync_flags, 1);
2115 			CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2116 			CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2117 			CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2118 
2119 			nm_prinf("csb_init for kring %s: head %u, cur %u, "
2120 				"hwcur %u, hwtail %u", kring->name,
2121 				kring->rhead, kring->rcur, kring->nr_hwcur,
2122 				kring->nr_hwtail);
2123 		}
2124 	}
2125 
2126 	return 0;
2127 }
2128 
2129 /* Ensure that the netmap adapter can support the given MTU.
2130  * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
2131  */
2132 int
2133 netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
2134 	unsigned nbs = NETMAP_BUF_SIZE(na);
2135 
2136 	if (mtu <= na->rx_buf_maxsize) {
2137 		/* The MTU fits a single NIC slot. We only
2138 		 * Need to check that netmap buffers are
2139 		 * large enough to hold an MTU. NS_MOREFRAG
2140 		 * cannot be used in this case. */
2141 		if (nbs < mtu) {
2142 			nm_prerr("error: netmap buf size (%u) "
2143 				 "< device MTU (%u)", nbs, mtu);
2144 			return EINVAL;
2145 		}
2146 	} else {
2147 		/* More NIC slots may be needed to receive
2148 		 * or transmit a single packet. Check that
2149 		 * the adapter supports NS_MOREFRAG and that
2150 		 * netmap buffers are large enough to hold
2151 		 * the maximum per-slot size. */
2152 		if (!(na->na_flags & NAF_MOREFRAG)) {
2153 			nm_prerr("error: large MTU (%d) needed "
2154 				 "but %s does not support "
2155 				 "NS_MOREFRAG", mtu,
2156 				 na->ifp->if_xname);
2157 			return EINVAL;
2158 		} else if (nbs < na->rx_buf_maxsize) {
2159 			nm_prerr("error: using NS_MOREFRAG on "
2160 				 "%s requires netmap buf size "
2161 				 ">= %u", na->ifp->if_xname,
2162 				 na->rx_buf_maxsize);
2163 			return EINVAL;
2164 		} else {
2165 			nm_prinf("info: netmap application on "
2166 				 "%s needs to support "
2167 				 "NS_MOREFRAG "
2168 				 "(MTU=%u,netmap_buf_size=%u)",
2169 				 na->ifp->if_xname, mtu, nbs);
2170 		}
2171 	}
2172 	return 0;
2173 }
2174 
2175 
2176 /*
2177  * possibly move the interface to netmap-mode.
2178  * If success it returns a pointer to netmap_if, otherwise NULL.
2179  * This must be called with NMG_LOCK held.
2180  *
2181  * The following na callbacks are called in the process:
2182  *
2183  * na->nm_config()			[by netmap_update_config]
2184  * (get current number and size of rings)
2185  *
2186  *  	We have a generic one for linux (netmap_linux_config).
2187  *  	The bwrap has to override this, since it has to forward
2188  *  	the request to the wrapped adapter (netmap_bwrap_config).
2189  *
2190  *
2191  * na->nm_krings_create()
2192  * (create and init the krings array)
2193  *
2194  * 	One of the following:
2195  *
2196  *	* netmap_hw_krings_create, 			(hw ports)
2197  *		creates the standard layout for the krings
2198  * 		and adds the mbq (used for the host rings).
2199  *
2200  * 	* netmap_vp_krings_create			(VALE ports)
2201  * 		add leases and scratchpads
2202  *
2203  * 	* netmap_pipe_krings_create			(pipes)
2204  * 		create the krings and rings of both ends and
2205  * 		cross-link them
2206  *
2207  *      * netmap_monitor_krings_create 			(monitors)
2208  *      	avoid allocating the mbq
2209  *
2210  *      * netmap_bwrap_krings_create			(bwraps)
2211  *      	create both the brap krings array,
2212  *      	the krings array of the wrapped adapter, and
2213  *      	(if needed) the fake array for the host adapter
2214  *
2215  * na->nm_register(, 1)
2216  * (put the adapter in netmap mode)
2217  *
2218  * 	This may be one of the following:
2219  *
2220  * 	* netmap_hw_reg				        (hw ports)
2221  * 		checks that the ifp is still there, then calls
2222  * 		the hardware specific callback;
2223  *
2224  * 	* netmap_vp_reg					(VALE ports)
2225  *		If the port is connected to a bridge,
2226  *		set the NAF_NETMAP_ON flag under the
2227  *		bridge write lock.
2228  *
2229  *	* netmap_pipe_reg				(pipes)
2230  *		inform the other pipe end that it is no
2231  *		longer responsible for the lifetime of this
2232  *		pipe end
2233  *
2234  *	* netmap_monitor_reg				(monitors)
2235  *		intercept the sync callbacks of the monitored
2236  *		rings
2237  *
2238  *	* netmap_bwrap_reg				(bwraps)
2239  *		cross-link the bwrap and hwna rings,
2240  *		forward the request to the hwna, override
2241  *		the hwna notify callback (to get the frames
2242  *		coming from outside go through the bridge).
2243  *
2244  *
2245  */
2246 int
2247 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2248 	uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags)
2249 {
2250 	struct netmap_if *nifp = NULL;
2251 	int error;
2252 
2253 	NMG_LOCK_ASSERT();
2254 	priv->np_na = na;     /* store the reference */
2255 	error = netmap_mem_finalize(na->nm_mem, na);
2256 	if (error)
2257 		goto err;
2258 
2259 	if (na->active_fds == 0) {
2260 
2261 		/* cache the allocator info in the na */
2262 		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2263 		if (error)
2264 			goto err_drop_mem;
2265 		nm_prdis("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2266 					    na->na_lut.objsize);
2267 
2268 		/* ring configuration may have changed, fetch from the card */
2269 		netmap_update_config(na);
2270 	}
2271 
2272 	/* compute the range of tx and rx rings to monitor */
2273 	error = netmap_set_ringid(priv, nr_mode, nr_ringid, nr_flags);
2274 	if (error)
2275 		goto err_put_lut;
2276 
2277 	if (na->active_fds == 0) {
2278 		/*
2279 		 * If this is the first registration of the adapter,
2280 		 * perform sanity checks and create the in-kernel view
2281 		 * of the netmap rings (the netmap krings).
2282 		 */
2283 		if (na->ifp && nm_priv_rx_enabled(priv)) {
2284 			/* This netmap adapter is attached to an ifnet. */
2285 			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2286 
2287 			nm_prdis("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2288 				na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
2289 
2290 			if (na->rx_buf_maxsize == 0) {
2291 				nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2292 				error = EIO;
2293 				goto err_drop_mem;
2294 			}
2295 
2296 			error = netmap_buf_size_validate(na, mtu);
2297 			if (error)
2298 				goto err_drop_mem;
2299 		}
2300 
2301 		/*
2302 		 * Depending on the adapter, this may also create
2303 		 * the netmap rings themselves
2304 		 */
2305 		error = na->nm_krings_create(na);
2306 		if (error)
2307 			goto err_put_lut;
2308 
2309 	}
2310 
2311 	/* now the krings must exist and we can check whether some
2312 	 * previous bind has exclusive ownership on them, and set
2313 	 * nr_pending_mode
2314 	 */
2315 	error = netmap_krings_get(priv);
2316 	if (error)
2317 		goto err_del_krings;
2318 
2319 	/* create all needed missing netmap rings */
2320 	error = netmap_mem_rings_create(na);
2321 	if (error)
2322 		goto err_rel_excl;
2323 
2324 	/* in all cases, create a new netmap if */
2325 	nifp = netmap_mem_if_new(na, priv);
2326 	if (nifp == NULL) {
2327 		error = ENOMEM;
2328 		goto err_rel_excl;
2329 	}
2330 
2331 	if (nm_kring_pending(priv)) {
2332 		/* Some kring is switching mode, tell the adapter to
2333 		 * react on this. */
2334 		error = na->nm_register(na, 1);
2335 		if (error)
2336 			goto err_del_if;
2337 	}
2338 
2339 	/* Commit the reference. */
2340 	na->active_fds++;
2341 
2342 	/*
2343 	 * advertise that the interface is ready by setting np_nifp.
2344 	 * The barrier is needed because readers (poll, *SYNC and mmap)
2345 	 * check for priv->np_nifp != NULL without locking
2346 	 */
2347 	mb(); /* make sure previous writes are visible to all CPUs */
2348 	priv->np_nifp = nifp;
2349 
2350 	return 0;
2351 
2352 err_del_if:
2353 	netmap_mem_if_delete(na, nifp);
2354 err_rel_excl:
2355 	netmap_krings_put(priv);
2356 	netmap_mem_rings_delete(na);
2357 err_del_krings:
2358 	if (na->active_fds == 0)
2359 		na->nm_krings_delete(na);
2360 err_put_lut:
2361 	if (na->active_fds == 0)
2362 		memset(&na->na_lut, 0, sizeof(na->na_lut));
2363 err_drop_mem:
2364 	netmap_mem_drop(na);
2365 err:
2366 	priv->np_na = NULL;
2367 	return error;
2368 }
2369 
2370 
2371 /*
2372  * update kring and ring at the end of rxsync/txsync.
2373  */
2374 static inline void
2375 nm_sync_finalize(struct netmap_kring *kring)
2376 {
2377 	/*
2378 	 * Update ring tail to what the kernel knows
2379 	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2380 	 * if no carrier.
2381 	 */
2382 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2383 
2384 	nm_prdis(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2385 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2386 		kring->rhead, kring->rcur, kring->rtail);
2387 }
2388 
2389 /* set ring timestamp */
2390 static inline void
2391 ring_timestamp_set(struct netmap_ring *ring)
2392 {
2393 	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2394 		microtime(&ring->ts);
2395 	}
2396 }
2397 
2398 static int nmreq_copyin(struct nmreq_header *, int);
2399 static int nmreq_copyout(struct nmreq_header *, int);
2400 static int nmreq_checkoptions(struct nmreq_header *);
2401 
2402 /*
2403  * ioctl(2) support for the "netmap" device.
2404  *
2405  * Following a list of accepted commands:
2406  * - NIOCCTRL		device control API
2407  * - NIOCTXSYNC		sync TX rings
2408  * - NIOCRXSYNC		sync RX rings
2409  * - SIOCGIFADDR	just for convenience
2410  * - NIOCGINFO		deprecated (legacy API)
2411  * - NIOCREGIF		deprecated (legacy API)
2412  *
2413  * Return 0 on success, errno otherwise.
2414  */
2415 int
2416 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2417 		struct thread *td, int nr_body_is_user)
2418 {
2419 	struct mbq q;	/* packets from RX hw queues to host stack */
2420 	struct netmap_adapter *na = NULL;
2421 	struct netmap_mem_d *nmd = NULL;
2422 	struct ifnet *ifp = NULL;
2423 	int error = 0;
2424 	u_int i, qfirst, qlast;
2425 	struct netmap_kring **krings;
2426 	int sync_flags;
2427 	enum txrx t;
2428 
2429 	switch (cmd) {
2430 	case NIOCCTRL: {
2431 		struct nmreq_header *hdr = (struct nmreq_header *)data;
2432 
2433 		if (hdr->nr_version < NETMAP_MIN_API ||
2434 		    hdr->nr_version > NETMAP_MAX_API) {
2435 			nm_prerr("API mismatch: got %d need %d",
2436 				hdr->nr_version, NETMAP_API);
2437 			return EINVAL;
2438 		}
2439 
2440 		/* Make a kernel-space copy of the user-space nr_body.
2441 		 * For convenince, the nr_body pointer and the pointers
2442 		 * in the options list will be replaced with their
2443 		 * kernel-space counterparts. The original pointers are
2444 		 * saved internally and later restored by nmreq_copyout
2445 		 */
2446 		error = nmreq_copyin(hdr, nr_body_is_user);
2447 		if (error) {
2448 			return error;
2449 		}
2450 
2451 		/* Sanitize hdr->nr_name. */
2452 		hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2453 
2454 		switch (hdr->nr_reqtype) {
2455 		case NETMAP_REQ_REGISTER: {
2456 			struct nmreq_register *req =
2457 				(struct nmreq_register *)(uintptr_t)hdr->nr_body;
2458 			struct netmap_if *nifp;
2459 
2460 			/* Protect access to priv from concurrent requests. */
2461 			NMG_LOCK();
2462 			do {
2463 				struct nmreq_option *opt;
2464 				u_int memflags;
2465 
2466 				if (priv->np_nifp != NULL) {	/* thread already registered */
2467 					error = EBUSY;
2468 					break;
2469 				}
2470 
2471 #ifdef WITH_EXTMEM
2472 				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2473 						NETMAP_REQ_OPT_EXTMEM);
2474 				if (opt != NULL) {
2475 					struct nmreq_opt_extmem *e =
2476 						(struct nmreq_opt_extmem *)opt;
2477 
2478 					error = nmreq_checkduplicate(opt);
2479 					if (error) {
2480 						opt->nro_status = error;
2481 						break;
2482 					}
2483 					nmd = netmap_mem_ext_create(e->nro_usrptr,
2484 							&e->nro_info, &error);
2485 					opt->nro_status = error;
2486 					if (nmd == NULL)
2487 						break;
2488 				}
2489 #endif /* WITH_EXTMEM */
2490 
2491 				if (nmd == NULL && req->nr_mem_id) {
2492 					/* find the allocator and get a reference */
2493 					nmd = netmap_mem_find(req->nr_mem_id);
2494 					if (nmd == NULL) {
2495 						if (netmap_verbose) {
2496 							nm_prerr("%s: failed to find mem_id %u",
2497 									hdr->nr_name, req->nr_mem_id);
2498 						}
2499 						error = EINVAL;
2500 						break;
2501 					}
2502 				}
2503 				/* find the interface and a reference */
2504 				error = netmap_get_na(hdr, &na, &ifp, nmd,
2505 						      1 /* create */); /* keep reference */
2506 				if (error)
2507 					break;
2508 				if (NETMAP_OWNED_BY_KERN(na)) {
2509 					error = EBUSY;
2510 					break;
2511 				}
2512 
2513 				if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2514 					nm_prerr("virt_hdr_len=%d, but application does "
2515 						"not accept it", na->virt_hdr_len);
2516 					error = EIO;
2517 					break;
2518 				}
2519 
2520 				error = netmap_do_regif(priv, na, req->nr_mode,
2521 							req->nr_ringid, req->nr_flags);
2522 				if (error) {    /* reg. failed, release priv and ref */
2523 					break;
2524 				}
2525 
2526 				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2527 							NETMAP_REQ_OPT_CSB);
2528 				if (opt != NULL) {
2529 					struct nmreq_opt_csb *csbo =
2530 						(struct nmreq_opt_csb *)opt;
2531 					error = nmreq_checkduplicate(opt);
2532 					if (!error) {
2533 						error = netmap_csb_validate(priv, csbo);
2534 					}
2535 					opt->nro_status = error;
2536 					if (error) {
2537 						netmap_do_unregif(priv);
2538 						break;
2539 					}
2540 				}
2541 
2542 				nifp = priv->np_nifp;
2543 
2544 				/* return the offset of the netmap_if object */
2545 				req->nr_rx_rings = na->num_rx_rings;
2546 				req->nr_tx_rings = na->num_tx_rings;
2547 				req->nr_rx_slots = na->num_rx_desc;
2548 				req->nr_tx_slots = na->num_tx_desc;
2549 				error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2550 					&req->nr_mem_id);
2551 				if (error) {
2552 					netmap_do_unregif(priv);
2553 					break;
2554 				}
2555 				if (memflags & NETMAP_MEM_PRIVATE) {
2556 					*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2557 				}
2558 				for_rx_tx(t) {
2559 					priv->np_si[t] = nm_si_user(priv, t) ?
2560 						&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2561 				}
2562 
2563 				if (req->nr_extra_bufs) {
2564 					if (netmap_verbose)
2565 						nm_prinf("requested %d extra buffers",
2566 							req->nr_extra_bufs);
2567 					req->nr_extra_bufs = netmap_extra_alloc(na,
2568 						&nifp->ni_bufs_head, req->nr_extra_bufs);
2569 					if (netmap_verbose)
2570 						nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2571 				}
2572 				req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2573 
2574 				error = nmreq_checkoptions(hdr);
2575 				if (error) {
2576 					netmap_do_unregif(priv);
2577 					break;
2578 				}
2579 
2580 				/* store ifp reference so that priv destructor may release it */
2581 				priv->np_ifp = ifp;
2582 			} while (0);
2583 			if (error) {
2584 				netmap_unget_na(na, ifp);
2585 			}
2586 			/* release the reference from netmap_mem_find() or
2587 			 * netmap_mem_ext_create()
2588 			 */
2589 			if (nmd)
2590 				netmap_mem_put(nmd);
2591 			NMG_UNLOCK();
2592 			break;
2593 		}
2594 
2595 		case NETMAP_REQ_PORT_INFO_GET: {
2596 			struct nmreq_port_info_get *req =
2597 				(struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2598 
2599 			NMG_LOCK();
2600 			do {
2601 				u_int memflags;
2602 
2603 				if (hdr->nr_name[0] != '\0') {
2604 					/* Build a nmreq_register out of the nmreq_port_info_get,
2605 					 * so that we can call netmap_get_na(). */
2606 					struct nmreq_register regreq;
2607 					bzero(&regreq, sizeof(regreq));
2608 					regreq.nr_mode = NR_REG_ALL_NIC;
2609 					regreq.nr_tx_slots = req->nr_tx_slots;
2610 					regreq.nr_rx_slots = req->nr_rx_slots;
2611 					regreq.nr_tx_rings = req->nr_tx_rings;
2612 					regreq.nr_rx_rings = req->nr_rx_rings;
2613 					regreq.nr_mem_id = req->nr_mem_id;
2614 
2615 					/* get a refcount */
2616 					hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2617 					hdr->nr_body = (uintptr_t)&regreq;
2618 					error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2619 					hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2620 					hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2621 					if (error) {
2622 						na = NULL;
2623 						ifp = NULL;
2624 						break;
2625 					}
2626 					nmd = na->nm_mem; /* get memory allocator */
2627 				} else {
2628 					nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2629 					if (nmd == NULL) {
2630 						if (netmap_verbose)
2631 							nm_prerr("%s: failed to find mem_id %u",
2632 									hdr->nr_name,
2633 									req->nr_mem_id ? req->nr_mem_id : 1);
2634 						error = EINVAL;
2635 						break;
2636 					}
2637 				}
2638 
2639 				error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2640 					&req->nr_mem_id);
2641 				if (error)
2642 					break;
2643 				if (na == NULL) /* only memory info */
2644 					break;
2645 				netmap_update_config(na);
2646 				req->nr_rx_rings = na->num_rx_rings;
2647 				req->nr_tx_rings = na->num_tx_rings;
2648 				req->nr_rx_slots = na->num_rx_desc;
2649 				req->nr_tx_slots = na->num_tx_desc;
2650 			} while (0);
2651 			netmap_unget_na(na, ifp);
2652 			NMG_UNLOCK();
2653 			break;
2654 		}
2655 #ifdef WITH_VALE
2656 		case NETMAP_REQ_VALE_ATTACH: {
2657 			error = netmap_vale_attach(hdr, NULL /* userspace request */);
2658 			break;
2659 		}
2660 
2661 		case NETMAP_REQ_VALE_DETACH: {
2662 			error = netmap_vale_detach(hdr, NULL /* userspace request */);
2663 			break;
2664 		}
2665 
2666 		case NETMAP_REQ_VALE_LIST: {
2667 			error = netmap_vale_list(hdr);
2668 			break;
2669 		}
2670 
2671 		case NETMAP_REQ_PORT_HDR_SET: {
2672 			struct nmreq_port_hdr *req =
2673 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2674 			/* Build a nmreq_register out of the nmreq_port_hdr,
2675 			 * so that we can call netmap_get_bdg_na(). */
2676 			struct nmreq_register regreq;
2677 			bzero(&regreq, sizeof(regreq));
2678 			regreq.nr_mode = NR_REG_ALL_NIC;
2679 
2680 			/* For now we only support virtio-net headers, and only for
2681 			 * VALE ports, but this may change in future. Valid lengths
2682 			 * for the virtio-net header are 0 (no header), 10 and 12. */
2683 			if (req->nr_hdr_len != 0 &&
2684 				req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2685 					req->nr_hdr_len != 12) {
2686 				if (netmap_verbose)
2687 					nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
2688 				error = EINVAL;
2689 				break;
2690 			}
2691 			NMG_LOCK();
2692 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2693 			hdr->nr_body = (uintptr_t)&regreq;
2694 			error = netmap_get_vale_na(hdr, &na, NULL, 0);
2695 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2696 			hdr->nr_body = (uintptr_t)req;
2697 			if (na && !error) {
2698 				struct netmap_vp_adapter *vpna =
2699 					(struct netmap_vp_adapter *)na;
2700 				na->virt_hdr_len = req->nr_hdr_len;
2701 				if (na->virt_hdr_len) {
2702 					vpna->mfs = NETMAP_BUF_SIZE(na);
2703 				}
2704 				if (netmap_verbose)
2705 					nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2706 				netmap_adapter_put(na);
2707 			} else if (!na) {
2708 				error = ENXIO;
2709 			}
2710 			NMG_UNLOCK();
2711 			break;
2712 		}
2713 
2714 		case NETMAP_REQ_PORT_HDR_GET: {
2715 			/* Get vnet-header length for this netmap port */
2716 			struct nmreq_port_hdr *req =
2717 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2718 			/* Build a nmreq_register out of the nmreq_port_hdr,
2719 			 * so that we can call netmap_get_bdg_na(). */
2720 			struct nmreq_register regreq;
2721 			struct ifnet *ifp;
2722 
2723 			bzero(&regreq, sizeof(regreq));
2724 			regreq.nr_mode = NR_REG_ALL_NIC;
2725 			NMG_LOCK();
2726 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2727 			hdr->nr_body = (uintptr_t)&regreq;
2728 			error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2729 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2730 			hdr->nr_body = (uintptr_t)req;
2731 			if (na && !error) {
2732 				req->nr_hdr_len = na->virt_hdr_len;
2733 			}
2734 			netmap_unget_na(na, ifp);
2735 			NMG_UNLOCK();
2736 			break;
2737 		}
2738 
2739 		case NETMAP_REQ_VALE_NEWIF: {
2740 			error = nm_vi_create(hdr);
2741 			break;
2742 		}
2743 
2744 		case NETMAP_REQ_VALE_DELIF: {
2745 			error = nm_vi_destroy(hdr->nr_name);
2746 			break;
2747 		}
2748 
2749 		case NETMAP_REQ_VALE_POLLING_ENABLE:
2750 		case NETMAP_REQ_VALE_POLLING_DISABLE: {
2751 			error = nm_bdg_polling(hdr);
2752 			break;
2753 		}
2754 #endif  /* WITH_VALE */
2755 		case NETMAP_REQ_POOLS_INFO_GET: {
2756 			/* Get information from the memory allocator used for
2757 			 * hdr->nr_name. */
2758 			struct nmreq_pools_info *req =
2759 				(struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2760 			NMG_LOCK();
2761 			do {
2762 				/* Build a nmreq_register out of the nmreq_pools_info,
2763 				 * so that we can call netmap_get_na(). */
2764 				struct nmreq_register regreq;
2765 				bzero(&regreq, sizeof(regreq));
2766 				regreq.nr_mem_id = req->nr_mem_id;
2767 				regreq.nr_mode = NR_REG_ALL_NIC;
2768 
2769 				hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2770 				hdr->nr_body = (uintptr_t)&regreq;
2771 				error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2772 				hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
2773 				hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2774 				if (error) {
2775 					na = NULL;
2776 					ifp = NULL;
2777 					break;
2778 				}
2779 				nmd = na->nm_mem; /* grab the memory allocator */
2780 				if (nmd == NULL) {
2781 					error = EINVAL;
2782 					break;
2783 				}
2784 
2785 				/* Finalize the memory allocator, get the pools
2786 				 * information and release the allocator. */
2787 				error = netmap_mem_finalize(nmd, na);
2788 				if (error) {
2789 					break;
2790 				}
2791 				error = netmap_mem_pools_info_get(req, nmd);
2792 				netmap_mem_drop(na);
2793 			} while (0);
2794 			netmap_unget_na(na, ifp);
2795 			NMG_UNLOCK();
2796 			break;
2797 		}
2798 
2799 		case NETMAP_REQ_CSB_ENABLE: {
2800 			struct nmreq_option *opt;
2801 
2802 			opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2803 						NETMAP_REQ_OPT_CSB);
2804 			if (opt == NULL) {
2805 				error = EINVAL;
2806 			} else {
2807 				struct nmreq_opt_csb *csbo =
2808 					(struct nmreq_opt_csb *)opt;
2809 				error = nmreq_checkduplicate(opt);
2810 				if (!error) {
2811 					NMG_LOCK();
2812 					error = netmap_csb_validate(priv, csbo);
2813 					NMG_UNLOCK();
2814 				}
2815 				opt->nro_status = error;
2816 			}
2817 			break;
2818 		}
2819 
2820 		case NETMAP_REQ_SYNC_KLOOP_START: {
2821 			error = netmap_sync_kloop(priv, hdr);
2822 			break;
2823 		}
2824 
2825 		case NETMAP_REQ_SYNC_KLOOP_STOP: {
2826 			error = netmap_sync_kloop_stop(priv);
2827 			break;
2828 		}
2829 
2830 		default: {
2831 			error = EINVAL;
2832 			break;
2833 		}
2834 		}
2835 		/* Write back request body to userspace and reset the
2836 		 * user-space pointer. */
2837 		error = nmreq_copyout(hdr, error);
2838 		break;
2839 	}
2840 
2841 	case NIOCTXSYNC:
2842 	case NIOCRXSYNC: {
2843 		if (unlikely(priv->np_nifp == NULL)) {
2844 			error = ENXIO;
2845 			break;
2846 		}
2847 		mb(); /* make sure following reads are not from cache */
2848 
2849 		if (unlikely(priv->np_csb_atok_base)) {
2850 			nm_prerr("Invalid sync in CSB mode");
2851 			error = EBUSY;
2852 			break;
2853 		}
2854 
2855 		na = priv->np_na;      /* we have a reference */
2856 
2857 		mbq_init(&q);
2858 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2859 		krings = NMR(na, t);
2860 		qfirst = priv->np_qfirst[t];
2861 		qlast = priv->np_qlast[t];
2862 		sync_flags = priv->np_sync_flags;
2863 
2864 		for (i = qfirst; i < qlast; i++) {
2865 			struct netmap_kring *kring = krings[i];
2866 			struct netmap_ring *ring = kring->ring;
2867 
2868 			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2869 				error = (error ? EIO : 0);
2870 				continue;
2871 			}
2872 
2873 			if (cmd == NIOCTXSYNC) {
2874 				if (netmap_debug & NM_DEBUG_TXSYNC)
2875 					nm_prinf("pre txsync ring %d cur %d hwcur %d",
2876 					    i, ring->cur,
2877 					    kring->nr_hwcur);
2878 				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2879 					netmap_ring_reinit(kring);
2880 				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2881 					nm_sync_finalize(kring);
2882 				}
2883 				if (netmap_debug & NM_DEBUG_TXSYNC)
2884 					nm_prinf("post txsync ring %d cur %d hwcur %d",
2885 					    i, ring->cur,
2886 					    kring->nr_hwcur);
2887 			} else {
2888 				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2889 					netmap_ring_reinit(kring);
2890 				}
2891 				if (nm_may_forward_up(kring)) {
2892 					/* transparent forwarding, see netmap_poll() */
2893 					netmap_grab_packets(kring, &q, netmap_fwd);
2894 				}
2895 				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2896 					nm_sync_finalize(kring);
2897 				}
2898 				ring_timestamp_set(ring);
2899 			}
2900 			nm_kr_put(kring);
2901 		}
2902 
2903 		if (mbq_peek(&q)) {
2904 			netmap_send_up(na->ifp, &q);
2905 		}
2906 
2907 		break;
2908 	}
2909 
2910 	default: {
2911 		return netmap_ioctl_legacy(priv, cmd, data, td);
2912 		break;
2913 	}
2914 	}
2915 
2916 	return (error);
2917 }
2918 
2919 size_t
2920 nmreq_size_by_type(uint16_t nr_reqtype)
2921 {
2922 	switch (nr_reqtype) {
2923 	case NETMAP_REQ_REGISTER:
2924 		return sizeof(struct nmreq_register);
2925 	case NETMAP_REQ_PORT_INFO_GET:
2926 		return sizeof(struct nmreq_port_info_get);
2927 	case NETMAP_REQ_VALE_ATTACH:
2928 		return sizeof(struct nmreq_vale_attach);
2929 	case NETMAP_REQ_VALE_DETACH:
2930 		return sizeof(struct nmreq_vale_detach);
2931 	case NETMAP_REQ_VALE_LIST:
2932 		return sizeof(struct nmreq_vale_list);
2933 	case NETMAP_REQ_PORT_HDR_SET:
2934 	case NETMAP_REQ_PORT_HDR_GET:
2935 		return sizeof(struct nmreq_port_hdr);
2936 	case NETMAP_REQ_VALE_NEWIF:
2937 		return sizeof(struct nmreq_vale_newif);
2938 	case NETMAP_REQ_VALE_DELIF:
2939 	case NETMAP_REQ_SYNC_KLOOP_STOP:
2940 	case NETMAP_REQ_CSB_ENABLE:
2941 		return 0;
2942 	case NETMAP_REQ_VALE_POLLING_ENABLE:
2943 	case NETMAP_REQ_VALE_POLLING_DISABLE:
2944 		return sizeof(struct nmreq_vale_polling);
2945 	case NETMAP_REQ_POOLS_INFO_GET:
2946 		return sizeof(struct nmreq_pools_info);
2947 	case NETMAP_REQ_SYNC_KLOOP_START:
2948 		return sizeof(struct nmreq_sync_kloop_start);
2949 	}
2950 	return 0;
2951 }
2952 
2953 static size_t
2954 nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
2955 {
2956 	size_t rv = sizeof(struct nmreq_option);
2957 #ifdef NETMAP_REQ_OPT_DEBUG
2958 	if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
2959 		return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
2960 #endif /* NETMAP_REQ_OPT_DEBUG */
2961 	switch (nro_reqtype) {
2962 #ifdef WITH_EXTMEM
2963 	case NETMAP_REQ_OPT_EXTMEM:
2964 		rv = sizeof(struct nmreq_opt_extmem);
2965 		break;
2966 #endif /* WITH_EXTMEM */
2967 	case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
2968 		if (nro_size >= rv)
2969 			rv = nro_size;
2970 		break;
2971 	case NETMAP_REQ_OPT_CSB:
2972 		rv = sizeof(struct nmreq_opt_csb);
2973 		break;
2974 	case NETMAP_REQ_OPT_SYNC_KLOOP_MODE:
2975 		rv = sizeof(struct nmreq_opt_sync_kloop_mode);
2976 		break;
2977 	}
2978 	/* subtract the common header */
2979 	return rv - sizeof(struct nmreq_option);
2980 }
2981 
2982 int
2983 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
2984 {
2985 	size_t rqsz, optsz, bufsz;
2986 	int error;
2987 	char *ker = NULL, *p;
2988 	struct nmreq_option **next, *src;
2989 	struct nmreq_option buf;
2990 	uint64_t *ptrs;
2991 
2992 	if (hdr->nr_reserved) {
2993 		if (netmap_verbose)
2994 			nm_prerr("nr_reserved must be zero");
2995 		return EINVAL;
2996 	}
2997 
2998 	if (!nr_body_is_user)
2999 		return 0;
3000 
3001 	hdr->nr_reserved = nr_body_is_user;
3002 
3003 	/* compute the total size of the buffer */
3004 	rqsz = nmreq_size_by_type(hdr->nr_reqtype);
3005 	if (rqsz > NETMAP_REQ_MAXSIZE) {
3006 		error = EMSGSIZE;
3007 		goto out_err;
3008 	}
3009 	if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
3010 		(!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
3011 		/* Request body expected, but not found; or
3012 		 * request body found but unexpected. */
3013 		if (netmap_verbose)
3014 			nm_prerr("nr_body expected but not found, or vice versa");
3015 		error = EINVAL;
3016 		goto out_err;
3017 	}
3018 
3019 	bufsz = 2 * sizeof(void *) + rqsz;
3020 	optsz = 0;
3021 	for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
3022 	     src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
3023 	{
3024 		error = copyin(src, &buf, sizeof(*src));
3025 		if (error)
3026 			goto out_err;
3027 		optsz += sizeof(*src);
3028 		optsz += nmreq_opt_size_by_type(buf.nro_reqtype, buf.nro_size);
3029 		if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
3030 			error = EMSGSIZE;
3031 			goto out_err;
3032 		}
3033 		bufsz += optsz + sizeof(void *);
3034 	}
3035 
3036 	ker = nm_os_malloc(bufsz);
3037 	if (ker == NULL) {
3038 		error = ENOMEM;
3039 		goto out_err;
3040 	}
3041 	p = ker;
3042 
3043 	/* make a copy of the user pointers */
3044 	ptrs = (uint64_t*)p;
3045 	*ptrs++ = hdr->nr_body;
3046 	*ptrs++ = hdr->nr_options;
3047 	p = (char *)ptrs;
3048 
3049 	/* copy the body */
3050 	error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
3051 	if (error)
3052 		goto out_restore;
3053 	/* overwrite the user pointer with the in-kernel one */
3054 	hdr->nr_body = (uintptr_t)p;
3055 	p += rqsz;
3056 
3057 	/* copy the options */
3058 	next = (struct nmreq_option **)&hdr->nr_options;
3059 	src = *next;
3060 	while (src) {
3061 		struct nmreq_option *opt;
3062 
3063 		/* copy the option header */
3064 		ptrs = (uint64_t *)p;
3065 		opt = (struct nmreq_option *)(ptrs + 1);
3066 		error = copyin(src, opt, sizeof(*src));
3067 		if (error)
3068 			goto out_restore;
3069 		/* make a copy of the user next pointer */
3070 		*ptrs = opt->nro_next;
3071 		/* overwrite the user pointer with the in-kernel one */
3072 		*next = opt;
3073 
3074 		/* initialize the option as not supported.
3075 		 * Recognized options will update this field.
3076 		 */
3077 		opt->nro_status = EOPNOTSUPP;
3078 
3079 		p = (char *)(opt + 1);
3080 
3081 		/* copy the option body */
3082 		optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3083 						opt->nro_size);
3084 		if (optsz) {
3085 			/* the option body follows the option header */
3086 			error = copyin(src + 1, p, optsz);
3087 			if (error)
3088 				goto out_restore;
3089 			p += optsz;
3090 		}
3091 
3092 		/* move to next option */
3093 		next = (struct nmreq_option **)&opt->nro_next;
3094 		src = *next;
3095 	}
3096 	return 0;
3097 
3098 out_restore:
3099 	ptrs = (uint64_t *)ker;
3100 	hdr->nr_body = *ptrs++;
3101 	hdr->nr_options = *ptrs++;
3102 	hdr->nr_reserved = 0;
3103 	nm_os_free(ker);
3104 out_err:
3105 	return error;
3106 }
3107 
3108 static int
3109 nmreq_copyout(struct nmreq_header *hdr, int rerror)
3110 {
3111 	struct nmreq_option *src, *dst;
3112 	void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3113 	uint64_t *ptrs;
3114 	size_t bodysz;
3115 	int error;
3116 
3117 	if (!hdr->nr_reserved)
3118 		return rerror;
3119 
3120 	/* restore the user pointers in the header */
3121 	ptrs = (uint64_t *)ker - 2;
3122 	bufstart = ptrs;
3123 	hdr->nr_body = *ptrs++;
3124 	src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3125 	hdr->nr_options = *ptrs;
3126 
3127 	if (!rerror) {
3128 		/* copy the body */
3129 		bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3130 		error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3131 		if (error) {
3132 			rerror = error;
3133 			goto out;
3134 		}
3135 	}
3136 
3137 	/* copy the options */
3138 	dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3139 	while (src) {
3140 		size_t optsz;
3141 		uint64_t next;
3142 
3143 		/* restore the user pointer */
3144 		next = src->nro_next;
3145 		ptrs = (uint64_t *)src - 1;
3146 		src->nro_next = *ptrs;
3147 
3148 		/* always copy the option header */
3149 		error = copyout(src, dst, sizeof(*src));
3150 		if (error) {
3151 			rerror = error;
3152 			goto out;
3153 		}
3154 
3155 		/* copy the option body only if there was no error */
3156 		if (!rerror && !src->nro_status) {
3157 			optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3158 							src->nro_size);
3159 			if (optsz) {
3160 				error = copyout(src + 1, dst + 1, optsz);
3161 				if (error) {
3162 					rerror = error;
3163 					goto out;
3164 				}
3165 			}
3166 		}
3167 		src = (struct nmreq_option *)(uintptr_t)next;
3168 		dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3169 	}
3170 
3171 
3172 out:
3173 	hdr->nr_reserved = 0;
3174 	nm_os_free(bufstart);
3175 	return rerror;
3176 }
3177 
3178 struct nmreq_option *
3179 nmreq_findoption(struct nmreq_option *opt, uint16_t reqtype)
3180 {
3181 	for ( ; opt; opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3182 		if (opt->nro_reqtype == reqtype)
3183 			return opt;
3184 	return NULL;
3185 }
3186 
3187 int
3188 nmreq_checkduplicate(struct nmreq_option *opt) {
3189 	uint16_t type = opt->nro_reqtype;
3190 	int dup = 0;
3191 
3192 	while ((opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)opt->nro_next,
3193 			type))) {
3194 		dup++;
3195 		opt->nro_status = EINVAL;
3196 	}
3197 	return (dup ? EINVAL : 0);
3198 }
3199 
3200 static int
3201 nmreq_checkoptions(struct nmreq_header *hdr)
3202 {
3203 	struct nmreq_option *opt;
3204 	/* return error if there is still any option
3205 	 * marked as not supported
3206 	 */
3207 
3208 	for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3209 	     opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3210 		if (opt->nro_status == EOPNOTSUPP)
3211 			return EOPNOTSUPP;
3212 
3213 	return 0;
3214 }
3215 
3216 /*
3217  * select(2) and poll(2) handlers for the "netmap" device.
3218  *
3219  * Can be called for one or more queues.
3220  * Return true the event mask corresponding to ready events.
3221  * If there are no ready events (and 'sr' is not NULL), do a
3222  * selrecord on either individual selinfo or on the global one.
3223  * Device-dependent parts (locking and sync of tx/rx rings)
3224  * are done through callbacks.
3225  *
3226  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
3227  * The first one is remapped to pwait as selrecord() uses the name as an
3228  * hidden argument.
3229  */
3230 int
3231 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3232 {
3233 	struct netmap_adapter *na;
3234 	struct netmap_kring *kring;
3235 	struct netmap_ring *ring;
3236 	u_int i, want[NR_TXRX], revents = 0;
3237 	NM_SELINFO_T *si[NR_TXRX];
3238 #define want_tx want[NR_TX]
3239 #define want_rx want[NR_RX]
3240 	struct mbq q;	/* packets from RX hw queues to host stack */
3241 
3242 	/*
3243 	 * In order to avoid nested locks, we need to "double check"
3244 	 * txsync and rxsync if we decide to do a selrecord().
3245 	 * retry_tx (and retry_rx, later) prevent looping forever.
3246 	 */
3247 	int retry_tx = 1, retry_rx = 1;
3248 
3249 	/* Transparent mode: send_down is 1 if we have found some
3250 	 * packets to forward (host RX ring --> NIC) during the rx
3251 	 * scan and we have not sent them down to the NIC yet.
3252 	 * Transparent mode requires to bind all rings to a single
3253 	 * file descriptor.
3254 	 */
3255 	int send_down = 0;
3256 	int sync_flags = priv->np_sync_flags;
3257 
3258 	mbq_init(&q);
3259 
3260 	if (unlikely(priv->np_nifp == NULL)) {
3261 		return POLLERR;
3262 	}
3263 	mb(); /* make sure following reads are not from cache */
3264 
3265 	na = priv->np_na;
3266 
3267 	if (unlikely(!nm_netmap_on(na)))
3268 		return POLLERR;
3269 
3270 	if (unlikely(priv->np_csb_atok_base)) {
3271 		nm_prerr("Invalid poll in CSB mode");
3272 		return POLLERR;
3273 	}
3274 
3275 	if (netmap_debug & NM_DEBUG_ON)
3276 		nm_prinf("device %s events 0x%x", na->name, events);
3277 	want_tx = events & (POLLOUT | POLLWRNORM);
3278 	want_rx = events & (POLLIN | POLLRDNORM);
3279 
3280 	/*
3281 	 * If the card has more than one queue AND the file descriptor is
3282 	 * bound to all of them, we sleep on the "global" selinfo, otherwise
3283 	 * we sleep on individual selinfo (FreeBSD only allows two selinfo's
3284 	 * per file descriptor).
3285 	 * The interrupt routine in the driver wake one or the other
3286 	 * (or both) depending on which clients are active.
3287 	 *
3288 	 * rxsync() is only called if we run out of buffers on a POLLIN.
3289 	 * txsync() is called if we run out of buffers on POLLOUT, or
3290 	 * there are pending packets to send. The latter can be disabled
3291 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
3292 	 */
3293 	si[NR_RX] = priv->np_si[NR_RX];
3294 	si[NR_TX] = priv->np_si[NR_TX];
3295 
3296 #ifdef __FreeBSD__
3297 	/*
3298 	 * We start with a lock free round which is cheap if we have
3299 	 * slots available. If this fails, then lock and call the sync
3300 	 * routines. We can't do this on Linux, as the contract says
3301 	 * that we must call nm_os_selrecord() unconditionally.
3302 	 */
3303 	if (want_tx) {
3304 		const enum txrx t = NR_TX;
3305 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3306 			kring = NMR(na, t)[i];
3307 			if (kring->ring->cur != kring->ring->tail) {
3308 				/* Some unseen TX space is available, so what
3309 				 * we don't need to run txsync. */
3310 				revents |= want[t];
3311 				want[t] = 0;
3312 				break;
3313 			}
3314 		}
3315 	}
3316 	if (want_rx) {
3317 		const enum txrx t = NR_RX;
3318 		int rxsync_needed = 0;
3319 
3320 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3321 			kring = NMR(na, t)[i];
3322 			if (kring->ring->cur == kring->ring->tail
3323 				|| kring->rhead != kring->ring->head) {
3324 				/* There are no unseen packets on this ring,
3325 				 * or there are some buffers to be returned
3326 				 * to the netmap port. We therefore go ahead
3327 				 * and run rxsync. */
3328 				rxsync_needed = 1;
3329 				break;
3330 			}
3331 		}
3332 		if (!rxsync_needed) {
3333 			revents |= want_rx;
3334 			want_rx = 0;
3335 		}
3336 	}
3337 #endif
3338 
3339 #ifdef linux
3340 	/* The selrecord must be unconditional on linux. */
3341 	nm_os_selrecord(sr, si[NR_RX]);
3342 	nm_os_selrecord(sr, si[NR_TX]);
3343 #endif /* linux */
3344 
3345 	/*
3346 	 * If we want to push packets out (priv->np_txpoll) or
3347 	 * want_tx is still set, we must issue txsync calls
3348 	 * (on all rings, to avoid that the tx rings stall).
3349 	 * Fortunately, normal tx mode has np_txpoll set.
3350 	 */
3351 	if (priv->np_txpoll || want_tx) {
3352 		/*
3353 		 * The first round checks if anyone is ready, if not
3354 		 * do a selrecord and another round to handle races.
3355 		 * want_tx goes to 0 if any space is found, and is
3356 		 * used to skip rings with no pending transmissions.
3357 		 */
3358 flush_tx:
3359 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3360 			int found = 0;
3361 
3362 			kring = na->tx_rings[i];
3363 			ring = kring->ring;
3364 
3365 			/*
3366 			 * Don't try to txsync this TX ring if we already found some
3367 			 * space in some of the TX rings (want_tx == 0) and there are no
3368 			 * TX slots in this ring that need to be flushed to the NIC
3369 			 * (head == hwcur).
3370 			 */
3371 			if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3372 				continue;
3373 
3374 			if (nm_kr_tryget(kring, 1, &revents))
3375 				continue;
3376 
3377 			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3378 				netmap_ring_reinit(kring);
3379 				revents |= POLLERR;
3380 			} else {
3381 				if (kring->nm_sync(kring, sync_flags))
3382 					revents |= POLLERR;
3383 				else
3384 					nm_sync_finalize(kring);
3385 			}
3386 
3387 			/*
3388 			 * If we found new slots, notify potential
3389 			 * listeners on the same ring.
3390 			 * Since we just did a txsync, look at the copies
3391 			 * of cur,tail in the kring.
3392 			 */
3393 			found = kring->rcur != kring->rtail;
3394 			nm_kr_put(kring);
3395 			if (found) { /* notify other listeners */
3396 				revents |= want_tx;
3397 				want_tx = 0;
3398 #ifndef linux
3399 				kring->nm_notify(kring, 0);
3400 #endif /* linux */
3401 			}
3402 		}
3403 		/* if there were any packet to forward we must have handled them by now */
3404 		send_down = 0;
3405 		if (want_tx && retry_tx && sr) {
3406 #ifndef linux
3407 			nm_os_selrecord(sr, si[NR_TX]);
3408 #endif /* !linux */
3409 			retry_tx = 0;
3410 			goto flush_tx;
3411 		}
3412 	}
3413 
3414 	/*
3415 	 * If want_rx is still set scan receive rings.
3416 	 * Do it on all rings because otherwise we starve.
3417 	 */
3418 	if (want_rx) {
3419 		/* two rounds here for race avoidance */
3420 do_retry_rx:
3421 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3422 			int found = 0;
3423 
3424 			kring = na->rx_rings[i];
3425 			ring = kring->ring;
3426 
3427 			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3428 				continue;
3429 
3430 			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3431 				netmap_ring_reinit(kring);
3432 				revents |= POLLERR;
3433 			}
3434 			/* now we can use kring->rcur, rtail */
3435 
3436 			/*
3437 			 * transparent mode support: collect packets from
3438 			 * hw rxring(s) that have been released by the user
3439 			 */
3440 			if (nm_may_forward_up(kring)) {
3441 				netmap_grab_packets(kring, &q, netmap_fwd);
3442 			}
3443 
3444 			/* Clear the NR_FORWARD flag anyway, it may be set by
3445 			 * the nm_sync() below only on for the host RX ring (see
3446 			 * netmap_rxsync_from_host()). */
3447 			kring->nr_kflags &= ~NR_FORWARD;
3448 			if (kring->nm_sync(kring, sync_flags))
3449 				revents |= POLLERR;
3450 			else
3451 				nm_sync_finalize(kring);
3452 			send_down |= (kring->nr_kflags & NR_FORWARD);
3453 			ring_timestamp_set(ring);
3454 			found = kring->rcur != kring->rtail;
3455 			nm_kr_put(kring);
3456 			if (found) {
3457 				revents |= want_rx;
3458 				retry_rx = 0;
3459 #ifndef linux
3460 				kring->nm_notify(kring, 0);
3461 #endif /* linux */
3462 			}
3463 		}
3464 
3465 #ifndef linux
3466 		if (retry_rx && sr) {
3467 			nm_os_selrecord(sr, si[NR_RX]);
3468 		}
3469 #endif /* !linux */
3470 		if (send_down || retry_rx) {
3471 			retry_rx = 0;
3472 			if (send_down)
3473 				goto flush_tx; /* and retry_rx */
3474 			else
3475 				goto do_retry_rx;
3476 		}
3477 	}
3478 
3479 	/*
3480 	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3481 	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3482 	 * to the host stack.
3483 	 */
3484 
3485 	if (mbq_peek(&q)) {
3486 		netmap_send_up(na->ifp, &q);
3487 	}
3488 
3489 	return (revents);
3490 #undef want_tx
3491 #undef want_rx
3492 }
3493 
3494 int
3495 nma_intr_enable(struct netmap_adapter *na, int onoff)
3496 {
3497 	bool changed = false;
3498 	enum txrx t;
3499 	int i;
3500 
3501 	for_rx_tx(t) {
3502 		for (i = 0; i < nma_get_nrings(na, t); i++) {
3503 			struct netmap_kring *kring = NMR(na, t)[i];
3504 			int on = !(kring->nr_kflags & NKR_NOINTR);
3505 
3506 			if (!!onoff != !!on) {
3507 				changed = true;
3508 			}
3509 			if (onoff) {
3510 				kring->nr_kflags &= ~NKR_NOINTR;
3511 			} else {
3512 				kring->nr_kflags |= NKR_NOINTR;
3513 			}
3514 		}
3515 	}
3516 
3517 	if (!changed) {
3518 		return 0; /* nothing to do */
3519 	}
3520 
3521 	if (!na->nm_intr) {
3522 		nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3523 		  na->name);
3524 		return -1;
3525 	}
3526 
3527 	na->nm_intr(na, onoff);
3528 
3529 	return 0;
3530 }
3531 
3532 
3533 /*-------------------- driver support routines -------------------*/
3534 
3535 /* default notify callback */
3536 static int
3537 netmap_notify(struct netmap_kring *kring, int flags)
3538 {
3539 	struct netmap_adapter *na = kring->notify_na;
3540 	enum txrx t = kring->tx;
3541 
3542 	nm_os_selwakeup(&kring->si);
3543 	/* optimization: avoid a wake up on the global
3544 	 * queue if nobody has registered for more
3545 	 * than one ring
3546 	 */
3547 	if (na->si_users[t] > 0)
3548 		nm_os_selwakeup(&na->si[t]);
3549 
3550 	return NM_IRQ_COMPLETED;
3551 }
3552 
3553 /* called by all routines that create netmap_adapters.
3554  * provide some defaults and get a reference to the
3555  * memory allocator
3556  */
3557 int
3558 netmap_attach_common(struct netmap_adapter *na)
3559 {
3560 	if (!na->rx_buf_maxsize) {
3561 		/* Set a conservative default (larger is safer). */
3562 		na->rx_buf_maxsize = PAGE_SIZE;
3563 	}
3564 
3565 #ifdef __FreeBSD__
3566 	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3567 		na->if_input = na->ifp->if_input; /* for netmap_send_up */
3568 	}
3569 	na->pdev = na; /* make sure netmap_mem_map() is called */
3570 #endif /* __FreeBSD__ */
3571 	if (na->na_flags & NAF_HOST_RINGS) {
3572 		if (na->num_host_rx_rings == 0)
3573 			na->num_host_rx_rings = 1;
3574 		if (na->num_host_tx_rings == 0)
3575 			na->num_host_tx_rings = 1;
3576 	}
3577 	if (na->nm_krings_create == NULL) {
3578 		/* we assume that we have been called by a driver,
3579 		 * since other port types all provide their own
3580 		 * nm_krings_create
3581 		 */
3582 		na->nm_krings_create = netmap_hw_krings_create;
3583 		na->nm_krings_delete = netmap_hw_krings_delete;
3584 	}
3585 	if (na->nm_notify == NULL)
3586 		na->nm_notify = netmap_notify;
3587 	na->active_fds = 0;
3588 
3589 	if (na->nm_mem == NULL) {
3590 		/* use the global allocator */
3591 		na->nm_mem = netmap_mem_get(&nm_mem);
3592 	}
3593 #ifdef WITH_VALE
3594 	if (na->nm_bdg_attach == NULL)
3595 		/* no special nm_bdg_attach callback. On VALE
3596 		 * attach, we need to interpose a bwrap
3597 		 */
3598 		na->nm_bdg_attach = netmap_default_bdg_attach;
3599 #endif
3600 
3601 	return 0;
3602 }
3603 
3604 /* Wrapper for the register callback provided netmap-enabled
3605  * hardware drivers.
3606  * nm_iszombie(na) means that the driver module has been
3607  * unloaded, so we cannot call into it.
3608  * nm_os_ifnet_lock() must guarantee mutual exclusion with
3609  * module unloading.
3610  */
3611 static int
3612 netmap_hw_reg(struct netmap_adapter *na, int onoff)
3613 {
3614 	struct netmap_hw_adapter *hwna =
3615 		(struct netmap_hw_adapter*)na;
3616 	int error = 0;
3617 
3618 	nm_os_ifnet_lock();
3619 
3620 	if (nm_iszombie(na)) {
3621 		if (onoff) {
3622 			error = ENXIO;
3623 		} else if (na != NULL) {
3624 			na->na_flags &= ~NAF_NETMAP_ON;
3625 		}
3626 		goto out;
3627 	}
3628 
3629 	error = hwna->nm_hw_register(na, onoff);
3630 
3631 out:
3632 	nm_os_ifnet_unlock();
3633 
3634 	return error;
3635 }
3636 
3637 static void
3638 netmap_hw_dtor(struct netmap_adapter *na)
3639 {
3640 	if (na->ifp == NULL)
3641 		return;
3642 
3643 	NM_DETACH_NA(na->ifp);
3644 }
3645 
3646 
3647 /*
3648  * Allocate a netmap_adapter object, and initialize it from the
3649  * 'arg' passed by the driver on attach.
3650  * We allocate a block of memory of 'size' bytes, which has room
3651  * for struct netmap_adapter plus additional room private to
3652  * the caller.
3653  * Return 0 on success, ENOMEM otherwise.
3654  */
3655 int
3656 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3657 {
3658 	struct netmap_hw_adapter *hwna = NULL;
3659 	struct ifnet *ifp = NULL;
3660 
3661 	if (size < sizeof(struct netmap_hw_adapter)) {
3662 		if (netmap_debug & NM_DEBUG_ON)
3663 			nm_prerr("Invalid netmap adapter size %d", (int)size);
3664 		return EINVAL;
3665 	}
3666 
3667 	if (arg == NULL || arg->ifp == NULL) {
3668 		if (netmap_debug & NM_DEBUG_ON)
3669 			nm_prerr("either arg or arg->ifp is NULL");
3670 		return EINVAL;
3671 	}
3672 
3673 	if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
3674 		if (netmap_debug & NM_DEBUG_ON)
3675 			nm_prerr("%s: invalid rings tx %d rx %d",
3676 				arg->name, arg->num_tx_rings, arg->num_rx_rings);
3677 		return EINVAL;
3678 	}
3679 
3680 	ifp = arg->ifp;
3681 	if (NM_NA_CLASH(ifp)) {
3682 		/* If NA(ifp) is not null but there is no valid netmap
3683 		 * adapter it means that someone else is using the same
3684 		 * pointer (e.g. ax25_ptr on linux). This happens for
3685 		 * instance when also PF_RING is in use. */
3686 		nm_prerr("Error: netmap adapter hook is busy");
3687 		return EBUSY;
3688 	}
3689 
3690 	hwna = nm_os_malloc(size);
3691 	if (hwna == NULL)
3692 		goto fail;
3693 	hwna->up = *arg;
3694 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3695 	strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3696 	if (override_reg) {
3697 		hwna->nm_hw_register = hwna->up.nm_register;
3698 		hwna->up.nm_register = netmap_hw_reg;
3699 	}
3700 	if (netmap_attach_common(&hwna->up)) {
3701 		nm_os_free(hwna);
3702 		goto fail;
3703 	}
3704 	netmap_adapter_get(&hwna->up);
3705 
3706 	NM_ATTACH_NA(ifp, &hwna->up);
3707 
3708 	nm_os_onattach(ifp);
3709 
3710 	if (arg->nm_dtor == NULL) {
3711 		hwna->up.nm_dtor = netmap_hw_dtor;
3712 	}
3713 
3714 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3715 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3716 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3717 	return 0;
3718 
3719 fail:
3720 	nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3721 	return (hwna ? EINVAL : ENOMEM);
3722 }
3723 
3724 
3725 int
3726 netmap_attach(struct netmap_adapter *arg)
3727 {
3728 	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3729 			1 /* override nm_reg */);
3730 }
3731 
3732 
3733 void
3734 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3735 {
3736 	if (!na) {
3737 		return;
3738 	}
3739 
3740 	refcount_acquire(&na->na_refcount);
3741 }
3742 
3743 
3744 /* returns 1 iff the netmap_adapter is destroyed */
3745 int
3746 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3747 {
3748 	if (!na)
3749 		return 1;
3750 
3751 	if (!refcount_release(&na->na_refcount))
3752 		return 0;
3753 
3754 	if (na->nm_dtor)
3755 		na->nm_dtor(na);
3756 
3757 	if (na->tx_rings) { /* XXX should not happen */
3758 		if (netmap_debug & NM_DEBUG_ON)
3759 			nm_prerr("freeing leftover tx_rings");
3760 		na->nm_krings_delete(na);
3761 	}
3762 	netmap_pipe_dealloc(na);
3763 	if (na->nm_mem)
3764 		netmap_mem_put(na->nm_mem);
3765 	bzero(na, sizeof(*na));
3766 	nm_os_free(na);
3767 
3768 	return 1;
3769 }
3770 
3771 /* nm_krings_create callback for all hardware native adapters */
3772 int
3773 netmap_hw_krings_create(struct netmap_adapter *na)
3774 {
3775 	int ret = netmap_krings_create(na, 0);
3776 	if (ret == 0) {
3777 		/* initialize the mbq for the sw rx ring */
3778 		u_int lim = netmap_real_rings(na, NR_RX), i;
3779 		for (i = na->num_rx_rings; i < lim; i++) {
3780 			mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3781 		}
3782 		nm_prdis("initialized sw rx queue %d", na->num_rx_rings);
3783 	}
3784 	return ret;
3785 }
3786 
3787 
3788 
3789 /*
3790  * Called on module unload by the netmap-enabled drivers
3791  */
3792 void
3793 netmap_detach(struct ifnet *ifp)
3794 {
3795 	struct netmap_adapter *na = NA(ifp);
3796 
3797 	if (!na)
3798 		return;
3799 
3800 	NMG_LOCK();
3801 	netmap_set_all_rings(na, NM_KR_LOCKED);
3802 	/*
3803 	 * if the netmap adapter is not native, somebody
3804 	 * changed it, so we can not release it here.
3805 	 * The NAF_ZOMBIE flag will notify the new owner that
3806 	 * the driver is gone.
3807 	 */
3808 	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3809 		na->na_flags |= NAF_ZOMBIE;
3810 	}
3811 	/* give active users a chance to notice that NAF_ZOMBIE has been
3812 	 * turned on, so that they can stop and return an error to userspace.
3813 	 * Note that this becomes a NOP if there are no active users and,
3814 	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3815 	 * NULL.
3816 	 */
3817 	netmap_enable_all_rings(ifp);
3818 	NMG_UNLOCK();
3819 }
3820 
3821 
3822 /*
3823  * Intercept packets from the network stack and pass them
3824  * to netmap as incoming packets on the 'software' ring.
3825  *
3826  * We only store packets in a bounded mbq and then copy them
3827  * in the relevant rxsync routine.
3828  *
3829  * We rely on the OS to make sure that the ifp and na do not go
3830  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3831  * In nm_register() or whenever there is a reinitialization,
3832  * we make sure to make the mode change visible here.
3833  */
3834 int
3835 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3836 {
3837 	struct netmap_adapter *na = NA(ifp);
3838 	struct netmap_kring *kring, *tx_kring;
3839 	u_int len = MBUF_LEN(m);
3840 	u_int error = ENOBUFS;
3841 	unsigned int txr;
3842 	struct mbq *q;
3843 	int busy;
3844 	u_int i;
3845 
3846 	i = MBUF_TXQ(m);
3847 	if (i >= na->num_host_rx_rings) {
3848 		i = i % na->num_host_rx_rings;
3849 	}
3850 	kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3851 
3852 	// XXX [Linux] we do not need this lock
3853 	// if we follow the down/configure/up protocol -gl
3854 	// mtx_lock(&na->core_lock);
3855 
3856 	if (!nm_netmap_on(na)) {
3857 		nm_prerr("%s not in netmap mode anymore", na->name);
3858 		error = ENXIO;
3859 		goto done;
3860 	}
3861 
3862 	txr = MBUF_TXQ(m);
3863 	if (txr >= na->num_tx_rings) {
3864 		txr %= na->num_tx_rings;
3865 	}
3866 	tx_kring = NMR(na, NR_TX)[txr];
3867 
3868 	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3869 		return MBUF_TRANSMIT(na, ifp, m);
3870 	}
3871 
3872 	q = &kring->rx_queue;
3873 
3874 	// XXX reconsider long packets if we handle fragments
3875 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3876 		nm_prerr("%s from_host, drop packet size %d > %d", na->name,
3877 			len, NETMAP_BUF_SIZE(na));
3878 		goto done;
3879 	}
3880 
3881 	if (!netmap_generic_hwcsum) {
3882 		if (nm_os_mbuf_has_csum_offld(m)) {
3883 			nm_prlim(1, "%s drop mbuf that needs checksum offload", na->name);
3884 			goto done;
3885 		}
3886 	}
3887 
3888 	if (nm_os_mbuf_has_seg_offld(m)) {
3889 		nm_prlim(1, "%s drop mbuf that needs generic segmentation offload", na->name);
3890 		goto done;
3891 	}
3892 
3893 #ifdef __FreeBSD__
3894 	ETHER_BPF_MTAP(ifp, m);
3895 #endif /* __FreeBSD__ */
3896 
3897 	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3898 	 * and maybe other instances of netmap_transmit (the latter
3899 	 * not possible on Linux).
3900 	 * We enqueue the mbuf only if we are sure there is going to be
3901 	 * enough room in the host RX ring, otherwise we drop it.
3902 	 */
3903 	mbq_lock(q);
3904 
3905 	busy = kring->nr_hwtail - kring->nr_hwcur;
3906 	if (busy < 0)
3907 		busy += kring->nkr_num_slots;
3908 	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3909 		nm_prlim(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3910 			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3911 	} else {
3912 		mbq_enqueue(q, m);
3913 		nm_prdis(2, "%s %d bufs in queue", na->name, mbq_len(q));
3914 		/* notify outside the lock */
3915 		m = NULL;
3916 		error = 0;
3917 	}
3918 	mbq_unlock(q);
3919 
3920 done:
3921 	if (m)
3922 		m_freem(m);
3923 	/* unconditionally wake up listeners */
3924 	kring->nm_notify(kring, 0);
3925 	/* this is normally netmap_notify(), but for nics
3926 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
3927 	 * that possibly forwards the frames through the switch
3928 	 */
3929 
3930 	return (error);
3931 }
3932 
3933 
3934 /*
3935  * netmap_reset() is called by the driver routines when reinitializing
3936  * a ring. The driver is in charge of locking to protect the kring.
3937  * If native netmap mode is not set just return NULL.
3938  * If native netmap mode is set, in particular, we have to set nr_mode to
3939  * NKR_NETMAP_ON.
3940  */
3941 struct netmap_slot *
3942 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3943 	u_int new_cur)
3944 {
3945 	struct netmap_kring *kring;
3946 	int new_hwofs, lim;
3947 
3948 	if (!nm_native_on(na)) {
3949 		nm_prdis("interface not in native netmap mode");
3950 		return NULL;	/* nothing to reinitialize */
3951 	}
3952 
3953 	/* XXX note- in the new scheme, we are not guaranteed to be
3954 	 * under lock (e.g. when called on a device reset).
3955 	 * In this case, we should set a flag and do not trust too
3956 	 * much the values. In practice: TODO
3957 	 * - set a RESET flag somewhere in the kring
3958 	 * - do the processing in a conservative way
3959 	 * - let the *sync() fixup at the end.
3960 	 */
3961 	if (tx == NR_TX) {
3962 		if (n >= na->num_tx_rings)
3963 			return NULL;
3964 
3965 		kring = na->tx_rings[n];
3966 
3967 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3968 			kring->nr_mode = NKR_NETMAP_OFF;
3969 			return NULL;
3970 		}
3971 
3972 		// XXX check whether we should use hwcur or rcur
3973 		new_hwofs = kring->nr_hwcur - new_cur;
3974 	} else {
3975 		if (n >= na->num_rx_rings)
3976 			return NULL;
3977 		kring = na->rx_rings[n];
3978 
3979 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3980 			kring->nr_mode = NKR_NETMAP_OFF;
3981 			return NULL;
3982 		}
3983 
3984 		new_hwofs = kring->nr_hwtail - new_cur;
3985 	}
3986 	lim = kring->nkr_num_slots - 1;
3987 	if (new_hwofs > lim)
3988 		new_hwofs -= lim + 1;
3989 
3990 	/* Always set the new offset value and realign the ring. */
3991 	if (netmap_debug & NM_DEBUG_ON)
3992 	    nm_prinf("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3993 		na->name,
3994 		tx == NR_TX ? "TX" : "RX", n,
3995 		kring->nkr_hwofs, new_hwofs,
3996 		kring->nr_hwtail,
3997 		tx == NR_TX ? lim : kring->nr_hwtail);
3998 	kring->nkr_hwofs = new_hwofs;
3999 	if (tx == NR_TX) {
4000 		kring->nr_hwtail = kring->nr_hwcur + lim;
4001 		if (kring->nr_hwtail > lim)
4002 			kring->nr_hwtail -= lim + 1;
4003 	}
4004 
4005 	/*
4006 	 * Wakeup on the individual and global selwait
4007 	 * We do the wakeup here, but the ring is not yet reconfigured.
4008 	 * However, we are under lock so there are no races.
4009 	 */
4010 	kring->nr_mode = NKR_NETMAP_ON;
4011 	kring->nm_notify(kring, 0);
4012 	return kring->ring->slot;
4013 }
4014 
4015 
4016 /*
4017  * Dispatch rx/tx interrupts to the netmap rings.
4018  *
4019  * "work_done" is non-null on the RX path, NULL for the TX path.
4020  * We rely on the OS to make sure that there is only one active
4021  * instance per queue, and that there is appropriate locking.
4022  *
4023  * The 'notify' routine depends on what the ring is attached to.
4024  * - for a netmap file descriptor, do a selwakeup on the individual
4025  *   waitqueue, plus one on the global one if needed
4026  *   (see netmap_notify)
4027  * - for a nic connected to a switch, call the proper forwarding routine
4028  *   (see netmap_bwrap_intr_notify)
4029  */
4030 int
4031 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
4032 {
4033 	struct netmap_kring *kring;
4034 	enum txrx t = (work_done ? NR_RX : NR_TX);
4035 
4036 	q &= NETMAP_RING_MASK;
4037 
4038 	if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
4039 	        nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
4040 	}
4041 
4042 	if (q >= nma_get_nrings(na, t))
4043 		return NM_IRQ_PASS; // not a physical queue
4044 
4045 	kring = NMR(na, t)[q];
4046 
4047 	if (kring->nr_mode == NKR_NETMAP_OFF) {
4048 		return NM_IRQ_PASS;
4049 	}
4050 
4051 	if (t == NR_RX) {
4052 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
4053 		*work_done = 1; /* do not fire napi again */
4054 	}
4055 
4056 	return kring->nm_notify(kring, 0);
4057 }
4058 
4059 
4060 /*
4061  * Default functions to handle rx/tx interrupts from a physical device.
4062  * "work_done" is non-null on the RX path, NULL for the TX path.
4063  *
4064  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
4065  * so that the caller proceeds with regular processing.
4066  * Otherwise call netmap_common_irq().
4067  *
4068  * If the card is connected to a netmap file descriptor,
4069  * do a selwakeup on the individual queue, plus one on the global one
4070  * if needed (multiqueue card _and_ there are multiqueue listeners),
4071  * and return NR_IRQ_COMPLETED.
4072  *
4073  * Finally, if called on rx from an interface connected to a switch,
4074  * calls the proper forwarding routine.
4075  */
4076 int
4077 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
4078 {
4079 	struct netmap_adapter *na = NA(ifp);
4080 
4081 	/*
4082 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
4083 	 * we still use the regular driver even though the previous
4084 	 * check fails. It is unclear whether we should use
4085 	 * nm_native_on() here.
4086 	 */
4087 	if (!nm_netmap_on(na))
4088 		return NM_IRQ_PASS;
4089 
4090 	if (na->na_flags & NAF_SKIP_INTR) {
4091 		nm_prdis("use regular interrupt");
4092 		return NM_IRQ_PASS;
4093 	}
4094 
4095 	return netmap_common_irq(na, q, work_done);
4096 }
4097 
4098 /* set/clear native flags and if_transmit/netdev_ops */
4099 void
4100 nm_set_native_flags(struct netmap_adapter *na)
4101 {
4102 	struct ifnet *ifp = na->ifp;
4103 
4104 	/* We do the setup for intercepting packets only if we are the
4105 	 * first user of this adapapter. */
4106 	if (na->active_fds > 0) {
4107 		return;
4108 	}
4109 
4110 	na->na_flags |= NAF_NETMAP_ON;
4111 	nm_os_onenter(ifp);
4112 	nm_update_hostrings_mode(na);
4113 }
4114 
4115 void
4116 nm_clear_native_flags(struct netmap_adapter *na)
4117 {
4118 	struct ifnet *ifp = na->ifp;
4119 
4120 	/* We undo the setup for intercepting packets only if we are the
4121 	 * last user of this adapter. */
4122 	if (na->active_fds > 0) {
4123 		return;
4124 	}
4125 
4126 	nm_update_hostrings_mode(na);
4127 	nm_os_onexit(ifp);
4128 
4129 	na->na_flags &= ~NAF_NETMAP_ON;
4130 }
4131 
4132 void
4133 netmap_krings_mode_commit(struct netmap_adapter *na, int onoff)
4134 {
4135 	enum txrx t;
4136 
4137 	for_rx_tx(t) {
4138 		int i;
4139 
4140 		for (i = 0; i < netmap_real_rings(na, t); i++) {
4141 			struct netmap_kring *kring = NMR(na, t)[i];
4142 
4143 			if (onoff && nm_kring_pending_on(kring))
4144 				kring->nr_mode = NKR_NETMAP_ON;
4145 			else if (!onoff && nm_kring_pending_off(kring))
4146 				kring->nr_mode = NKR_NETMAP_OFF;
4147 		}
4148 	}
4149 }
4150 
4151 /*
4152  * Module loader and unloader
4153  *
4154  * netmap_init() creates the /dev/netmap device and initializes
4155  * all global variables. Returns 0 on success, errno on failure
4156  * (but there is no chance)
4157  *
4158  * netmap_fini() destroys everything.
4159  */
4160 
4161 static struct cdev *netmap_dev; /* /dev/netmap character device. */
4162 extern struct cdevsw netmap_cdevsw;
4163 
4164 
4165 void
4166 netmap_fini(void)
4167 {
4168 	if (netmap_dev)
4169 		destroy_dev(netmap_dev);
4170 	/* we assume that there are no longer netmap users */
4171 	nm_os_ifnet_fini();
4172 	netmap_uninit_bridges();
4173 	netmap_mem_fini();
4174 	NMG_LOCK_DESTROY();
4175 	nm_prinf("netmap: unloaded module.");
4176 }
4177 
4178 
4179 int
4180 netmap_init(void)
4181 {
4182 	int error;
4183 
4184 	NMG_LOCK_INIT();
4185 
4186 	error = netmap_mem_init();
4187 	if (error != 0)
4188 		goto fail;
4189 	/*
4190 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
4191 	 * when the module is compiled in.
4192 	 * XXX could use make_dev_credv() to get error number
4193 	 */
4194 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
4195 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
4196 			      "netmap");
4197 	if (!netmap_dev)
4198 		goto fail;
4199 
4200 	error = netmap_init_bridges();
4201 	if (error)
4202 		goto fail;
4203 
4204 #ifdef __FreeBSD__
4205 	nm_os_vi_init_index();
4206 #endif
4207 
4208 	error = nm_os_ifnet_init();
4209 	if (error)
4210 		goto fail;
4211 
4212 	nm_prinf("netmap: loaded module");
4213 	return (0);
4214 fail:
4215 	netmap_fini();
4216 	return (EINVAL); /* may be incorrect */
4217 }
4218