1 /* 2 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26 27 /* 28 * This module supports memory mapped access to network devices, 29 * see netmap(4). 30 * 31 * The module uses a large, memory pool allocated by the kernel 32 * and accessible as mmapped memory by multiple userspace threads/processes. 33 * The memory pool contains packet buffers and "netmap rings", 34 * i.e. user-accessible copies of the interface's queues. 35 * 36 * Access to the network card works like this: 37 * 1. a process/thread issues one or more open() on /dev/netmap, to create 38 * select()able file descriptor on which events are reported. 39 * 2. 
on each descriptor, the process issues an ioctl() to identify 40 * the interface that should report events to the file descriptor. 41 * 3. on each descriptor, the process issues an mmap() request to 42 * map the shared memory region within the process' address space. 43 * The list of interesting queues is indicated by a location in 44 * the shared memory region. 45 * 4. using the functions in the netmap(4) userspace API, a process 46 * can look up the occupation state of a queue, access memory buffers, 47 * and retrieve received packets or enqueue packets to transmit. 48 * 5. using some ioctl()s the process can synchronize the userspace view 49 * of the queue with the actual status in the kernel. This includes both 50 * receiving the notification of new packets, and transmitting new 51 * packets on the output interface. 52 * 6. select() or poll() can be used to wait for events on individual 53 * transmit or receive queues (or all queues for a given interface). 54 * 55 56 SYNCHRONIZATION (USER) 57 58 The netmap rings and data structures may be shared among multiple 59 user threads or even independent processes. 60 Any synchronization among those threads/processes is delegated 61 to the threads themselves. Only one thread at a time can be in 62 a system call on the same netmap ring. The OS does not enforce 63 this and only guarantees against system crashes in case of 64 invalid usage. 65 66 LOCKING (INTERNAL) 67 68 Within the kernel, access to the netmap rings is protected as follows: 69 70 - a spinlock on each ring, to handle producer/consumer races on 71 RX rings attached to the host stack (against multiple host 72 threads writing from the host stack to the same ring), 73 and on 'destination' rings attached to a VALE switch 74 (i.e. RX rings in VALE ports, and TX rings in NIC/host ports) 75 protecting multiple active senders for the same destination) 76 77 - an atomic variable to guarantee that there is at most one 78 instance of *_*xsync() on the ring at any time. 
79 For rings connected to user file 80 descriptors, an atomic_test_and_set() protects this, and the 81 lock on the ring is not actually used. 82 For NIC RX rings connected to a VALE switch, an atomic_test_and_set() 83 is also used to prevent multiple executions (the driver might indeed 84 already guarantee this). 85 For NIC TX rings connected to a VALE switch, the lock arbitrates 86 access to the queue (both when allocating buffers and when pushing 87 them out). 88 89 - *xsync() should be protected against initializations of the card. 90 On FreeBSD most devices have the reset routine protected by 91 a RING lock (ixgbe, igb, em) or core lock (re). lem is missing 92 the RING protection on rx_reset(), this should be added. 93 94 On linux there is an external lock on the tx path, which probably 95 also arbitrates access to the reset routine. XXX to be revised 96 97 - a per-interface core_lock protecting access from the host stack 98 while interfaces may be detached from netmap mode. 99 XXX there should be no need for this lock if we detach the interfaces 100 only while they are down. 101 102 103 --- VALE SWITCH --- 104 105 NMG_LOCK() serializes all modifications to switches and ports. 106 A switch cannot be deleted until all ports are gone. 107 108 For each switch, an SX lock (RWlock on linux) protects 109 deletion of ports. When configuring or deleting a new port, the 110 lock is acquired in exclusive mode (after holding NMG_LOCK). 111 When forwarding, the lock is acquired in shared mode (without NMG_LOCK). 112 The lock is held throughout the entire forwarding cycle, 113 during which the thread may incur in a page fault. 114 Hence it is important that sleepable shared locks are used. 115 116 On the rx ring, the per-port lock is grabbed initially to reserve 117 a number of slot in the ring, then the lock is released, 118 packets are copied from source to destination, and then 119 the lock is acquired again and the receive ring is updated. 
120 (A similar thing is done on the tx ring for NIC and host stack 121 ports attached to the switch) 122 123 */ 124 125 /* 126 * OS-specific code that is used only within this file. 127 * Other OS-specific code that must be accessed by drivers 128 * is present in netmap_kern.h 129 */ 130 131 #include <sys/cdefs.h> /* prerequisite */ 132 __FBSDID("$FreeBSD: head/sys/dev/netmap/netmap.c 257176 2013-10-26 17:58:36Z glebius $"); 133 134 #include <sys/types.h> 135 #include <sys/errno.h> 136 #include <sys/param.h> /* defines used in kernel.h */ 137 #include <sys/kernel.h> /* types used in module initialization */ 138 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 139 #include <sys/devfs.h> 140 #include <sys/sockio.h> 141 #include <sys/socketvar.h> /* struct socket */ 142 #include <sys/malloc.h> 143 #include <sys/poll.h> 144 #include <sys/lock.h> 145 #include <sys/socket.h> /* sockaddrs */ 146 #include <sys/event.h> 147 #include <sys/sysctl.h> 148 #include <net/if.h> 149 #include <net/if_var.h> 150 #include <net/bpf.h> /* BIOCIMMEDIATE */ 151 #include <sys/bus.h> /* bus_dmamap_* */ 152 #include <sys/endian.h> 153 #include <sys/refcount.h> 154 155 /* reduce conditional code */ 156 #define init_waitqueue_head(x) // only needed in linux 157 158 extern struct dev_ops netmap_cdevsw; 159 160 /* 161 * common headers 162 */ 163 #include <net/netmap.h> 164 #include <net/netmap/netmap_kern.h> 165 #include <net/netmap/netmap_mem2.h> 166 167 168 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 169 170 /* 171 * The following variables are used by the drivers and replicate 172 * fields in the global memory pool. They only refer to buffers 173 * used by physical interfaces. 
174 */ 175 u_int netmap_total_buffers; 176 u_int netmap_buf_size; 177 char *netmap_buffer_base; /* also address of an invalid buffer */ 178 179 /* user-controlled variables */ 180 int netmap_verbose; 181 182 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 183 184 SYSCTL_NODE(_net, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 185 SYSCTL_INT(_net_netmap, OID_AUTO, verbose, 186 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 187 SYSCTL_INT(_net_netmap, OID_AUTO, no_timestamp, 188 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 189 int netmap_mitigate = 1; 190 SYSCTL_INT(_net_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 191 int netmap_no_pendintr = 1; 192 SYSCTL_INT(_net_netmap, OID_AUTO, no_pendintr, 193 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 194 int netmap_txsync_retry = 2; 195 SYSCTL_INT(_net_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 196 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 197 198 int netmap_flags = 0; /* debug flags */ 199 int netmap_fwd = 0; /* force transparent mode */ 200 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 201 202 /* 203 * netmap_admode selects the netmap mode to use. 204 * Invalid values are reset to NETMAP_ADMODE_BEST 205 */ 206 enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ 207 NETMAP_ADMODE_NATIVE, /* either native or none */ 208 NETMAP_ADMODE_GENERIC, /* force generic */ 209 NETMAP_ADMODE_LAST }; 210 #define NETMAP_ADMODE_NATIVE 1 /* Force native netmap adapter. */ 211 #define NETMAP_ADMODE_GENERIC 2 /* Force generic netmap adapter. */ 212 #define NETMAP_ADMODE_BEST 0 /* Priority to native netmap adapter. */ 213 static int netmap_admode = NETMAP_ADMODE_BEST; 214 215 int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ 216 int netmap_generic_ringsize = 1024; /* Generic ringsize. 
 */

SYSCTL_INT(_net_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
SYSCTL_INT(_net_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
SYSCTL_INT(_net_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
SYSCTL_INT(_net_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
SYSCTL_INT(_net_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
SYSCTL_INT(_net_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");

/* global lock; see the NMG_LOCK() notes in the header comment above */
NMG_LOCK_T	netmap_global_lock;


/*
 * Acquire exclusive ("busy") ownership of a kring: loop until the
 * nr_busy flag can be atomically set, sleeping 4 ticks between
 * attempts. Released with nm_kr_put().
 */
static void
nm_kr_get(struct netmap_kring *kr)
{
	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
		tsleep(kr, 0, "NM_KR_GET", 4);
}


/*
 * Stop a kring: mark it stopped, take the busy flag, and cycle
 * q_lock once (acquire then immediately release) so that any
 * thread currently inside the ring's critical section has drained
 * before we return.
 */
void
netmap_disable_ring(struct netmap_kring *kr)
{
	kr->nkr_stopped = 1;
	nm_kr_get(kr);
	lockmgr(&kr->q_lock, LK_EXCLUSIVE);
	lockmgr(&kr->q_lock, LK_RELEASE);
	nm_kr_put(kr);
}


/*
 * Stop (fully quiesce) or restart (just clear nkr_stopped) every
 * ring of an interface in netmap mode, including the host rings —
 * hence the '<=' loop bounds — and post a notify on each ring so
 * sleepers wake up (a global notify on the last/host index).
 * No-op when the interface is not in netmap mode.
 */
static void
netmap_set_all_rings(struct ifnet *ifp, int stopped)
{
	struct netmap_adapter *na;
	int i;

	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return;

	na = NA(ifp);

	for (i = 0; i <= na->num_tx_rings; i++) {
		if (stopped)
			netmap_disable_ring(na->tx_rings + i);
		else
			na->tx_rings[i].nkr_stopped = 0;
		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
			(i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
	}

	for (i = 0; i <= na->num_rx_rings; i++) {
		if (stopped)
			netmap_disable_ring(na->rx_rings + i);
		else
			na->rx_rings[i].nkr_stopped = 0;
		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
			(i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
	}
}


/* convenience wrapper: stop all rings of ifp */
void
netmap_disable_all_rings(struct ifnet *ifp)
{
	netmap_set_all_rings(ifp, 1 /* stopped */);
}


/* convenience wrapper: re-enable all rings of ifp */
void
netmap_enable_all_rings(struct ifnet *ifp)
{
	netmap_set_all_rings(ifp, 0 /* enabled */);
}


/*
 * generic bound_checking function:
 * force *v into the range [lo..hi] (dflt itself is clamped into the
 * range first). A too-small value is bumped to dflt, a too-large
 * value is clamped to hi; when 'msg' is non-NULL the adjustment is
 * logged. Returns the resulting *v.
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	u_int oldv = *v;
	const char *op = NULL;

	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = hi;
		op = "Clamp";
	}
	if (op && msg)
		kprintf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
	return *v;
}


/*
 * packet-dump function, user-supplied or static buffer.
 * The destination buffer must be at least 30+4*len.
 *
 * NOTE(review): each 16-byte row emits roughly 72 characters
 * ("%5d: " prefix + 48 hex columns + 16 ASCII chars + '\n'), i.e.
 * about 4.5 output bytes per input byte, so the "30+4*len" sizing
 * above looks optimistic — verify before passing a tight caller
 * buffer (the static fallback _dst[8192] is unaffected for small
 * dumps).
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	static char _dst[8192];
	int i, j, i0;
	static char hex[] ="0123456789abcdef";
	char *o;	/* output position */

#define P_HI(x)	hex[((x) & 0xf0)>>4]
#define P_LO(x)	hex[((x) & 0xf)]
#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
	if (!dst)
		dst = _dst;
	if (lim <= 0 || lim > len)
		lim = len;
	o = dst;
	ksprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
	o += strlen(o);
	/* hexdump routine */
	for (i = 0; i < lim; ) {
		ksprintf(o, "%5d: ", i);
		o += strlen(o);
		memset(o, ' ', 48);
		i0 = i;
		/* first pass: hex digits in columns 0..47 */
		for (j=0; j < 16 && i < lim; i++, j++) {
			o[j*3] = P_HI(p[i]);
			o[j*3+1] = P_LO(p[i]);
		}
		i = i0;
		/* second pass over the same bytes: printable ASCII from column 48 */
		for (j=0; j < 16 && i < lim; i++, j++)
			o[j + 48] = P_C(p[i]);
		o[j+48] = '\n';
		o += j+49;
	}
	*o = '\0';
#undef P_HI
#undef P_LO
#undef P_C
	return dst;
}



/*
 * Fetch configuration from the device, to cope with dynamic
 * reconfigurations after loading the module.
 *
 * Returns 0 when the configuration is unchanged, or when it changed
 * but no file descriptors are active (the new values are adopted).
 * Returns 1 when it changed while the adapter is in use — callers
 * must treat that as a failure.
 */
int
netmap_update_config(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;
	u_int txr, txd, rxr, rxd;

	txr = txd = rxr = rxd = 0;
	if (na->nm_config) {
		na->nm_config(na, &txr, &txd, &rxr, &rxd);
	} else {
		/* take whatever we had at init time */
		txr = na->num_tx_rings;
		txd = na->num_tx_desc;
		rxr = na->num_rx_rings;
		rxd = na->num_rx_desc;
	}

	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
		return 0; /* nothing changed */
	if (netmap_verbose || na->active_fds > 0) {
		D("stored config %s: txring %d x %d, rxring %d x %d",
			NM_IFPNAME(ifp),
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc);
		D("new config %s: txring %d x %d, rxring %d x %d",
			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
	}
	if (na->active_fds == 0) {
		D("configuration changed (but fine)");
		na->num_tx_rings = txr;
		na->num_tx_desc = txd;
		na->num_rx_rings = rxr;
		na->num_rx_desc = rxd;
		return 0;
	}
	D("configuration changed while active, this is bad...");
	return 1;
}


/*
 * Allocate the array of krings for an adapter: ntx tx krings
 * immediately followed by nrx rx krings, plus 'tailroom' extra bytes
 * whose start is published in na->tailroom. All in one kmalloc so a
 * single kfree in netmap_krings_delete() releases everything.
 * Returns 0 on success, ENOMEM on allocation failure.
 */
int
netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
{
	u_int i, len, ndesc;
	struct netmap_kring *kring;

	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;

	na->tx_rings = kmalloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (na->tx_rings == NULL) {
		D("Cannot allocate krings");
		return ENOMEM;
	}
	na->rx_rings = na->tx_rings + ntx;

	ndesc = na->num_tx_desc;
	for (i = 0; i < ntx; i++) { /* Transmit rings */
		kring = &na->tx_rings[i];
		bzero(kring, sizeof(*kring));
		kring->na = na;
		kring->nkr_num_slots = ndesc;
		/*
		 * IMPORTANT:
		 * Always keep one slot empty, so we can detect new
		 * transmissions comparing cur and nr_hwcur (they are
		 * the same only if there are no new transmissions).
		 */
		kring->nr_hwavail = ndesc - 1;
		lockinit(&kring->q_lock, "nm_txq_lock", 0, LK_CANRECURSE);
		init_waitqueue_head(&kring->si);
	}

	ndesc = na->num_rx_desc;
	for (i = 0; i < nrx; i++) { /* Receive rings */
		kring = &na->rx_rings[i];
		bzero(kring, sizeof(*kring));
		kring->na = na;
		kring->nkr_num_slots = ndesc;
		lockinit(&kring->q_lock, "nm_rxq_lock", 0, LK_CANRECURSE);
		init_waitqueue_head(&kring->si);
	}
	init_waitqueue_head(&na->tx_si);
	init_waitqueue_head(&na->rx_si);

	na->tailroom = na->rx_rings + nrx;

	return 0;

}


/*
 * Undo netmap_krings_create(): tear down each ring's q_lock
 * (the '+ 1' bounds include the host rings) and free the single
 * backing allocation.
 */
void
netmap_krings_delete(struct netmap_adapter *na)
{
	int i;

	for (i = 0; i < na->num_tx_rings + 1; i++) {
		lockuninit(&na->tx_rings[i].q_lock);
	}
	for (i = 0; i < na->num_rx_rings + 1; i++) {
		lockuninit(&na->rx_rings[i].q_lock);
	}
	kfree(na->tx_rings, M_DEVBUF);
	na->tx_rings = na->rx_rings = na->tailroom = NULL;
}


/*
 * Build the netmap_if for a new user of the adapter. The first
 * user (active_fds == 0) also creates the krings and the shared
 * rings; later users just get a fresh nifp pointing at the shared
 * structures. Returns NULL on any failure (config mismatch or
 * allocation error), undoing ring/kring creation if this was the
 * first user.
 */
static struct netmap_if*
netmap_if_new(const char *ifname, struct netmap_adapter *na)
{
	struct netmap_if *nifp;

	if (netmap_update_config(na)) {
		/* configuration mismatch, report and fail */
		return NULL;
	}

	if (na->active_fds)
		goto final;

	if (na->nm_krings_create(na))
		goto cleanup;

	if (netmap_mem_rings_create(na))
		goto cleanup;

final:

	nifp = netmap_mem_if_new(ifname, na);
	if (nifp == NULL)
		goto cleanup;

	return (nifp);

cleanup:

	if (na->active_fds == 0) {
		netmap_mem_rings_delete(na);
		na->nm_krings_delete(na);
	}

	return NULL;
}


/* grab a reference to the memory allocator, if we don't have one already. The
 * reference is taken from the netmap_adapter registered with the priv.
 *
 * Returns ENODEV when no interface is registered and mmap of
 * unregistered fds is disallowed, or when the fd already holds a
 * reference to a different allocator; 0 (or the finalize error)
 * otherwise.
 */
static int
netmap_get_memory_locked(struct netmap_priv_d* p)
{
	struct netmap_mem_d *nmd;
	int error = 0;

	if (p->np_na == NULL) {
		if (!netmap_mmap_unreg)
			return ENODEV;
		/* for compatibility with older versions of the API
		 * we use the global allocator when no interface has been
		 * registered
		 */
		nmd = &nm_mem;
	} else {
		nmd = p->np_na->nm_mem;
	}
	if (p->np_mref == NULL) {
		error = netmap_mem_finalize(nmd);
		if (!error)
			p->np_mref = nmd;
	} else if (p->np_mref != nmd) {
		/* a virtual port has been registered, but previous
		 * syscalls already used the global allocator.
		 * We cannot continue
		 */
		error = ENODEV;
	}
	return error;
}


/* NMG_LOCK-taking wrapper around netmap_get_memory_locked() */
int
netmap_get_memory(struct netmap_priv_d* p)
{
	int error;
	NMG_LOCK();
	error = netmap_get_memory_locked(p);
	NMG_UNLOCK();
	return error;
}


/* true iff this fd already holds an allocator reference; NMG_LOCK held */
static int
netmap_have_memory_locked(struct netmap_priv_d* p)
{
	return p->np_mref != NULL;
}


/* release the fd's allocator reference, if any; NMG_LOCK held */
static void
netmap_drop_memory_locked(struct netmap_priv_d* p)
{
	if (p->np_mref) {
		netmap_mem_deref(p->np_mref);
		p->np_mref = NULL;
	}
}


/*
 * File descriptor's private data destructor.
 *
 * Call nm_register(ifp,0) to stop netmap mode on the interface and
 * revert to normal operation.
 * We expect that np_na->ifp has not gone.
 * The second argument is the nifp to work on. In some cases it is
 * not attached yet to the netmap_priv_d so we need to pass it as
 * a separate argument.
 */
/* call with NMG_LOCK held */
static void
netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
{
	struct netmap_adapter *na = priv->np_na;
	struct ifnet *ifp = na->ifp;

	NMG_LOCK_ASSERT();
	na->active_fds--;
	if (na->active_fds <= 0) {	/* last instance */

		if (netmap_verbose)
			D("deleting last instance for %s", NM_IFPNAME(ifp));
		/*
		 * (TO CHECK) This function is only called
		 * when the last reference to this file descriptor goes
		 * away. This means we cannot have any pending poll()
		 * or interrupt routine operating on the structure.
		 * XXX The file may be closed in a thread while
		 * another thread is using it.
		 * Linux keeps the file opened until the last reference
		 * by any outstanding ioctl/poll or mmap is gone.
		 * FreeBSD does not track mmap()s (but we do) and
		 * wakes up any sleeping poll(). Need to check what
		 * happens if the close() occurs while a concurrent
		 * syscall is running.
		 */
		if (ifp)
			na->nm_register(na, 0); /* off, clear IFCAP_NETMAP */
		/* Wake up any sleeping threads. netmap_poll will
		 * then return POLLERR
		 * XXX The wake up now must happen during *_down(), when
		 * we order all activities to stop. -gl
		 */
		/* XXX kqueue(9) needed; these will mirror knlist_init. */
		/* knlist_destroy(&na->tx_si.si_note); */
		/* knlist_destroy(&na->rx_si.si_note); */

		/* delete rings and buffers */
		netmap_mem_rings_delete(na);
		na->nm_krings_delete(na);
	}
	/* delete the nifp */
	netmap_mem_if_delete(na, nifp);
}


/*
 * returns 1 if this is the last instance and we can free priv
 * (0 while other mmaps still hold np_refcount). Caller must hold
 * NMG_LOCK; on the last instance this unregisters the interface,
 * drops the allocator reference and the adapter reference.
 */
int
netmap_dtor_locked(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

	/*
	 * np_refcount is the number of active mmaps on
	 * this file descriptor
	 */
	if (--priv->np_refcount > 0) {
		return 0;
	}
	if (!na) {
		return 1; //XXX is it correct?
	}
	netmap_do_unregif(priv, priv->np_nifp);
	priv->np_nifp = NULL;
	netmap_drop_memory_locked(priv);
	if (priv->np_na) {
		netmap_adapter_put(na);
		priv->np_na = NULL;
	}
	return 1;
}


/*
 * cdev-private destructor callback: run the locked teardown and,
 * on the last instance, scrub and free the priv structure itself.
 */
void
netmap_dtor(void *data)
{
	struct netmap_priv_d *priv = data;
	int last_instance;

	NMG_LOCK();
	last_instance = netmap_dtor_locked(priv);
	NMG_UNLOCK();
	if (last_instance) {
		bzero(priv, sizeof(*priv)); /* for safety */
		kfree(priv, M_DEVBUF);
	}
}




/*
 * Handlers for synchronization of the queues from/to the host.
 * Netmap has two operating modes:
 * - in the default mode, the rings connected to the host stack are
 *   just another ring pair managed by userspace;
 * - in transparent mode (XXX to be defined) incoming packets
 *   (from the host or the NIC) are marked as NS_FORWARD upon
 *   arrival, and the user application has a chance to reset the
 *   flag for packets that should be dropped.
 *   On the RXSYNC or poll(), packets in RX rings between
 *   kring->nr_kcur and ring->cur with NS_FORWARD still set are moved
 *   to the other side.
 * The transfer NIC --> host is relatively easy, just encapsulate
 * into mbufs and we are done.
The host --> NIC side is slightly 696 * harder because there might not be room in the tx ring so it 697 * might take a while before releasing the buffer. 698 */ 699 700 701 /* 702 * pass a chain of buffers to the host stack as coming from 'dst' 703 */ 704 static void 705 netmap_send_up(struct ifnet *dst, struct mbq *q) 706 { 707 struct mbuf *m; 708 709 /* send packets up, outside the lock */ 710 while ((m = mbq_dequeue(q)) != NULL) { 711 if (netmap_verbose & NM_VERB_HOST) 712 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 713 NM_SEND_UP(dst, m); 714 } 715 mbq_destroy(q); 716 } 717 718 719 /* 720 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 721 * Run from hwcur to cur - reserved 722 */ 723 static void 724 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 725 { 726 /* Take packets from hwcur to cur-reserved and pass them up. 727 * In case of no buffers we give up. At the end of the loop, 728 * the queue is drained in all cases. 729 * XXX handle reserved 730 */ 731 u_int lim = kring->nkr_num_slots - 1; 732 struct mbuf *m; 733 u_int k = kring->ring->cur, n = kring->ring->reserved; 734 struct netmap_adapter *na = kring->na; 735 736 /* compute the final position, ring->cur - ring->reserved */ 737 if (n > 0) { 738 if (k < n) 739 k += kring->nkr_num_slots; 740 k += n; 741 } 742 for (n = kring->nr_hwcur; n != k;) { 743 struct netmap_slot *slot = &kring->ring->slot[n]; 744 745 n = nm_next(n, lim); 746 if ((slot->flags & NS_FORWARD) == 0 && !force) 747 continue; 748 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { 749 D("bad pkt at %d len %d", n, slot->len); 750 continue; 751 } 752 slot->flags &= ~NS_FORWARD; // XXX needed ? 
753 /* XXX adapt to the case of a multisegment packet */ 754 m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); 755 756 if (m == NULL) 757 break; 758 mbq_enqueue(q, m); 759 } 760 } 761 762 763 /* 764 * The host ring has packets from nr_hwcur to (cur - reserved) 765 * to be sent down to the NIC. 766 * We need to use the queue lock on the source (host RX ring) 767 * to protect against netmap_transmit. 768 * If the user is well behaved we do not need to acquire locks 769 * on the destination(s), 770 * so we only need to make sure that there are no panics because 771 * of user errors. 772 * XXX verify 773 * 774 * We scan the tx rings, which have just been 775 * flushed so nr_hwcur == cur. Pushing packets down means 776 * increment cur and decrement avail. 777 * XXX to be verified 778 */ 779 static void 780 netmap_sw_to_nic(struct netmap_adapter *na) 781 { 782 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 783 struct netmap_kring *k1 = &na->tx_rings[0]; 784 u_int i, howmany, src_lim, dst_lim; 785 786 /* XXX we should also check that the carrier is on */ 787 if (kring->nkr_stopped) 788 return; 789 790 lockmgr(&kring->q_lock, LK_EXCLUSIVE); 791 792 if (kring->nkr_stopped) 793 goto out; 794 795 howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ 796 797 src_lim = kring->nkr_num_slots - 1; 798 for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { 799 ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); 800 dst_lim = k1->nkr_num_slots - 1; 801 while (howmany > 0 && k1->ring->avail > 0) { 802 struct netmap_slot *src, *dst, tmp; 803 src = &kring->ring->slot[kring->nr_hwcur]; 804 dst = &k1->ring->slot[k1->ring->cur]; 805 tmp = *src; 806 src->buf_idx = dst->buf_idx; 807 src->flags = NS_BUF_CHANGED; 808 809 dst->buf_idx = tmp.buf_idx; 810 dst->len = tmp.len; 811 dst->flags = NS_BUF_CHANGED; 812 ND("out len %d buf %d from %d to %d", 813 dst->len, dst->buf_idx, 814 kring->nr_hwcur, k1->ring->cur); 815 816 
kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); 817 howmany--; 818 kring->nr_hwavail--; 819 k1->ring->cur = nm_next(k1->ring->cur, dst_lim); 820 k1->ring->avail--; 821 } 822 kring->ring->cur = kring->nr_hwcur; // XXX 823 k1++; // XXX why? 824 } 825 out: 826 lockmgr(&kring->q_lock, LK_RELEASE); 827 } 828 829 830 /* 831 * netmap_txsync_to_host() passes packets up. We are called from a 832 * system call in user process context, and the only contention 833 * can be among multiple user threads erroneously calling 834 * this routine concurrently. 835 */ 836 void 837 netmap_txsync_to_host(struct netmap_adapter *na) 838 { 839 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 840 struct netmap_ring *ring = kring->ring; 841 u_int k, lim = kring->nkr_num_slots - 1; 842 struct mbq q; 843 int error; 844 845 error = nm_kr_tryget(kring); 846 if (error) { 847 if (error == NM_KR_BUSY) 848 D("ring %p busy (user error)", kring); 849 return; 850 } 851 k = ring->cur; 852 if (k > lim) { 853 D("invalid ring index in stack TX kring %p", kring); 854 netmap_ring_reinit(kring); 855 nm_kr_put(kring); 856 return; 857 } 858 859 /* Take packets from hwcur to cur and pass them up. 860 * In case of no buffers we give up. At the end of the loop, 861 * the queue is drained in all cases. 862 */ 863 mbq_init(&q); 864 netmap_grab_packets(kring, &q, 1); 865 kring->nr_hwcur = k; 866 kring->nr_hwavail = ring->avail = lim; 867 868 nm_kr_put(kring); 869 netmap_send_up(na->ifp, &q); 870 } 871 872 873 /* 874 * rxsync backend for packets coming from the host stack. 875 * They have been put in the queue by netmap_transmit() so we 876 * need to protect access to the kring using a lock. 877 * 878 * This routine also does the selrecord if called from the poll handler 879 * (we know because td != NULL). 880 * 881 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 882 * as an additional hidden argument. 
 */
static void
netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
{
	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
	struct netmap_ring *ring = kring->ring;
	u_int j, n, lim = kring->nkr_num_slots;
	u_int k = ring->cur, resvd = ring->reserved;

	(void)pwait;	/* disable unused warnings */

	if (kring->nkr_stopped) /* check a first time without lock */
		return;

	lockmgr(&kring->q_lock, LK_EXCLUSIVE);

	if (kring->nkr_stopped) /* check again with lock held */
		goto unlock_out;

	/* user-supplied index must be within the ring */
	if (k >= lim) {
		netmap_ring_reinit(kring);
		goto unlock_out;
	}
	/* new packets are already set in nr_hwavail */
	/* skip past packets that userspace has released */
	j = kring->nr_hwcur;
	if (resvd > 0) {
		/* sanity-check the user-visible counters before trusting them */
		if (resvd + ring->avail >= lim + 1) {
			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
			ring->reserved = resvd = 0; // XXX panic...
		}
		/* back k up by 'resvd' slots, wrapping around the ring */
		k = (k >= resvd) ? k - resvd : k + lim - resvd;
	}
	if (j != k) {
		/* consume the slots userspace released since last sync */
		n = k >= j ? k - j : k + lim - j;
		kring->nr_hwavail -= n;
		kring->nr_hwcur = k;
	}
	/* publish how many packets are now available to userspace */
	k = ring->avail = kring->nr_hwavail - resvd;
	if (k == 0 && td)
		KNOTE(&kring->si.ki_note, 0);
	if (k && (netmap_verbose & NM_VERB_HOST))
		D("%d pkts from stack", k);
unlock_out:

	lockmgr(&kring->q_lock, LK_RELEASE);
}


/* Get a netmap adapter for the port.
 *
 * If it is possible to satisfy the request, return 0
 * with *na containing the netmap adapter found.
 * Otherwise return an error code, with *na containing NULL.
 *
 * When the port is attached to a bridge, we always return
 * EBUSY.
 * Otherwise, if the port is already bound to a file descriptor,
 * then we unconditionally return the existing adapter into *na.
 * In all the other cases, we return (into *na) either native,
 * generic or NULL, according to the following table:
 *
 *					native_support
 * active_fds   dev.netmap.admode         YES     NO
 * -------------------------------------------------------
 *    >0              *                 NA(ifp) NA(ifp)
 *
 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
 *
 */

int
netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
{
	/* generic support */
	int i = netmap_admode;	/* Take a snapshot. */
	int error = 0;
	struct netmap_adapter *prev_na;
	struct netmap_generic_adapter *gna;

	*na = NULL; /* default */

	/* reset in case of invalid value */
	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
		i = netmap_admode = NETMAP_ADMODE_BEST;

	if (NETMAP_CAPABLE(ifp)) {
		/* If an adapter already exists, but is
		 * attached to a vale port, we report that the
		 * port is busy.
		 */
		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
			return EBUSY;

		/* If an adapter already exists, return it if
		 * there are active file descriptors or if
		 * netmap is not forced to use generic
		 * adapters.
		 */
		if (NA(ifp)->active_fds > 0 ||
				i != NETMAP_ADMODE_GENERIC) {
			*na = NA(ifp);
			return 0;
		}
	}

	/* If there isn't native support and netmap is not allowed
	 * to use generic adapters, we cannot satisfy the request.
	 */
	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
		return EINVAL;

	/* Otherwise, create a generic adapter and return it,
	 * saving the previously used netmap adapter, if any.
	 *
	 * Note that here 'prev_na', if not NULL, MUST be a
	 * native adapter, and CANNOT be a generic one. This is
	 * true because generic adapters are created on demand, and
	 * destroyed when not used anymore. Therefore, if the adapter
	 * currently attached to an interface 'ifp' is generic, it
	 * must be that
	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
	 * Consequently, if NA(ifp) is generic, we will enter one of
	 * the branches above. This ensures that we never override
	 * a generic adapter with another generic adapter.
	 */
	prev_na = NA(ifp);
	error = generic_netmap_attach(ifp);
	if (error)
		return error;

	*na = NA(ifp);
	gna = (struct netmap_generic_adapter*)NA(ifp);
	gna->prev = prev_na; /* save old na */
	if (prev_na != NULL) {
		/*
		 * NOTE(review): the ifunit() result is discarded here
		 * (the original "XXX huh?" stands) — presumably a
		 * leftover from an interface-refcounting scheme;
		 * verify against the FreeBSD version of this code.
		 */
		ifunit(ifp->if_xname);	/* XXX huh? */
		// XXX add a refcount ?
		netmap_adapter_get(prev_na);
	}
	D("Created generic NA %p (prev %p)", gna, gna->prev);

	return 0;
}


/*
 * MUST BE CALLED UNDER NMG_LOCK()
 *
 * get a refcounted reference to an interface.
 * This is always called in the execution of an ioctl().
 *
 * Return ENXIO if the interface does not exist, EINVAL if netmap
 * is not supported by the interface.
 * If successful, hold a reference.
 *
 * When the NIC is attached to a bridge, reference is managed
 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as
 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC
 * is detached from the bridge, then ifp's refcount is dropped (this
 * is equivalent to that ifp is destroyed in case of virtual ports.
 *
 * This function uses if_rele() when we want to prevent the NIC from
 * being detached from the bridge in error handling. But once refcount
 * is acquired by this function, it must be released using nm_if_rele().
 */
int
netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	struct ifnet *ifp;
	int error = 0;
	struct netmap_adapter *ret;

	*na = NULL; /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();

	error = netmap_get_bdg_na(nmr, na, create);
	if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
		return error;

	/* not a bridge port: look up the hardware interface by name */
	ifp = ifunit(nmr->nr_name);
	if (ifp == NULL) {
		return ENXIO;
	}

	error = netmap_get_hw_na(ifp, &ret);
	if (error)
		goto out;

	if (ret != NULL) {
		/* Users cannot use the NIC attached to a bridge directly */
		if (NETMAP_OWNED_BY_KERN(ret)) {
			error = EINVAL;
			goto out;
		}
		error = 0;
		*na = ret;
		/* hold the reference the caller must release */
		netmap_adapter_get(ret);
	}
out:
#if 0
	if_rele(ifp);
#endif

	return error;
}


/*
 * Error routine called when txsync/rxsync detects an error.
 * Can't do much more than resetting cur = hwcur, avail = hwavail.
 * Return 1 on reinit.
 *
 * This routine is only called by the upper half of the kernel.
 * It only reads hwcur (which is changed only by the upper half, too)
 * and hwavail (which may be changed by the lower half, but only on
 * a tx ring and only to increase it, so any error will be recovered
 * on the next call). For the above, we don't strictly need to call
 * it under lock.
 */
int
netmap_ring_reinit(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int i, lim = kring->nkr_num_slots - 1;
	int errors = 0;

	// XXX KASSERT nm_kr_tryget
	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
	if (ring->cur > lim)
		errors++;
	/* sanitize every slot: invalid buffer indices and bogus lengths
	 * are cleared rather than trusted.
	 */
	for (i = 0; i <= lim; i++) {
		u_int idx = ring->slot[i].buf_idx;
		u_int len = ring->slot[i].len;
		if (idx < 2 || idx >= netmap_total_buffers) {
			if (!errors++)
				D("bad buffer at slot %d idx %d len %d ", i, idx, len);
			ring->slot[i].buf_idx = 0;
			ring->slot[i].len = 0;
		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
			ring->slot[i].len = 0;
			if (!errors++)
				D("bad len %d at slot %d idx %d",
					len, i, idx);
		}
	}
	if (errors) {
		/* position of this kring in the tx_rings array; values
		 * >= n mean it is actually one of the rx krings.
		 */
		int pos = kring - kring->na->tx_rings;
		int n = kring->na->num_tx_rings + 1;

		RD(10, "total %d errors", errors);
		errors++;
		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
			NM_IFPNAME(kring->na->ifp),
			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
			ring->cur, kring->nr_hwcur,
			ring->avail, kring->nr_hwavail);
		ring->cur = kring->nr_hwcur;
		ring->avail = kring->nr_hwavail;
	}
	return (errors ? 1 : 0);
}


/*
 * Set the ring ID. For devices with a single queue, a request
 * for all rings is the same as a single ring.
 */
static int
netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
{
	struct netmap_adapter *na = priv->np_na;
	struct ifnet *ifp = na->ifp;
	u_int i = ringid & NETMAP_RING_MASK;
	/* initially (np_qfirst == np_qlast) we don't want to lock */
	u_int lim = na->num_rx_rings;

	if (na->num_tx_rings > lim)
		lim = na->num_tx_rings;
	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
		D("invalid ring id %d", i);
		return (EINVAL);
	}
	priv->np_ringid = ringid;
	if (ringid & NETMAP_SW_RING) {
		/* host stack ring only; np_qfirst doubles as the marker */
		priv->np_qfirst = NETMAP_SW_RING;
		priv->np_qlast = 0;
	} else if (ringid & NETMAP_HW_RING) {
		/* a single hardware ring */
		priv->np_qfirst = i;
		priv->np_qlast = i + 1;
	} else {
		/* all hardware rings; NETMAP_HW_RING in np_qlast is the
		 * "all rings" marker tested by poll/ioctl.
		 */
		priv->np_qfirst = 0;
		priv->np_qlast = NETMAP_HW_RING ;
	}
	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
	if (netmap_verbose) {
		if (ringid & NETMAP_SW_RING)
			D("ringid %s set to SW RING", NM_IFPNAME(ifp));
		else if (ringid & NETMAP_HW_RING)
			D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
				priv->np_qfirst);
		else
			D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
	}
	return 0;
}


/*
 * possibly move the interface to netmap-mode.
 * On success it returns a pointer to netmap_if, otherwise NULL.
 * This must be called with NMG_LOCK held.
1198 */ 1199 struct netmap_if * 1200 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1201 uint16_t ringid, int *err) 1202 { 1203 struct ifnet *ifp = na->ifp; 1204 struct netmap_if *nifp = NULL; 1205 int error, need_mem = 0; 1206 1207 NMG_LOCK_ASSERT(); 1208 /* ring configuration may have changed, fetch from the card */ 1209 netmap_update_config(na); 1210 priv->np_na = na; /* store the reference */ 1211 error = netmap_set_ringid(priv, ringid); 1212 if (error) 1213 goto out; 1214 /* ensure allocators are ready */ 1215 need_mem = !netmap_have_memory_locked(priv); 1216 if (need_mem) { 1217 error = netmap_get_memory_locked(priv); 1218 ND("get_memory returned %d", error); 1219 if (error) 1220 goto out; 1221 } 1222 nifp = netmap_if_new(NM_IFPNAME(ifp), na); 1223 if (nifp == NULL) { /* allocation failed */ 1224 /* we should drop the allocator, but only 1225 * if we were the ones who grabbed it 1226 */ 1227 error = ENOMEM; 1228 goto out; 1229 } 1230 na->active_fds++; 1231 if (ifp->if_capenable & IFCAP_NETMAP) { 1232 /* was already set */ 1233 } else { 1234 /* Otherwise set the card in netmap mode 1235 * and make it use the shared buffers. 1236 * 1237 * do not core lock because the race is harmless here, 1238 * there cannot be any traffic to netmap_transmit() 1239 */ 1240 na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; 1241 ND("%p->na_lut == %p", na, na->na_lut); 1242 na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; 1243 error = na->nm_register(na, 1); /* mode on */ 1244 if (error) { 1245 netmap_do_unregif(priv, nifp); 1246 nifp = NULL; 1247 } 1248 } 1249 out: 1250 *err = error; 1251 if (error) { 1252 priv->np_na = NULL; 1253 if (need_mem) 1254 netmap_drop_memory_locked(priv); 1255 } 1256 if (nifp != NULL) { 1257 /* 1258 * advertise that the interface is ready bt setting ni_nifp. 
1259 * The barrier is needed because readers (poll and *SYNC) 1260 * check for priv->np_nifp != NULL without locking 1261 */ 1262 wmb(); /* make sure previous writes are visible to all CPUs */ 1263 priv->np_nifp = nifp; 1264 } 1265 return nifp; 1266 } 1267 1268 1269 1270 /* 1271 * ioctl(2) support for the "netmap" device. 1272 * 1273 * Following a list of accepted commands: 1274 * - NIOCGINFO 1275 * - SIOCGIFADDR just for convenience 1276 * - NIOCREGIF 1277 * - NIOCUNREGIF 1278 * - NIOCTXSYNC 1279 * - NIOCRXSYNC 1280 * 1281 * Return 0 on success, errno otherwise. 1282 */ 1283 int 1284 netmap_ioctl(struct dev_ioctl_args *ap) 1285 { 1286 struct netmap_priv_d *priv = NULL; 1287 struct ifnet *ifp = NULL; 1288 struct nmreq *nmr = (struct nmreq *) ap->a_data; 1289 struct netmap_adapter *na = NULL; 1290 int error; 1291 u_int i, lim; 1292 struct netmap_if *nifp; 1293 struct netmap_kring *krings; 1294 u_long cmd = ap->a_cmd; 1295 1296 priv = ap->a_head.a_dev->si_drv1; 1297 1298 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 1299 switch (cmd) { 1300 case NIOCGINFO: /* return capabilities etc */ 1301 if (nmr->nr_version != NETMAP_API) { 1302 D("API mismatch got %d have %d", 1303 nmr->nr_version, NETMAP_API); 1304 nmr->nr_version = NETMAP_API; 1305 error = EINVAL; 1306 break; 1307 } 1308 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 1309 error = netmap_bdg_ctl(nmr, NULL); 1310 break; 1311 } 1312 1313 NMG_LOCK(); 1314 do { 1315 /* memsize is always valid */ 1316 struct netmap_mem_d *nmd = &nm_mem; 1317 u_int memflags; 1318 1319 if (nmr->nr_name[0] != '\0') { 1320 /* get a refcount */ 1321 error = netmap_get_na(nmr, &na, 1 /* create */); 1322 if (error) 1323 break; 1324 nmd = na->nm_mem; /* get memory allocator */ 1325 } 1326 1327 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 1328 if (error) 1329 break; 1330 if (na == NULL) /* only memory info */ 1331 break; 1332 nmr->nr_offset = 0; 1333 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 1334 
netmap_update_config(na); 1335 nmr->nr_rx_rings = na->num_rx_rings; 1336 nmr->nr_tx_rings = na->num_tx_rings; 1337 nmr->nr_rx_slots = na->num_rx_desc; 1338 nmr->nr_tx_slots = na->num_tx_desc; 1339 if (memflags & NETMAP_MEM_PRIVATE) 1340 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1341 netmap_adapter_put(na); 1342 } while (0); 1343 NMG_UNLOCK(); 1344 break; 1345 1346 case NIOCREGIF: 1347 if (nmr->nr_version != NETMAP_API) { 1348 nmr->nr_version = NETMAP_API; 1349 error = EINVAL; 1350 break; 1351 } 1352 /* possibly attach/detach NIC and VALE switch */ 1353 i = nmr->nr_cmd; 1354 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { 1355 error = netmap_bdg_ctl(nmr, NULL); 1356 break; 1357 } else if (i != 0) { 1358 D("nr_cmd must be 0 not %d", i); 1359 error = EINVAL; 1360 break; 1361 } 1362 1363 /* protect access to priv from concurrent NIOCREGIF */ 1364 NMG_LOCK(); 1365 do { 1366 u_int memflags; 1367 1368 if (priv->np_na != NULL) { /* thread already registered */ 1369 error = netmap_set_ringid(priv, nmr->nr_ringid); 1370 break; 1371 } 1372 /* find the interface and a reference */ 1373 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 1374 if (error) 1375 break; 1376 ifp = na->ifp; 1377 if (NETMAP_OWNED_BY_KERN(na)) { 1378 netmap_adapter_put(na); 1379 error = EBUSY; 1380 break; 1381 } 1382 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); 1383 if (!nifp) { /* reg. 
failed, release priv and ref */ 1384 netmap_adapter_put(na); 1385 priv->np_nifp = NULL; 1386 break; 1387 } 1388 1389 /* return the offset of the netmap_if object */ 1390 nmr->nr_rx_rings = na->num_rx_rings; 1391 nmr->nr_tx_rings = na->num_tx_rings; 1392 nmr->nr_rx_slots = na->num_rx_desc; 1393 nmr->nr_tx_slots = na->num_tx_desc; 1394 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 1395 if (error) { 1396 netmap_adapter_put(na); 1397 break; 1398 } 1399 if (memflags & NETMAP_MEM_PRIVATE) { 1400 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1401 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 1402 } 1403 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 1404 } while (0); 1405 NMG_UNLOCK(); 1406 break; 1407 1408 case NIOCUNREGIF: 1409 // XXX we have no data here ? 1410 D("deprecated, data is %p", nmr); 1411 error = EINVAL; 1412 break; 1413 1414 case NIOCTXSYNC: 1415 case NIOCRXSYNC: 1416 nifp = priv->np_nifp; 1417 1418 if (nifp == NULL) { 1419 error = ENXIO; 1420 break; 1421 } 1422 rmb(); /* make sure following reads are not from cache */ 1423 1424 na = priv->np_na; /* we have a reference */ 1425 1426 if (na == NULL) { 1427 D("Internal error: nifp != NULL && na == NULL"); 1428 error = ENXIO; 1429 break; 1430 } 1431 1432 ifp = na->ifp; 1433 if (ifp == NULL) { 1434 RD(1, "the ifp is gone"); 1435 error = ENXIO; 1436 break; 1437 } 1438 1439 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 1440 if (cmd == NIOCTXSYNC) 1441 netmap_txsync_to_host(na); 1442 else 1443 netmap_rxsync_from_host(na, NULL, NULL); 1444 break; 1445 } 1446 /* find the last ring to scan */ 1447 lim = priv->np_qlast; 1448 if (lim == NETMAP_HW_RING) 1449 lim = (cmd == NIOCTXSYNC) ? 1450 na->num_tx_rings : na->num_rx_rings; 1451 1452 krings = (cmd == NIOCTXSYNC) ? 
na->tx_rings : na->rx_rings; 1453 for (i = priv->np_qfirst; i < lim; i++) { 1454 struct netmap_kring *kring = krings + i; 1455 if (nm_kr_tryget(kring)) { 1456 error = EBUSY; 1457 goto out; 1458 } 1459 if (cmd == NIOCTXSYNC) { 1460 if (netmap_verbose & NM_VERB_TXSYNC) 1461 D("pre txsync ring %d cur %d hwcur %d", 1462 i, kring->ring->cur, 1463 kring->nr_hwcur); 1464 na->nm_txsync(na, i, NAF_FORCE_RECLAIM); 1465 if (netmap_verbose & NM_VERB_TXSYNC) 1466 D("post txsync ring %d cur %d hwcur %d", 1467 i, kring->ring->cur, 1468 kring->nr_hwcur); 1469 } else { 1470 na->nm_rxsync(na, i, NAF_FORCE_READ); 1471 microtime(&na->rx_rings[i].ring->ts); 1472 } 1473 nm_kr_put(kring); 1474 } 1475 1476 break; 1477 case BIOCIMMEDIATE: 1478 case BIOCGHDRCMPLT: 1479 case BIOCSHDRCMPLT: 1480 case BIOCSSEESENT: 1481 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 1482 break; 1483 1484 default: /* allow device-specific ioctls */ 1485 { 1486 struct socket so; 1487 1488 bzero(&so, sizeof(so)); 1489 NMG_LOCK(); 1490 error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ 1491 if (error) { 1492 netmap_adapter_put(na); 1493 NMG_UNLOCK(); 1494 break; 1495 } 1496 ifp = na->ifp; 1497 // so->so_proto not null. 
1498 error = ifioctl(&so, cmd, ap->a_data, ap->a_cred); 1499 netmap_adapter_put(na); 1500 NMG_UNLOCK(); 1501 break; 1502 } 1503 } 1504 out: 1505 1506 return (error); 1507 } 1508 1509 static int 1510 netmap_kqfilter_event(struct knote *kn, long hint) 1511 { 1512 return (0); 1513 } 1514 1515 static void 1516 netmap_kqfilter_detach(struct knote *kn) 1517 { 1518 } 1519 1520 static struct filterops netmap_kqfilter_ops = { 1521 FILTEROP_ISFD, NULL, netmap_kqfilter_detach, netmap_kqfilter_event, 1522 }; 1523 1524 int 1525 netmap_kqfilter(struct dev_kqfilter_args *ap) 1526 { 1527 struct knote *kn = ap->a_kn; 1528 1529 ap->a_result = 0; 1530 1531 switch (kn->kn_filter) { 1532 case EVFILT_READ: 1533 case EVFILT_WRITE: 1534 kn->kn_fop = &netmap_kqfilter_ops; 1535 break; 1536 default: 1537 ap->a_result = EOPNOTSUPP; 1538 return (0); 1539 } 1540 1541 return (0); 1542 } 1543 1544 /* 1545 * select(2) and poll(2) handlers for the "netmap" device. 1546 * 1547 * Can be called for one or more queues. 1548 * Return true the event mask corresponding to ready events. 1549 * If there are no ready events, do a selrecord on either individual 1550 * selinfo or on the global one. 1551 * Device-dependent parts (locking and sync of tx/rx rings) 1552 * are done through callbacks. 1553 * 1554 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 1555 * The first one is remapped to pwait as selrecord() uses the name as an 1556 * hidden argument. 
1557 */ 1558 static int 1559 netmap_poll(struct cdev *dev, int events, struct thread *td) 1560 { 1561 struct netmap_priv_d *priv = NULL; 1562 struct netmap_adapter *na; 1563 struct ifnet *ifp; 1564 struct netmap_kring *kring; 1565 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 1566 u_int lim_tx, lim_rx, host_forwarded = 0; 1567 struct mbq q; 1568 void *pwait = dev; /* linux compatibility */ 1569 1570 /* 1571 * In order to avoid nested locks, we need to "double check" 1572 * txsync and rxsync if we decide to do a selrecord(). 1573 * retry_tx (and retry_rx, later) prevent looping forever. 1574 */ 1575 int retry_tx = 1; 1576 1577 (void)pwait; 1578 mbq_init(&q); 1579 1580 priv = dev->si_drv1; 1581 1582 if (priv->np_nifp == NULL) { 1583 D("No if registered"); 1584 return POLLERR; 1585 } 1586 rmb(); /* make sure following reads are not from cache */ 1587 1588 na = priv->np_na; 1589 ifp = na->ifp; 1590 // check for deleted 1591 if (ifp == NULL) { 1592 RD(1, "the ifp is gone"); 1593 return POLLERR; 1594 } 1595 1596 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 1597 return POLLERR; 1598 1599 if (netmap_verbose & 0x8000) 1600 D("device %s events 0x%x", NM_IFPNAME(ifp), events); 1601 want_tx = events & (POLLOUT | POLLWRNORM); 1602 want_rx = events & (POLLIN | POLLRDNORM); 1603 1604 lim_tx = na->num_tx_rings; 1605 lim_rx = na->num_rx_rings; 1606 1607 if (priv->np_qfirst == NETMAP_SW_RING) { 1608 /* handle the host stack ring */ 1609 if (priv->np_txpoll || want_tx) { 1610 /* push any packets up, then we are always ready */ 1611 netmap_txsync_to_host(na); 1612 revents |= want_tx; 1613 } 1614 if (want_rx) { 1615 kring = &na->rx_rings[lim_rx]; 1616 if (kring->ring->avail == 0) 1617 netmap_rxsync_from_host(na, td, dev); 1618 if (kring->ring->avail > 0) { 1619 revents |= want_rx; 1620 } 1621 } 1622 return (revents); 1623 } 1624 1625 /* 1626 * If we are in transparent mode, check also the host rx ring 1627 * XXX Transparent mode at the moment requires to bind all 
1628 * rings to a single file descriptor. 1629 */ 1630 kring = &na->rx_rings[lim_rx]; 1631 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 1632 && want_rx 1633 && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { 1634 if (kring->ring->avail == 0) 1635 netmap_rxsync_from_host(na, td, dev); 1636 if (kring->ring->avail > 0) 1637 revents |= want_rx; 1638 } 1639 1640 /* 1641 * check_all_{tx|rx} are set if the card has more than one queue AND 1642 * the file descriptor is bound to all of them. If so, we sleep on 1643 * the "global" selinfo, otherwise we sleep on individual selinfo 1644 * (FreeBSD only allows two selinfo's per file descriptor). 1645 * The interrupt routine in the driver wake one or the other 1646 * (or both) depending on which clients are active. 1647 * 1648 * rxsync() is only called if we run out of buffers on a POLLIN. 1649 * txsync() is called if we run out of buffers on POLLOUT, or 1650 * there are pending packets to send. The latter can be disabled 1651 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 1652 */ 1653 check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); 1654 check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); 1655 1656 if (priv->np_qlast != NETMAP_HW_RING) { 1657 lim_tx = lim_rx = priv->np_qlast; 1658 } 1659 1660 /* 1661 * We start with a lock free round which is cheap if we have 1662 * slots available. If this fails, then lock and call the sync 1663 * routines. 
1664 */ 1665 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 1666 kring = &na->rx_rings[i]; 1667 if (kring->ring->avail > 0) { 1668 revents |= want_rx; 1669 want_rx = 0; /* also breaks the loop */ 1670 } 1671 } 1672 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 1673 kring = &na->tx_rings[i]; 1674 if (kring->ring->avail > 0) { 1675 revents |= want_tx; 1676 want_tx = 0; /* also breaks the loop */ 1677 } 1678 } 1679 1680 /* 1681 * If we to push packets out (priv->np_txpoll) or want_tx is 1682 * still set, we do need to run the txsync calls (on all rings, 1683 * to avoid that the tx rings stall). 1684 * XXX should also check cur != hwcur on the tx rings. 1685 * Fortunately, normal tx mode has np_txpoll set. 1686 */ 1687 if (priv->np_txpoll || want_tx) { 1688 /* If we really want to be woken up (want_tx), 1689 * do a selrecord, either on the global or on 1690 * the private structure. Then issue the txsync 1691 * so there is no race in the selrecord/selwait 1692 */ 1693 flush_tx: 1694 for (i = priv->np_qfirst; i < lim_tx; i++) { 1695 kring = &na->tx_rings[i]; 1696 /* 1697 * Skip this ring if want_tx == 0 1698 * (we have already done a successful sync on 1699 * a previous ring) AND kring->cur == kring->hwcur 1700 * (there are no pending transmissions for this ring). 1701 */ 1702 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 1703 continue; 1704 /* make sure only one user thread is doing this */ 1705 if (nm_kr_tryget(kring)) { 1706 ND("ring %p busy is %d", 1707 kring, (int)kring->nr_busy); 1708 revents |= POLLERR; 1709 goto out; 1710 } 1711 1712 if (netmap_verbose & NM_VERB_TXSYNC) 1713 D("send %d on %s %d", 1714 kring->ring->cur, NM_IFPNAME(ifp), i); 1715 if (na->nm_txsync(na, i, 0)) 1716 revents |= POLLERR; 1717 1718 /* Check avail/call selrecord only if called with POLLOUT */ 1719 if (want_tx) { 1720 if (kring->ring->avail > 0) { 1721 /* stop at the first ring. We don't risk 1722 * starvation. 
1723 */ 1724 revents |= want_tx; 1725 want_tx = 0; 1726 } 1727 } 1728 nm_kr_put(kring); 1729 } 1730 if (want_tx && retry_tx) { 1731 KNOTE(check_all_tx ? &na->tx_si.ki_note : 1732 &na->tx_rings[priv->np_qfirst].si.ki_note, 0); 1733 retry_tx = 0; 1734 goto flush_tx; 1735 } 1736 } 1737 1738 /* 1739 * now if want_rx is still set we need to lock and rxsync. 1740 * Do it on all rings because otherwise we starve. 1741 */ 1742 if (want_rx) { 1743 int retry_rx = 1; 1744 do_retry_rx: 1745 for (i = priv->np_qfirst; i < lim_rx; i++) { 1746 kring = &na->rx_rings[i]; 1747 1748 if (nm_kr_tryget(kring)) { 1749 revents |= POLLERR; 1750 goto out; 1751 } 1752 1753 /* XXX NR_FORWARD should only be read on 1754 * physical or NIC ports 1755 */ 1756 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 1757 ND(10, "forwarding some buffers up %d to %d", 1758 kring->nr_hwcur, kring->ring->cur); 1759 netmap_grab_packets(kring, &q, netmap_fwd); 1760 } 1761 1762 if (na->nm_rxsync(na, i, 0)) 1763 revents |= POLLERR; 1764 if (netmap_no_timestamp == 0 || 1765 kring->ring->flags & NR_TIMESTAMP) { 1766 microtime(&kring->ring->ts); 1767 } 1768 1769 if (kring->ring->avail > 0) { 1770 revents |= want_rx; 1771 retry_rx = 0; 1772 } 1773 nm_kr_put(kring); 1774 } 1775 if (retry_rx) { 1776 retry_rx = 0; 1777 KNOTE(check_all_rx ? &na->rx_si.ki_note : 1778 &na->rx_rings[priv->np_qfirst].si.ki_note, 0); 1779 goto do_retry_rx; 1780 } 1781 } 1782 1783 /* forward host to the netmap ring. 1784 * I am accessing nr_hwavail without lock, but netmap_transmit 1785 * can only increment it, so the operation is safe. 
1786 */ 1787 kring = &na->rx_rings[lim_rx]; 1788 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 1789 && (netmap_fwd || kring->ring->flags & NR_FORWARD) 1790 && kring->nr_hwavail > 0 && !host_forwarded) { 1791 netmap_sw_to_nic(na); 1792 host_forwarded = 1; /* prevent another pass */ 1793 want_rx = 0; 1794 goto flush_tx; 1795 } 1796 1797 if (q.head) 1798 netmap_send_up(na->ifp, &q); 1799 1800 out: 1801 1802 return (revents); 1803 } 1804 1805 /*------- driver support routines ------*/ 1806 1807 static int netmap_hw_krings_create(struct netmap_adapter *); 1808 1809 static int 1810 netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags) 1811 { 1812 struct netmap_kring *kring; 1813 1814 if (tx == NR_TX) { 1815 kring = na->tx_rings + n_ring; 1816 KNOTE(&kring->si.ki_note, 0); 1817 wakeup(&kring->si.ki_note); 1818 if (flags & NAF_GLOBAL_NOTIFY) 1819 wakeup(&na->tx_si.ki_note); 1820 } else { 1821 kring = na->rx_rings + n_ring; 1822 KNOTE(&kring->si.ki_note, 0); 1823 wakeup(&kring->si.ki_note); 1824 if (flags & NAF_GLOBAL_NOTIFY) 1825 wakeup(&na->rx_si.ki_note); 1826 } 1827 return 0; 1828 } 1829 1830 1831 // XXX check handling of failures 1832 int 1833 netmap_attach_common(struct netmap_adapter *na) 1834 { 1835 struct ifnet *ifp = na->ifp; 1836 1837 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { 1838 D("%s: invalid rings tx %d rx %d", 1839 ifp->if_xname, na->num_tx_rings, na->num_rx_rings); 1840 return EINVAL; 1841 } 1842 WNA(ifp) = na; 1843 NETMAP_SET_CAPABLE(ifp); 1844 if (na->nm_krings_create == NULL) { 1845 na->nm_krings_create = netmap_hw_krings_create; 1846 na->nm_krings_delete = netmap_krings_delete; 1847 } 1848 if (na->nm_notify == NULL) 1849 na->nm_notify = netmap_notify; 1850 na->active_fds = 0; 1851 1852 if (na->nm_mem == NULL) 1853 na->nm_mem = &nm_mem; 1854 return 0; 1855 } 1856 1857 1858 void 1859 netmap_detach_common(struct netmap_adapter *na) 1860 { 1861 if (na->ifp) 1862 WNA(na->ifp) = NULL; /* XXX do we need this? 
*/ 1863 1864 if (na->tx_rings) { /* XXX should not happen */ 1865 D("freeing leftover tx_rings"); 1866 na->nm_krings_delete(na); 1867 } 1868 if (na->na_flags & NAF_MEM_OWNER) 1869 netmap_mem_private_delete(na->nm_mem); 1870 bzero(na, sizeof(*na)); 1871 kfree(na, M_DEVBUF); 1872 } 1873 1874 1875 /* 1876 * Initialize a ``netmap_adapter`` object created by driver on attach. 1877 * We allocate a block of memory with room for a struct netmap_adapter 1878 * plus two sets of N+2 struct netmap_kring (where N is the number 1879 * of hardware rings): 1880 * krings 0..N-1 are for the hardware queues. 1881 * kring N is for the host stack queue 1882 * kring N+1 is only used for the selinfo for all queues. 1883 * Return 0 on success, ENOMEM otherwise. 1884 * 1885 * By default the receive and transmit adapter ring counts are both initialized 1886 * to num_queues. na->num_tx_rings can be set for cards with different tx/rx 1887 * setups. 1888 */ 1889 int 1890 netmap_attach(struct netmap_adapter *arg) 1891 { 1892 struct netmap_hw_adapter *hwna = NULL; 1893 // XXX when is arg == NULL ? 1894 struct ifnet *ifp = arg ? arg->ifp : NULL; 1895 1896 if (arg == NULL || ifp == NULL) 1897 goto fail; 1898 hwna = kmalloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); 1899 if (hwna == NULL) 1900 goto fail; 1901 hwna->up = *arg; 1902 if (netmap_attach_common(&hwna->up)) { 1903 kfree(hwna, M_DEVBUF); 1904 goto fail; 1905 } 1906 netmap_adapter_get(&hwna->up); 1907 1908 D("success for %s", NM_IFPNAME(ifp)); 1909 return 0; 1910 1911 fail: 1912 D("fail, arg %p ifp %p na %p", arg, ifp, hwna); 1913 netmap_detach(ifp); 1914 return (hwna ? 
EINVAL : ENOMEM); 1915 } 1916 1917 1918 void 1919 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) 1920 { 1921 if (!na) { 1922 return; 1923 } 1924 1925 refcount_acquire(&na->na_refcount); 1926 } 1927 1928 1929 /* returns 1 iff the netmap_adapter is destroyed */ 1930 int 1931 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) 1932 { 1933 if (!na) 1934 return 1; 1935 1936 if (!refcount_release(&na->na_refcount)) 1937 return 0; 1938 1939 if (na->nm_dtor) 1940 na->nm_dtor(na); 1941 1942 netmap_detach_common(na); 1943 1944 return 1; 1945 } 1946 1947 1948 int 1949 netmap_hw_krings_create(struct netmap_adapter *na) 1950 { 1951 return netmap_krings_create(na, 1952 na->num_tx_rings + 1, na->num_rx_rings + 1, 0); 1953 } 1954 1955 1956 1957 /* 1958 * Free the allocated memory linked to the given ``netmap_adapter`` 1959 * object. 1960 */ 1961 void 1962 netmap_detach(struct ifnet *ifp) 1963 { 1964 struct netmap_adapter *na = NA(ifp); 1965 1966 if (!na) 1967 return; 1968 1969 NMG_LOCK(); 1970 netmap_disable_all_rings(ifp); 1971 netmap_adapter_put(na); 1972 na->ifp = NULL; 1973 netmap_enable_all_rings(ifp); 1974 NMG_UNLOCK(); 1975 } 1976 1977 1978 /* 1979 * Intercept packets from the network stack and pass them 1980 * to netmap as incoming packets on the 'software' ring. 1981 * We rely on the OS to make sure that the ifp and na do not go 1982 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 1983 * In nm_register() or whenever there is a reinitialization, 1984 * we make sure to access the core lock and per-ring locks 1985 * so that IFCAP_NETMAP is visible here. 
 */
int
netmap_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;
	u_int i, len = MBUF_LEN(m);
	u_int error = EBUSY, lim;
	struct netmap_slot *slot;

	// XXX [Linux] we do not need this lock
	// if we follow the down/configure/up protocol -gl
	// mtx_lock(&na->core_lock);
	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
		/* interface not in netmap mode anymore */
		error = ENXIO;
		goto done;
	}

	/* the host stack ring is the one past the last hardware rx ring */
	kring = &na->rx_rings[na->num_rx_rings];
	lim = kring->nkr_num_slots - 1;
	if (netmap_verbose & NM_VERB_HOST)
		D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
			kring->nr_hwcur + kring->nr_hwavail, len);
	// XXX reconsider long packets if we handle fragments
	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
		goto done;
	}
	/* protect against other instances of netmap_transmit,
	 * and userspace invocations of rxsync().
	 */
	// XXX [Linux] there can be no other instances of netmap_transmit
	// on this same ring, but we still need this lock to protect
	// concurrent access from netmap_sw_to_nic() -gl
	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
	if (kring->nr_hwavail >= lim) {
		/* ring full: drop (error stays EBUSY) */
		if (netmap_verbose)
			D("stack ring %s full\n", NM_IFPNAME(ifp));
	} else {
		/* compute the insert position */
		i = nm_kr_rxpos(kring);
		slot = &kring->ring->slot[i];
		m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
		slot->len = len;
		slot->flags = kring->nkr_slot_flags;
		kring->nr_hwavail++;
		if (netmap_verbose & NM_VERB_HOST)
			D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings);
		na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
		error = 0;
	}
	lockmgr(&kring->q_lock, LK_RELEASE);

done:
	// mtx_unlock(&na->core_lock);

	/* release the mbuf in either cases of success or failure. As an
	 * alternative, put the mbuf in a free list and free the list
	 * only when really necessary.
	 */
	m_freem(m);

	return (error);
}


/*
 * netmap_reset() is called by the driver routines when reinitializing
 * a ring. The driver is in charge of locking to protect the kring.
 * If native netmap mode is not set just return NULL.
 */
struct netmap_slot *
netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
	u_int new_cur)
{
	struct netmap_kring *kring;
	int new_hwofs, lim;

	if (na == NULL) {
		D("NULL na, should not happen");
		return NULL;	/* no netmap support here */
	}
	if (!(na->ifp->if_capenable & IFCAP_NETMAP) || nma_is_generic(na)) {
		ND("interface not in netmap mode");
		return NULL;	/* nothing to reinitialize */
	}

	/* XXX note- in the new scheme, we are not guaranteed to be
	 * under lock (e.g. when called on a device reset).
	 * In this case, we should set a flag and do not trust too
	 * much the values. In practice: TODO
	 * - set a RESET flag somewhere in the kring
	 * - do the processing in a conservative way
	 * - let the *sync() fixup at the end.
	 */
	if (tx == NR_TX) {
		if (n >= na->num_tx_rings)
			return NULL;
		kring = na->tx_rings + n;
		new_hwofs = kring->nr_hwcur - new_cur;
	} else {
		if (n >= na->num_rx_rings)
			return NULL;
		kring = na->rx_rings + n;
		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
	}
	lim = kring->nkr_num_slots - 1;
	/* wrap the offset back into [.., lim] */
	if (new_hwofs > lim)
		new_hwofs -= lim + 1;

	/* Always set the new offset value and realign the ring. */
	D("%s hwofs %d -> %d, hwavail %d -> %d",
		tx == NR_TX ? "TX" : "RX",
		kring->nkr_hwofs, new_hwofs,
		kring->nr_hwavail,
		tx == NR_TX ? lim : kring->nr_hwavail);
	kring->nkr_hwofs = new_hwofs;
	if (tx == NR_TX)
		kring->nr_hwavail = lim;
	kring->nr_hwreserved = 0;

	/*
	 * Wakeup on the individual and global selwait
	 * We do the wakeup here, but the ring is not yet reconfigured.
	 * However, we are under lock so there are no races.
	 */
	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
	return kring->ring->slot;
}


/*
 * Default functions to handle rx/tx interrupts from a physical device.
 * "work_done" is non-null on the RX path, NULL for the TX path.
 * "generic" is 0 when we are called by a device driver, and 1 when we
 * are called by the generic netmap adapter layer.
 * We rely on the OS to make sure that there is only one active
 * instance per queue, and that there is appropriate locking.
 *
 * If the card is not in netmap mode, simply return 0,
 * so that the caller proceeds with regular processing.
 *
 * We return 0 also when the card is in netmap mode but the current
 * netmap adapter is the generic one, because this function will be
 * called by the generic layer.
 *
 * If the card is connected to a netmap file descriptor,
 * do a selwakeup on the individual queue, plus one on the global one
 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return 1.
 *
 * Finally, if called on rx from an interface connected to a switch,
 * calls the proper forwarding routine, and return 1.
 */
int
netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;

	q &= NETMAP_RING_MASK;

	if (netmap_verbose) {
		RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
	}

	if (work_done) { /* RX path */
		if (q >= na->num_rx_rings)
			return 0;	// not a physical queue
		kring = na->rx_rings + q;
		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
		na->nm_notify(na, q, NR_RX,
			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
		*work_done = 1; /* do not fire napi again */
	} else { /* TX path */
		if (q >= na->num_tx_rings)
			return 0;	// not a physical queue
		kring = na->tx_rings + q;
		na->nm_notify(na, q, NR_TX,
			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
	}
	return 1;
}

/*
 * Default functions to handle rx/tx interrupts from a physical device.
 * "work_done" is non-null on the RX path, NULL for the TX path.
 * "generic" is 0 when we are called by a device driver, and 1 when we
 * are called by the generic netmap adapter layer.
 * We rely on the OS to make sure that there is only one active
 * instance per queue, and that there is appropriate locking.
 *
 * If the card is not in netmap mode, simply return 0,
 * so that the caller proceeds with regular processing.
 *
 * If the card is connected to a netmap file descriptor,
 * do a selwakeup on the individual queue, plus one on the global one
 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return 1.
 *
 * Finally, if called on rx from an interface connected to a switch,
 * calls the proper forwarding routine, and return 1.
 */
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	// XXX could we check NAF_NATIVE_ON ?
	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
		ND("use regular interrupt");
		return 0;
	}

	return netmap_common_irq(ifp, q, work_done);
}


static struct cdev *netmap_dev; /* /dev/netmap character device. */


/*
 * Module loader.
 *
 * Create the /dev/netmap device and initialize all global
 * variables.
 *
 * Return 0 on success, errno on failure.
 */
int
netmap_init(void)
{
	int error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0) {
		kprintf("netmap: unable to initialize the memory allocator.\n");
		return (error);
	}
	kprintf("netmap: loaded module\n");
	/* NOTE(review): make_dev() result is not checked here — presumably
	 * it cannot fail on this platform; confirm.
	 */
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
			      "netmap");

	netmap_init_bridges();
	return (error);
}


/*
 * Module unloader.
 *
 * Free all the memory, and destroy the ``/dev/netmap`` device.
 */
void
netmap_fini(void)
{
	destroy_dev(netmap_dev);
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	kprintf("netmap: unloaded module.\n");
}