/*
 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *

		SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.
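
A minimal userspace sketch of the access pattern in steps 1-6 above
(illustrative only, error handling omitted; "em0" and the queue index
are placeholders, macros and struct fields are those of netmap(4) and
net/netmap_user.h):

	int fd = open("/dev/netmap", O_RDWR);		// step 1
	struct nmreq req;

	bzero(&req, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	ioctl(fd, NIOCREGIF, &req);			// step 2: bind to em0
	char *mem = mmap(0, req.nr_memsize,		// step 3
	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
	struct netmap_ring *txring = NETMAP_TXRING(nifp, 0);
	// step 4: fill txring->slot[], advance txring->cur, decrease avail
	ioctl(fd, NIOCTXSYNC, NULL);			// step 5
	poll(...);					// step 6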

		LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination;

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


	--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */

/*
 * OS-specific code that is used only within this file.
127 * Other OS-specific code that must be accessed by drivers 128 * is present in netmap_kern.h 129 */ 130 131 #include <sys/cdefs.h> /* prerequisite */ 132 __FBSDID("$FreeBSD: head/sys/dev/netmap/netmap.c 257176 2013-10-26 17:58:36Z glebius $"); 133 134 #include <sys/types.h> 135 #include <sys/errno.h> 136 #include <sys/param.h> /* defines used in kernel.h */ 137 #include <sys/kernel.h> /* types used in module initialization */ 138 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 139 #include <sys/devfs.h> 140 #include <sys/sockio.h> 141 #include <sys/socketvar.h> /* struct socket */ 142 #include <sys/malloc.h> 143 #include <sys/poll.h> 144 #include <sys/lock.h> 145 #include <sys/socket.h> /* sockaddrs */ 146 #include <sys/event.h> 147 #include <sys/sysctl.h> 148 #include <net/if.h> 149 #include <net/if_var.h> 150 #include <net/bpf.h> /* BIOCIMMEDIATE */ 151 #include <sys/bus.h> /* bus_dmamap_* */ 152 #include <sys/endian.h> 153 #include <sys/refcount.h> 154 155 /* reduce conditional code */ 156 #define init_waitqueue_head(x) // only needed in linux 157 158 extern struct dev_ops netmap_cdevsw; 159 160 /* 161 * common headers 162 */ 163 #include <net/netmap.h> 164 #include <net/netmap/netmap_kern.h> 165 #include <net/netmap/netmap_mem2.h> 166 167 168 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 169 170 /* 171 * The following variables are used by the drivers and replicate 172 * fields in the global memory pool. They only refer to buffers 173 * used by physical interfaces. 174 */ 175 u_int netmap_total_buffers; 176 u_int netmap_buf_size; 177 char *netmap_buffer_base; /* also address of an invalid buffer */ 178 179 /* user-controlled variables */ 180 int netmap_verbose; 181 182 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 183 184 SYSCTL_NODE(_net, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 185 SYSCTL_INT(_net_netmap, OID_AUTO, verbose, 186 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 187 SYSCTL_INT(_net_netmap, OID_AUTO, no_timestamp, 188 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 189 int netmap_mitigate = 1; 190 SYSCTL_INT(_net_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 191 int netmap_no_pendintr = 1; 192 SYSCTL_INT(_net_netmap, OID_AUTO, no_pendintr, 193 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 194 int netmap_txsync_retry = 2; 195 SYSCTL_INT(_net_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 196 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 197 198 int netmap_flags = 0; /* debug flags */ 199 int netmap_fwd = 0; /* force transparent mode */ 200 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 201 202 /* 203 * netmap_admode selects the netmap mode to use. 204 * Invalid values are reset to NETMAP_ADMODE_BEST 205 */ 206 enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ 207 NETMAP_ADMODE_NATIVE, /* either native or none */ 208 NETMAP_ADMODE_GENERIC, /* force generic */ 209 NETMAP_ADMODE_LAST }; 210 #define NETMAP_ADMODE_NATIVE 1 /* Force native netmap adapter. */ 211 #define NETMAP_ADMODE_GENERIC 2 /* Force generic netmap adapter. */ 212 #define NETMAP_ADMODE_BEST 0 /* Priority to native netmap adapter. */ 213 static int netmap_admode = NETMAP_ADMODE_BEST; 214 215 int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ 216 int netmap_generic_ringsize = 1024; /* Generic ringsize. 
*/ 217 218 SYSCTL_INT(_net_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 219 SYSCTL_INT(_net_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 220 SYSCTL_INT(_net_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 221 SYSCTL_INT(_net_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); 222 SYSCTL_INT(_net_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); 223 SYSCTL_INT(_net_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); 224 225 NMG_LOCK_T netmap_global_lock; 226 227 228 static void 229 nm_kr_get(struct netmap_kring *kr) 230 { 231 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 232 tsleep(kr, 0, "NM_KR_GET", 4); 233 } 234 235 236 void 237 netmap_disable_ring(struct netmap_kring *kr) 238 { 239 kr->nkr_stopped = 1; 240 nm_kr_get(kr); 241 lockmgr(&kr->q_lock, LK_EXCLUSIVE); 242 lockmgr(&kr->q_lock, LK_RELEASE); 243 nm_kr_put(kr); 244 } 245 246 247 static void 248 netmap_set_all_rings(struct ifnet *ifp, int stopped) 249 { 250 struct netmap_adapter *na; 251 int i; 252 253 if (!(ifp->if_capenable & IFCAP_NETMAP)) 254 return; 255 256 na = NA(ifp); 257 258 for (i = 0; i <= na->num_tx_rings; i++) { 259 if (stopped) 260 netmap_disable_ring(na->tx_rings + i); 261 else 262 na->tx_rings[i].nkr_stopped = 0; 263 na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY | 264 (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0)); 265 } 266 267 for (i = 0; i <= na->num_rx_rings; i++) { 268 if (stopped) 269 netmap_disable_ring(na->rx_rings + i); 270 else 271 na->rx_rings[i].nkr_stopped = 0; 272 na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY | 273 (i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0)); 274 } 275 } 276 277 278 void 279 netmap_disable_all_rings(struct ifnet *ifp) 280 { 281 netmap_set_all_rings(ifp, 1 /* stopped */); 282 } 283 284 285 void 286 netmap_enable_all_rings(struct ifnet *ifp) 287 { 288 netmap_set_all_rings(ifp, 0 /* enabled */); 289 } 290 291 292 /* 293 * generic bound_checking function 294 */ 295 u_int 296 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 297 { 298 u_int oldv = *v; 299 const char *op = NULL; 300 301 if (dflt < lo) 302 dflt = lo; 303 if (dflt > hi) 304 dflt = hi; 305 if (oldv < lo) { 306 *v = dflt; 307 op = "Bump"; 308 } else if (oldv > hi) { 309 *v = hi; 310 op = "Clamp"; 311 } 312 if (op && msg) 313 kprintf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 314 return *v; 315 } 316 317 318 /* 319 * packet-dump function, user-supplied or static buffer. 320 * The destination buffer must be at least 30+4*len 321 */ 322 const char * 323 nm_dump_buf(char *p, int len, int lim, char *dst) 324 { 325 static char _dst[8192]; 326 int i, j, i0; 327 static char hex[] ="0123456789abcdef"; 328 char *o; /* output position */ 329 330 #define P_HI(x) hex[((x) & 0xf0)>>4] 331 #define P_LO(x) hex[((x) & 0xf)] 332 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? 
(x) : '.') 333 if (!dst) 334 dst = _dst; 335 if (lim <= 0 || lim > len) 336 lim = len; 337 o = dst; 338 ksprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 339 o += strlen(o); 340 /* hexdump routine */ 341 for (i = 0; i < lim; ) { 342 ksprintf(o, "%5d: ", i); 343 o += strlen(o); 344 memset(o, ' ', 48); 345 i0 = i; 346 for (j=0; j < 16 && i < lim; i++, j++) { 347 o[j*3] = P_HI(p[i]); 348 o[j*3+1] = P_LO(p[i]); 349 } 350 i = i0; 351 for (j=0; j < 16 && i < lim; i++, j++) 352 o[j + 48] = P_C(p[i]); 353 o[j+48] = '\n'; 354 o += j+49; 355 } 356 *o = '\0'; 357 #undef P_HI 358 #undef P_LO 359 #undef P_C 360 return dst; 361 } 362 363 364 365 /* 366 * Fetch configuration from the device, to cope with dynamic 367 * reconfigurations after loading the module. 368 */ 369 int 370 netmap_update_config(struct netmap_adapter *na) 371 { 372 struct ifnet *ifp = na->ifp; 373 u_int txr, txd, rxr, rxd; 374 375 txr = txd = rxr = rxd = 0; 376 if (na->nm_config) { 377 na->nm_config(na, &txr, &txd, &rxr, &rxd); 378 } else { 379 /* take whatever we had at init time */ 380 txr = na->num_tx_rings; 381 txd = na->num_tx_desc; 382 rxr = na->num_rx_rings; 383 rxd = na->num_rx_desc; 384 } 385 386 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 387 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 388 return 0; /* nothing changed */ 389 if (netmap_verbose || na->active_fds > 0) { 390 D("stored config %s: txring %d x %d, rxring %d x %d", 391 NM_IFPNAME(ifp), 392 na->num_tx_rings, na->num_tx_desc, 393 na->num_rx_rings, na->num_rx_desc); 394 D("new config %s: txring %d x %d, rxring %d x %d", 395 NM_IFPNAME(ifp), txr, txd, rxr, rxd); 396 } 397 if (na->active_fds == 0) { 398 D("configuration changed (but fine)"); 399 na->num_tx_rings = txr; 400 na->num_tx_desc = txd; 401 na->num_rx_rings = rxr; 402 na->num_rx_desc = rxd; 403 return 0; 404 } 405 D("configuration changed while active, this is bad..."); 406 return 1; 407 } 408 409 410 int 411 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom) 412 { 413 u_int i, len, ndesc; 414 struct netmap_kring *kring; 415 416 len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; 417 418 na->tx_rings = kmalloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); 419 if (na->tx_rings == NULL) { 420 D("Cannot allocate krings"); 421 return ENOMEM; 422 } 423 na->rx_rings = na->tx_rings + ntx; 424 425 ndesc = na->num_tx_desc; 426 for (i = 0; i < ntx; i++) { /* Transmit rings */ 427 kring = &na->tx_rings[i]; 428 bzero(kring, sizeof(*kring)); 429 kring->na = na; 430 kring->nkr_num_slots = ndesc; 431 /* 432 * IMPORTANT: 433 * Always keep one slot empty, so we can detect new 434 * transmissions comparing cur and nr_hwcur (they are 435 * the same only if there are no new transmissions). 
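		 *
		 * A worked example of this invariant (illustrative numbers
		 * only): with nkr_num_slots == 256, at most 255 slots can
		 * ever be in flight, which is why nr_hwavail is initialized
		 * to ndesc - 1 just below; cur == nr_hwcur then always means
		 * "no new transmissions", and a fully used ring still keeps
		 * the two indexes distinct.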
436 */ 437 kring->nr_hwavail = ndesc - 1; 438 lockinit(&kring->q_lock, "nm_txq_lock", 0, LK_CANRECURSE); 439 init_waitqueue_head(&kring->si); 440 } 441 442 ndesc = na->num_rx_desc; 443 for (i = 0; i < nrx; i++) { /* Receive rings */ 444 kring = &na->rx_rings[i]; 445 bzero(kring, sizeof(*kring)); 446 kring->na = na; 447 kring->nkr_num_slots = ndesc; 448 lockinit(&kring->q_lock, "nm_rxq_lock", 0, LK_CANRECURSE); 449 init_waitqueue_head(&kring->si); 450 } 451 init_waitqueue_head(&na->tx_si); 452 init_waitqueue_head(&na->rx_si); 453 454 na->tailroom = na->rx_rings + nrx; 455 456 return 0; 457 458 } 459 460 461 void 462 netmap_krings_delete(struct netmap_adapter *na) 463 { 464 int i; 465 466 for (i = 0; i < na->num_tx_rings + 1; i++) { 467 lockuninit(&na->tx_rings[i].q_lock); 468 } 469 for (i = 0; i < na->num_rx_rings + 1; i++) { 470 lockuninit(&na->rx_rings[i].q_lock); 471 } 472 kfree(na->tx_rings, M_DEVBUF); 473 na->tx_rings = na->rx_rings = na->tailroom = NULL; 474 } 475 476 477 static struct netmap_if* 478 netmap_if_new(const char *ifname, struct netmap_adapter *na) 479 { 480 struct netmap_if *nifp; 481 482 if (netmap_update_config(na)) { 483 /* configuration mismatch, report and fail */ 484 return NULL; 485 } 486 487 if (na->active_fds) 488 goto final; 489 490 if (na->nm_krings_create(na)) 491 goto cleanup; 492 493 if (netmap_mem_rings_create(na)) 494 goto cleanup; 495 496 final: 497 498 nifp = netmap_mem_if_new(ifname, na); 499 if (nifp == NULL) 500 goto cleanup; 501 502 return (nifp); 503 504 cleanup: 505 506 if (na->active_fds == 0) { 507 netmap_mem_rings_delete(na); 508 na->nm_krings_delete(na); 509 } 510 511 return NULL; 512 } 513 514 515 /* grab a reference to the memory allocator, if we don't have one already. The 516 * reference is taken from the netmap_adapter registered with the priv. 517 * 518 */ 519 static int 520 netmap_get_memory_locked(struct netmap_priv_d* p) 521 { 522 struct netmap_mem_d *nmd; 523 int error = 0; 524 525 if (p->np_na == NULL) { 526 if (!netmap_mmap_unreg) 527 return ENODEV; 528 /* for compatibility with older versions of the API 529 * we use the global allocator when no interface has been 530 * registered 531 */ 532 nmd = &nm_mem; 533 } else { 534 nmd = p->np_na->nm_mem; 535 } 536 if (p->np_mref == NULL) { 537 error = netmap_mem_finalize(nmd); 538 if (!error) 539 p->np_mref = nmd; 540 } else if (p->np_mref != nmd) { 541 /* a virtual port has been registered, but previous 542 * syscalls already used the global allocator. 543 * We cannot continue 544 */ 545 error = ENODEV; 546 } 547 return error; 548 } 549 550 551 int 552 netmap_get_memory(struct netmap_priv_d* p) 553 { 554 int error; 555 NMG_LOCK(); 556 error = netmap_get_memory_locked(p); 557 NMG_UNLOCK(); 558 return error; 559 } 560 561 562 static int 563 netmap_have_memory_locked(struct netmap_priv_d* p) 564 { 565 return p->np_mref != NULL; 566 } 567 568 569 static void 570 netmap_drop_memory_locked(struct netmap_priv_d* p) 571 { 572 if (p->np_mref) { 573 netmap_mem_deref(p->np_mref); 574 p->np_mref = NULL; 575 } 576 } 577 578 579 /* 580 * File descriptor's private data destructor. 581 * 582 * Call nm_register(ifp,0) to stop netmap mode on the interface and 583 * revert to normal operation. We expect that np_na->ifp has not gone. 584 * The second argument is the nifp to work on. In some cases it is 585 * not attached yet to the netmap_priv_d so we need to pass it as 586 * a separate argument. 
587 */ 588 /* call with NMG_LOCK held */ 589 static void 590 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 591 { 592 struct netmap_adapter *na = priv->np_na; 593 struct ifnet *ifp = na->ifp; 594 595 NMG_LOCK_ASSERT(); 596 na->active_fds--; 597 if (na->active_fds <= 0) { /* last instance */ 598 599 if (netmap_verbose) 600 D("deleting last instance for %s", NM_IFPNAME(ifp)); 601 /* 602 * (TO CHECK) This function is only called 603 * when the last reference to this file descriptor goes 604 * away. This means we cannot have any pending poll() 605 * or interrupt routine operating on the structure. 606 * XXX The file may be closed in a thread while 607 * another thread is using it. 608 * Linux keeps the file opened until the last reference 609 * by any outstanding ioctl/poll or mmap is gone. 610 * FreeBSD does not track mmap()s (but we do) and 611 * wakes up any sleeping poll(). Need to check what 612 * happens if the close() occurs while a concurrent 613 * syscall is running. 614 */ 615 if (ifp) 616 na->nm_register(na, 0); /* off, clear IFCAP_NETMAP */ 617 /* Wake up any sleeping threads. netmap_poll will 618 * then return POLLERR 619 * XXX The wake up now must happen during *_down(), when 620 * we order all activities to stop. -gl 621 */ 622 /* XXX kqueue(9) needed; these will mirror knlist_init. */ 623 /* knlist_destroy(&na->tx_si.si_note); */ 624 /* knlist_destroy(&na->rx_si.si_note); */ 625 626 /* delete rings and buffers */ 627 netmap_mem_rings_delete(na); 628 na->nm_krings_delete(na); 629 } 630 /* delete the nifp */ 631 netmap_mem_if_delete(na, nifp); 632 } 633 634 635 /* 636 * returns 1 if this is the last instance and we can free priv 637 */ 638 int 639 netmap_dtor_locked(struct netmap_priv_d *priv) 640 { 641 struct netmap_adapter *na = priv->np_na; 642 643 /* 644 * np_refcount is the number of active mmaps on 645 * this file descriptor 646 */ 647 if (--priv->np_refcount > 0) { 648 return 0; 649 } 650 if (!na) { 651 return 1; //XXX is it correct? 652 } 653 netmap_do_unregif(priv, priv->np_nifp); 654 priv->np_nifp = NULL; 655 netmap_drop_memory_locked(priv); 656 if (priv->np_na) { 657 netmap_adapter_put(na); 658 priv->np_na = NULL; 659 } 660 return 1; 661 } 662 663 664 void 665 netmap_dtor(void *data) 666 { 667 struct netmap_priv_d *priv = data; 668 int last_instance; 669 670 NMG_LOCK(); 671 last_instance = netmap_dtor_locked(priv); 672 NMG_UNLOCK(); 673 if (last_instance) { 674 bzero(priv, sizeof(*priv)); /* for safety */ 675 kfree(priv, M_DEVBUF); 676 } 677 } 678 679 680 681 682 /* 683 * Handlers for synchronization of the queues from/to the host. 684 * Netmap has two operating modes: 685 * - in the default mode, the rings connected to the host stack are 686 * just another ring pair managed by userspace; 687 * - in transparent mode (XXX to be defined) incoming packets 688 * (from the host or the NIC) are marked as NS_FORWARD upon 689 * arrival, and the user application has a chance to reset the 690 * flag for packets that should be dropped. 691 * On the RXSYNC or poll(), packets in RX rings between 692 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 693 * to the other side. 694 * The transfer NIC --> host is relatively easy, just encapsulate 695 * into mbufs and we are done. The host --> NIC side is slightly 696 * harder because there might not be room in the tx ring so it 697 * might take a while before releasing the buffer. 
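 *
 * A hedged userspace sketch of the transparent-mode convention described
 * above (ring/slot fields as in netmap(4); keep_packet() is a made-up
 * application policy, and the receive loop is simplified):
 *
 *	uint32_t i = ring->cur;
 *	while (ring->avail > 0) {
 *		struct netmap_slot *slot = &ring->slot[i];
 *
 *		if (!keep_packet(NETMAP_BUF(ring, slot->buf_idx), slot->len))
 *			slot->flags &= ~NS_FORWARD;	// drop: clear the flag
 *		i = (i + 1) % ring->num_slots;
 *		ring->avail--;
 *	}
 *	ring->cur = i;
 *
 * On the next rxsync/poll the slots just released that still carry
 * NS_FORWARD are encapsulated and passed to the other side.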
698 */ 699 700 701 /* 702 * pass a chain of buffers to the host stack as coming from 'dst' 703 */ 704 static void 705 netmap_send_up(struct ifnet *dst, struct mbq *q) 706 { 707 struct mbuf *m; 708 709 /* send packets up, outside the lock */ 710 while ((m = mbq_dequeue(q)) != NULL) { 711 if (netmap_verbose & NM_VERB_HOST) 712 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 713 NM_SEND_UP(dst, m); 714 } 715 mbq_destroy(q); 716 } 717 718 719 /* 720 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 721 * Run from hwcur to cur - reserved 722 */ 723 static void 724 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 725 { 726 /* Take packets from hwcur to cur-reserved and pass them up. 727 * In case of no buffers we give up. At the end of the loop, 728 * the queue is drained in all cases. 729 * XXX handle reserved 730 */ 731 u_int lim = kring->nkr_num_slots - 1; 732 struct mbuf *m; 733 u_int k = kring->ring->cur, n = kring->ring->reserved; 734 struct netmap_adapter *na = kring->na; 735 736 /* compute the final position, ring->cur - ring->reserved */ 737 if (n > 0) { 738 if (k < n) 739 k += kring->nkr_num_slots; 740 k += n; 741 } 742 for (n = kring->nr_hwcur; n != k;) { 743 struct netmap_slot *slot = &kring->ring->slot[n]; 744 745 n = nm_next(n, lim); 746 if ((slot->flags & NS_FORWARD) == 0 && !force) 747 continue; 748 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { 749 D("bad pkt at %d len %d", n, slot->len); 750 continue; 751 } 752 slot->flags &= ~NS_FORWARD; // XXX needed ? 753 /* XXX adapt to the case of a multisegment packet */ 754 m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); 755 756 if (m == NULL) 757 break; 758 mbq_enqueue(q, m); 759 } 760 } 761 762 763 /* 764 * The host ring has packets from nr_hwcur to (cur - reserved) 765 * to be sent down to the NIC. 766 * We need to use the queue lock on the source (host RX ring) 767 * to protect against netmap_transmit. 768 * If the user is well behaved we do not need to acquire locks 769 * on the destination(s), 770 * so we only need to make sure that there are no panics because 771 * of user errors. 772 * XXX verify 773 * 774 * We scan the tx rings, which have just been 775 * flushed so nr_hwcur == cur. Pushing packets down means 776 * increment cur and decrement avail. 
777 * XXX to be verified 778 */ 779 static void 780 netmap_sw_to_nic(struct netmap_adapter *na) 781 { 782 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 783 struct netmap_kring *k1 = &na->tx_rings[0]; 784 u_int i, howmany, src_lim, dst_lim; 785 786 /* XXX we should also check that the carrier is on */ 787 if (kring->nkr_stopped) 788 return; 789 790 lockmgr(&kring->q_lock, LK_EXCLUSIVE); 791 792 if (kring->nkr_stopped) 793 goto out; 794 795 howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ 796 797 src_lim = kring->nkr_num_slots - 1; 798 for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { 799 ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); 800 dst_lim = k1->nkr_num_slots - 1; 801 while (howmany > 0 && k1->ring->avail > 0) { 802 struct netmap_slot *src, *dst, tmp; 803 src = &kring->ring->slot[kring->nr_hwcur]; 804 dst = &k1->ring->slot[k1->ring->cur]; 805 tmp = *src; 806 src->buf_idx = dst->buf_idx; 807 src->flags = NS_BUF_CHANGED; 808 809 dst->buf_idx = tmp.buf_idx; 810 dst->len = tmp.len; 811 dst->flags = NS_BUF_CHANGED; 812 ND("out len %d buf %d from %d to %d", 813 dst->len, dst->buf_idx, 814 kring->nr_hwcur, k1->ring->cur); 815 816 kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); 817 howmany--; 818 kring->nr_hwavail--; 819 k1->ring->cur = nm_next(k1->ring->cur, dst_lim); 820 k1->ring->avail--; 821 } 822 kring->ring->cur = kring->nr_hwcur; // XXX 823 k1++; // XXX why? 824 } 825 out: 826 lockmgr(&kring->q_lock, LK_RELEASE); 827 } 828 829 830 /* 831 * netmap_txsync_to_host() passes packets up. We are called from a 832 * system call in user process context, and the only contention 833 * can be among multiple user threads erroneously calling 834 * this routine concurrently. 835 */ 836 void 837 netmap_txsync_to_host(struct netmap_adapter *na) 838 { 839 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 840 struct netmap_ring *ring = kring->ring; 841 u_int k, lim = kring->nkr_num_slots - 1; 842 struct mbq q; 843 int error; 844 845 error = nm_kr_tryget(kring); 846 if (error) { 847 if (error == NM_KR_BUSY) 848 D("ring %p busy (user error)", kring); 849 return; 850 } 851 k = ring->cur; 852 if (k > lim) { 853 D("invalid ring index in stack TX kring %p", kring); 854 netmap_ring_reinit(kring); 855 nm_kr_put(kring); 856 return; 857 } 858 859 /* Take packets from hwcur to cur and pass them up. 860 * In case of no buffers we give up. At the end of the loop, 861 * the queue is drained in all cases. 862 */ 863 mbq_init(&q); 864 netmap_grab_packets(kring, &q, 1); 865 kring->nr_hwcur = k; 866 kring->nr_hwavail = ring->avail = lim; 867 868 nm_kr_put(kring); 869 netmap_send_up(na->ifp, &q); 870 } 871 872 873 /* 874 * rxsync backend for packets coming from the host stack. 875 * They have been put in the queue by netmap_transmit() so we 876 * need to protect access to the kring using a lock. 877 * 878 * This routine also does the selrecord if called from the poll handler 879 * (we know because td != NULL). 880 * 881 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 882 * as an additional hidden argument. 
883 */ 884 static void 885 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 886 { 887 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 888 struct netmap_ring *ring = kring->ring; 889 u_int j, n, lim = kring->nkr_num_slots; 890 u_int k = ring->cur, resvd = ring->reserved; 891 892 (void)pwait; /* disable unused warnings */ 893 894 if (kring->nkr_stopped) /* check a first time without lock */ 895 return; 896 897 lockmgr(&kring->q_lock, LK_EXCLUSIVE); 898 899 if (kring->nkr_stopped) /* check again with lock held */ 900 goto unlock_out; 901 902 if (k >= lim) { 903 netmap_ring_reinit(kring); 904 goto unlock_out; 905 } 906 /* new packets are already set in nr_hwavail */ 907 /* skip past packets that userspace has released */ 908 j = kring->nr_hwcur; 909 if (resvd > 0) { 910 if (resvd + ring->avail >= lim + 1) { 911 D("XXX invalid reserve/avail %d %d", resvd, ring->avail); 912 ring->reserved = resvd = 0; // XXX panic... 913 } 914 k = (k >= resvd) ? k - resvd : k + lim - resvd; 915 } 916 if (j != k) { 917 n = k >= j ? k - j : k + lim - j; 918 kring->nr_hwavail -= n; 919 kring->nr_hwcur = k; 920 } 921 k = ring->avail = kring->nr_hwavail - resvd; 922 if (k == 0 && td) 923 KNOTE(&kring->si.ki_note, 0); 924 if (k && (netmap_verbose & NM_VERB_HOST)) 925 D("%d pkts from stack", k); 926 unlock_out: 927 928 lockmgr(&kring->q_lock, LK_RELEASE); 929 } 930 931 932 /* Get a netmap adapter for the port. 933 * 934 * If it is possible to satisfy the request, return 0 935 * with *na containing the netmap adapter found. 936 * Otherwise return an error code, with *na containing NULL. 937 * 938 * When the port is attached to a bridge, we always return 939 * EBUSY. 940 * Otherwise, if the port is already bound to a file descriptor, 941 * then we unconditionally return the existing adapter into *na. 942 * In all the other cases, we return (into *na) either native, 943 * generic or NULL, according to the following table: 944 * 945 * native_support 946 * active_fds dev.netmap.admode YES NO 947 * ------------------------------------------------------- 948 * >0 * NA(ifp) NA(ifp) 949 * 950 * 0 NETMAP_ADMODE_BEST NATIVE GENERIC 951 * 0 NETMAP_ADMODE_NATIVE NATIVE NULL 952 * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC 953 * 954 */ 955 956 int 957 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) 958 { 959 /* generic support */ 960 int i = netmap_admode; /* Take a snapshot. */ 961 int error = 0; 962 struct netmap_adapter *prev_na; 963 struct netmap_generic_adapter *gna; 964 965 *na = NULL; /* default */ 966 967 /* reset in case of invalid value */ 968 if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) 969 i = netmap_admode = NETMAP_ADMODE_BEST; 970 971 if (NETMAP_CAPABLE(ifp)) { 972 /* If an adapter already exists, but is 973 * attached to a vale port, we report that the 974 * port is busy. 975 */ 976 if (NETMAP_OWNED_BY_KERN(NA(ifp))) 977 return EBUSY; 978 979 /* If an adapter already exists, return it if 980 * there are active file descriptors or if 981 * netmap is not forced to use generic 982 * adapters. 983 */ 984 if (NA(ifp)->active_fds > 0 || 985 i != NETMAP_ADMODE_GENERIC) { 986 *na = NA(ifp); 987 return 0; 988 } 989 } 990 991 /* If there isn't native support and netmap is not allowed 992 * to use generic adapters, we cannot satisfy the request. 993 */ 994 if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) 995 return EINVAL; 996 997 /* Otherwise, create a generic adapter and return it, 998 * saving the previously used netmap adapter, if any. 
999 * 1000 * Note that here 'prev_na', if not NULL, MUST be a 1001 * native adapter, and CANNOT be a generic one. This is 1002 * true because generic adapters are created on demand, and 1003 * destroyed when not used anymore. Therefore, if the adapter 1004 * currently attached to an interface 'ifp' is generic, it 1005 * must be that 1006 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). 1007 * Consequently, if NA(ifp) is generic, we will enter one of 1008 * the branches above. This ensures that we never override 1009 * a generic adapter with another generic adapter. 1010 */ 1011 prev_na = NA(ifp); 1012 error = generic_netmap_attach(ifp); 1013 if (error) 1014 return error; 1015 1016 *na = NA(ifp); 1017 gna = (struct netmap_generic_adapter*)NA(ifp); 1018 gna->prev = prev_na; /* save old na */ 1019 if (prev_na != NULL) { 1020 ifunit(ifp->if_xname); /* XXX huh? */ 1021 // XXX add a refcount ? 1022 netmap_adapter_get(prev_na); 1023 } 1024 D("Created generic NA %p (prev %p)", gna, gna->prev); 1025 1026 return 0; 1027 } 1028 1029 1030 /* 1031 * MUST BE CALLED UNDER NMG_LOCK() 1032 * 1033 * get a refcounted reference to an interface. 1034 * This is always called in the execution of an ioctl(). 1035 * 1036 * Return ENXIO if the interface does not exist, EINVAL if netmap 1037 * is not supported by the interface. 1038 * If successful, hold a reference. 1039 * 1040 * When the NIC is attached to a bridge, reference is managed 1041 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as 1042 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC 1043 * is detached from the bridge, then ifp's refcount is dropped (this 1044 * is equivalent to that ifp is destroyed in case of virtual ports. 1045 * 1046 * This function uses if_rele() when we want to prevent the NIC from 1047 * being detached from the bridge in error handling. But once refcount 1048 * is acquired by this function, it must be released using nm_if_rele(). 1049 */ 1050 int 1051 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) 1052 { 1053 struct ifnet *ifp; 1054 int error = 0; 1055 struct netmap_adapter *ret; 1056 1057 *na = NULL; /* default return value */ 1058 1059 /* first try to see if this is a bridge port. */ 1060 NMG_LOCK_ASSERT(); 1061 1062 error = netmap_get_bdg_na(nmr, na, create); 1063 if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */ 1064 return error; 1065 1066 ifp = ifunit(nmr->nr_name); 1067 if (ifp == NULL) { 1068 return ENXIO; 1069 } 1070 1071 error = netmap_get_hw_na(ifp, &ret); 1072 if (error) 1073 goto out; 1074 1075 if (ret != NULL) { 1076 /* Users cannot use the NIC attached to a bridge directly */ 1077 if (NETMAP_OWNED_BY_KERN(ret)) { 1078 error = EINVAL; 1079 goto out; 1080 } 1081 error = 0; 1082 *na = ret; 1083 netmap_adapter_get(ret); 1084 } 1085 out: 1086 #if 0 1087 if_rele(ifp); 1088 #endif 1089 1090 return error; 1091 } 1092 1093 1094 /* 1095 * Error routine called when txsync/rxsync detects an error. 1096 * Can't do much more than resetting cur = hwcur, avail = hwavail. 1097 * Return 1 on reinit. 1098 * 1099 * This routine is only called by the upper half of the kernel. 1100 * It only reads hwcur (which is changed only by the upper half, too) 1101 * and hwavail (which may be changed by the lower half, but only on 1102 * a tx ring and only to increase it, so any error will be recovered 1103 * on the next call). For the above, we don't strictly need to call 1104 * it under lock. 
1105 */ 1106 int 1107 netmap_ring_reinit(struct netmap_kring *kring) 1108 { 1109 struct netmap_ring *ring = kring->ring; 1110 u_int i, lim = kring->nkr_num_slots - 1; 1111 int errors = 0; 1112 1113 // XXX KASSERT nm_kr_tryget 1114 RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); 1115 if (ring->cur > lim) 1116 errors++; 1117 for (i = 0; i <= lim; i++) { 1118 u_int idx = ring->slot[i].buf_idx; 1119 u_int len = ring->slot[i].len; 1120 if (idx < 2 || idx >= netmap_total_buffers) { 1121 if (!errors++) 1122 D("bad buffer at slot %d idx %d len %d ", i, idx, len); 1123 ring->slot[i].buf_idx = 0; 1124 ring->slot[i].len = 0; 1125 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1126 ring->slot[i].len = 0; 1127 if (!errors++) 1128 D("bad len %d at slot %d idx %d", 1129 len, i, idx); 1130 } 1131 } 1132 if (errors) { 1133 int pos = kring - kring->na->tx_rings; 1134 int n = kring->na->num_tx_rings + 1; 1135 1136 RD(10, "total %d errors", errors); 1137 errors++; 1138 RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", 1139 NM_IFPNAME(kring->na->ifp), 1140 pos < n ? "TX" : "RX", pos < n ? pos : pos - n, 1141 ring->cur, kring->nr_hwcur, 1142 ring->avail, kring->nr_hwavail); 1143 ring->cur = kring->nr_hwcur; 1144 ring->avail = kring->nr_hwavail; 1145 } 1146 return (errors ? 1 : 0); 1147 } 1148 1149 1150 /* 1151 * Set the ring ID. For devices with a single queue, a request 1152 * for all rings is the same as a single ring. 1153 */ 1154 static int 1155 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 1156 { 1157 struct netmap_adapter *na = priv->np_na; 1158 struct ifnet *ifp = na->ifp; 1159 u_int i = ringid & NETMAP_RING_MASK; 1160 /* initially (np_qfirst == np_qlast) we don't want to lock */ 1161 u_int lim = na->num_rx_rings; 1162 1163 if (na->num_tx_rings > lim) 1164 lim = na->num_tx_rings; 1165 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 1166 D("invalid ring id %d", i); 1167 return (EINVAL); 1168 } 1169 priv->np_ringid = ringid; 1170 if (ringid & NETMAP_SW_RING) { 1171 priv->np_qfirst = NETMAP_SW_RING; 1172 priv->np_qlast = 0; 1173 } else if (ringid & NETMAP_HW_RING) { 1174 priv->np_qfirst = i; 1175 priv->np_qlast = i + 1; 1176 } else { 1177 priv->np_qfirst = 0; 1178 priv->np_qlast = NETMAP_HW_RING ; 1179 } 1180 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 1181 if (netmap_verbose) { 1182 if (ringid & NETMAP_SW_RING) 1183 D("ringid %s set to SW RING", NM_IFPNAME(ifp)); 1184 else if (ringid & NETMAP_HW_RING) 1185 D("ringid %s set to HW RING %d", NM_IFPNAME(ifp), 1186 priv->np_qfirst); 1187 else 1188 D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim); 1189 } 1190 return 0; 1191 } 1192 1193 1194 /* 1195 * possibly move the interface to netmap-mode. 1196 * If success it returns a pointer to netmap_if, otherwise NULL. 1197 * This must be called with NMG_LOCK held. 
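 *
 * The expected calling pattern, mirroring the NIOCREGIF handler in
 * netmap_ioctl() below, is roughly:
 *
 *	NMG_LOCK();
 *	error = netmap_get_na(nmr, &na, 1);	// 1 == create if needed
 *	nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
 *	NMG_UNLOCK();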
1198 */ 1199 struct netmap_if * 1200 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1201 uint16_t ringid, int *err) 1202 { 1203 struct ifnet *ifp = na->ifp; 1204 struct netmap_if *nifp = NULL; 1205 int error, need_mem = 0; 1206 1207 NMG_LOCK_ASSERT(); 1208 /* ring configuration may have changed, fetch from the card */ 1209 netmap_update_config(na); 1210 priv->np_na = na; /* store the reference */ 1211 error = netmap_set_ringid(priv, ringid); 1212 if (error) 1213 goto out; 1214 /* ensure allocators are ready */ 1215 need_mem = !netmap_have_memory_locked(priv); 1216 if (need_mem) { 1217 error = netmap_get_memory_locked(priv); 1218 ND("get_memory returned %d", error); 1219 if (error) 1220 goto out; 1221 } 1222 nifp = netmap_if_new(NM_IFPNAME(ifp), na); 1223 if (nifp == NULL) { /* allocation failed */ 1224 /* we should drop the allocator, but only 1225 * if we were the ones who grabbed it 1226 */ 1227 error = ENOMEM; 1228 goto out; 1229 } 1230 na->active_fds++; 1231 if (ifp->if_capenable & IFCAP_NETMAP) { 1232 /* was already set */ 1233 } else { 1234 /* Otherwise set the card in netmap mode 1235 * and make it use the shared buffers. 1236 * 1237 * do not core lock because the race is harmless here, 1238 * there cannot be any traffic to netmap_transmit() 1239 */ 1240 na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; 1241 ND("%p->na_lut == %p", na, na->na_lut); 1242 na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; 1243 error = na->nm_register(na, 1); /* mode on */ 1244 if (error) { 1245 netmap_do_unregif(priv, nifp); 1246 nifp = NULL; 1247 } 1248 } 1249 out: 1250 *err = error; 1251 if (error) { 1252 priv->np_na = NULL; 1253 if (need_mem) 1254 netmap_drop_memory_locked(priv); 1255 } 1256 if (nifp != NULL) { 1257 /* 1258 * advertise that the interface is ready bt setting ni_nifp. 1259 * The barrier is needed because readers (poll and *SYNC) 1260 * check for priv->np_nifp != NULL without locking 1261 */ 1262 wmb(); /* make sure previous writes are visible to all CPUs */ 1263 priv->np_nifp = nifp; 1264 } 1265 return nifp; 1266 } 1267 1268 1269 1270 /* 1271 * ioctl(2) support for the "netmap" device. 1272 * 1273 * Following a list of accepted commands: 1274 * - NIOCGINFO 1275 * - SIOCGIFADDR just for convenience 1276 * - NIOCREGIF 1277 * - NIOCUNREGIF 1278 * - NIOCTXSYNC 1279 * - NIOCRXSYNC 1280 * 1281 * Return 0 on success, errno otherwise. 1282 */ 1283 int 1284 netmap_ioctl(struct dev_ioctl_args *ap) 1285 { 1286 struct netmap_priv_d *priv = NULL; 1287 struct ifnet *ifp = NULL; 1288 struct nmreq *nmr = (struct nmreq *) ap->a_data; 1289 struct netmap_adapter *na = NULL; 1290 int error; 1291 u_int i, lim; 1292 struct netmap_if *nifp; 1293 struct netmap_kring *krings; 1294 u_long cmd = ap->a_cmd; 1295 1296 error = devfs_get_cdevpriv(ap->a_fp, (void **)&priv); 1297 if (error) { 1298 /* XXX ENOENT should be impossible, since the priv 1299 * is now created in the open */ 1300 return (error == ENOENT ? 
ENXIO : error); 1301 } 1302 1303 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 1304 switch (cmd) { 1305 case NIOCGINFO: /* return capabilities etc */ 1306 if (nmr->nr_version != NETMAP_API) { 1307 D("API mismatch got %d have %d", 1308 nmr->nr_version, NETMAP_API); 1309 nmr->nr_version = NETMAP_API; 1310 error = EINVAL; 1311 break; 1312 } 1313 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 1314 error = netmap_bdg_ctl(nmr, NULL); 1315 break; 1316 } 1317 1318 NMG_LOCK(); 1319 do { 1320 /* memsize is always valid */ 1321 struct netmap_mem_d *nmd = &nm_mem; 1322 u_int memflags; 1323 1324 if (nmr->nr_name[0] != '\0') { 1325 /* get a refcount */ 1326 error = netmap_get_na(nmr, &na, 1 /* create */); 1327 if (error) 1328 break; 1329 nmd = na->nm_mem; /* get memory allocator */ 1330 } 1331 1332 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 1333 if (error) 1334 break; 1335 if (na == NULL) /* only memory info */ 1336 break; 1337 nmr->nr_offset = 0; 1338 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 1339 netmap_update_config(na); 1340 nmr->nr_rx_rings = na->num_rx_rings; 1341 nmr->nr_tx_rings = na->num_tx_rings; 1342 nmr->nr_rx_slots = na->num_rx_desc; 1343 nmr->nr_tx_slots = na->num_tx_desc; 1344 if (memflags & NETMAP_MEM_PRIVATE) 1345 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1346 netmap_adapter_put(na); 1347 } while (0); 1348 NMG_UNLOCK(); 1349 break; 1350 1351 case NIOCREGIF: 1352 if (nmr->nr_version != NETMAP_API) { 1353 nmr->nr_version = NETMAP_API; 1354 error = EINVAL; 1355 break; 1356 } 1357 /* possibly attach/detach NIC and VALE switch */ 1358 i = nmr->nr_cmd; 1359 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { 1360 error = netmap_bdg_ctl(nmr, NULL); 1361 break; 1362 } else if (i != 0) { 1363 D("nr_cmd must be 0 not %d", i); 1364 error = EINVAL; 1365 break; 1366 } 1367 1368 /* protect access to priv from concurrent NIOCREGIF */ 1369 NMG_LOCK(); 1370 do { 1371 u_int memflags; 1372 1373 if (priv->np_na != NULL) { /* thread already registered */ 1374 error = netmap_set_ringid(priv, nmr->nr_ringid); 1375 break; 1376 } 1377 /* find the interface and a reference */ 1378 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 1379 if (error) 1380 break; 1381 ifp = na->ifp; 1382 if (NETMAP_OWNED_BY_KERN(na)) { 1383 netmap_adapter_put(na); 1384 error = EBUSY; 1385 break; 1386 } 1387 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); 1388 if (!nifp) { /* reg. failed, release priv and ref */ 1389 netmap_adapter_put(na); 1390 priv->np_nifp = NULL; 1391 break; 1392 } 1393 1394 /* return the offset of the netmap_if object */ 1395 nmr->nr_rx_rings = na->num_rx_rings; 1396 nmr->nr_tx_rings = na->num_tx_rings; 1397 nmr->nr_rx_slots = na->num_rx_desc; 1398 nmr->nr_tx_slots = na->num_tx_desc; 1399 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 1400 if (error) { 1401 netmap_adapter_put(na); 1402 break; 1403 } 1404 if (memflags & NETMAP_MEM_PRIVATE) { 1405 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1406 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 1407 } 1408 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 1409 } while (0); 1410 NMG_UNLOCK(); 1411 break; 1412 1413 case NIOCUNREGIF: 1414 // XXX we have no data here ? 
1415 D("deprecated, data is %p", nmr); 1416 error = EINVAL; 1417 break; 1418 1419 case NIOCTXSYNC: 1420 case NIOCRXSYNC: 1421 nifp = priv->np_nifp; 1422 1423 if (nifp == NULL) { 1424 error = ENXIO; 1425 break; 1426 } 1427 rmb(); /* make sure following reads are not from cache */ 1428 1429 na = priv->np_na; /* we have a reference */ 1430 1431 if (na == NULL) { 1432 D("Internal error: nifp != NULL && na == NULL"); 1433 error = ENXIO; 1434 break; 1435 } 1436 1437 ifp = na->ifp; 1438 if (ifp == NULL) { 1439 RD(1, "the ifp is gone"); 1440 error = ENXIO; 1441 break; 1442 } 1443 1444 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 1445 if (cmd == NIOCTXSYNC) 1446 netmap_txsync_to_host(na); 1447 else 1448 netmap_rxsync_from_host(na, NULL, NULL); 1449 break; 1450 } 1451 /* find the last ring to scan */ 1452 lim = priv->np_qlast; 1453 if (lim == NETMAP_HW_RING) 1454 lim = (cmd == NIOCTXSYNC) ? 1455 na->num_tx_rings : na->num_rx_rings; 1456 1457 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 1458 for (i = priv->np_qfirst; i < lim; i++) { 1459 struct netmap_kring *kring = krings + i; 1460 if (nm_kr_tryget(kring)) { 1461 error = EBUSY; 1462 goto out; 1463 } 1464 if (cmd == NIOCTXSYNC) { 1465 if (netmap_verbose & NM_VERB_TXSYNC) 1466 D("pre txsync ring %d cur %d hwcur %d", 1467 i, kring->ring->cur, 1468 kring->nr_hwcur); 1469 na->nm_txsync(na, i, NAF_FORCE_RECLAIM); 1470 if (netmap_verbose & NM_VERB_TXSYNC) 1471 D("post txsync ring %d cur %d hwcur %d", 1472 i, kring->ring->cur, 1473 kring->nr_hwcur); 1474 } else { 1475 na->nm_rxsync(na, i, NAF_FORCE_READ); 1476 microtime(&na->rx_rings[i].ring->ts); 1477 } 1478 nm_kr_put(kring); 1479 } 1480 1481 break; 1482 case BIOCIMMEDIATE: 1483 case BIOCGHDRCMPLT: 1484 case BIOCSHDRCMPLT: 1485 case BIOCSSEESENT: 1486 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 1487 break; 1488 1489 default: /* allow device-specific ioctls */ 1490 { 1491 struct socket so; 1492 1493 bzero(&so, sizeof(so)); 1494 NMG_LOCK(); 1495 error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ 1496 if (error) { 1497 netmap_adapter_put(na); 1498 NMG_UNLOCK(); 1499 break; 1500 } 1501 ifp = na->ifp; 1502 // so->so_proto not null. 1503 error = ifioctl(&so, cmd, ap->a_data, ap->a_cred); 1504 netmap_adapter_put(na); 1505 NMG_UNLOCK(); 1506 break; 1507 } 1508 } 1509 out: 1510 1511 return (error); 1512 } 1513 1514 static int 1515 netmap_kqfilter_event(struct knote *kn, long hint) 1516 { 1517 return (0); 1518 } 1519 1520 static void 1521 netmap_kqfilter_detach(struct knote *kn) 1522 { 1523 } 1524 1525 static struct filterops netmap_kqfilter_ops = { 1526 FILTEROP_ISFD, NULL, netmap_kqfilter_detach, netmap_kqfilter_event, 1527 }; 1528 1529 int 1530 netmap_kqfilter(struct dev_kqfilter_args *ap) 1531 { 1532 struct knote *kn = ap->a_kn; 1533 1534 ap->a_result = 0; 1535 1536 switch (kn->kn_filter) { 1537 case EVFILT_READ: 1538 case EVFILT_WRITE: 1539 kn->kn_fop = &netmap_kqfilter_ops; 1540 break; 1541 default: 1542 ap->a_result = EOPNOTSUPP; 1543 return (0); 1544 } 1545 1546 return (0); 1547 } 1548 1549 /* 1550 * select(2) and poll(2) handlers for the "netmap" device. 1551 * 1552 * Can be called for one or more queues. 1553 * Return true the event mask corresponding to ready events. 1554 * If there are no ready events, do a selrecord on either individual 1555 * selinfo or on the global one. 1556 * Device-dependent parts (locking and sync of tx/rx rings) 1557 * are done through callbacks. 
1558 * 1559 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 1560 * The first one is remapped to pwait as selrecord() uses the name as an 1561 * hidden argument. 1562 */ 1563 static inline int /* XXX mute unused for now */ 1564 netmap_poll(struct cdev *dev, int events, struct thread *td) 1565 { 1566 struct netmap_priv_d *priv = NULL; 1567 struct netmap_adapter *na; 1568 struct ifnet *ifp; 1569 struct netmap_kring *kring; 1570 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 1571 u_int lim_tx, lim_rx, host_forwarded = 0; 1572 struct mbq q; 1573 void *pwait = dev; /* linux compatibility */ 1574 1575 /* 1576 * In order to avoid nested locks, we need to "double check" 1577 * txsync and rxsync if we decide to do a selrecord(). 1578 * retry_tx (and retry_rx, later) prevent looping forever. 1579 */ 1580 int retry_tx = 1; 1581 1582 (void)pwait; 1583 mbq_init(&q); 1584 1585 /* XXX poll isn't ported yet so fill in NULL as a placeholder: */ 1586 if (devfs_get_cdevpriv(NULL, (void **)&priv) != 0 || priv == NULL) 1587 return POLLERR; 1588 1589 if (priv->np_nifp == NULL) { 1590 D("No if registered"); 1591 return POLLERR; 1592 } 1593 rmb(); /* make sure following reads are not from cache */ 1594 1595 na = priv->np_na; 1596 ifp = na->ifp; 1597 // check for deleted 1598 if (ifp == NULL) { 1599 RD(1, "the ifp is gone"); 1600 return POLLERR; 1601 } 1602 1603 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 1604 return POLLERR; 1605 1606 if (netmap_verbose & 0x8000) 1607 D("device %s events 0x%x", NM_IFPNAME(ifp), events); 1608 want_tx = events & (POLLOUT | POLLWRNORM); 1609 want_rx = events & (POLLIN | POLLRDNORM); 1610 1611 lim_tx = na->num_tx_rings; 1612 lim_rx = na->num_rx_rings; 1613 1614 if (priv->np_qfirst == NETMAP_SW_RING) { 1615 /* handle the host stack ring */ 1616 if (priv->np_txpoll || want_tx) { 1617 /* push any packets up, then we are always ready */ 1618 netmap_txsync_to_host(na); 1619 revents |= want_tx; 1620 } 1621 if (want_rx) { 1622 kring = &na->rx_rings[lim_rx]; 1623 if (kring->ring->avail == 0) 1624 netmap_rxsync_from_host(na, td, dev); 1625 if (kring->ring->avail > 0) { 1626 revents |= want_rx; 1627 } 1628 } 1629 return (revents); 1630 } 1631 1632 /* 1633 * If we are in transparent mode, check also the host rx ring 1634 * XXX Transparent mode at the moment requires to bind all 1635 * rings to a single file descriptor. 1636 */ 1637 kring = &na->rx_rings[lim_rx]; 1638 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 1639 && want_rx 1640 && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { 1641 if (kring->ring->avail == 0) 1642 netmap_rxsync_from_host(na, td, dev); 1643 if (kring->ring->avail > 0) 1644 revents |= want_rx; 1645 } 1646 1647 /* 1648 * check_all_{tx|rx} are set if the card has more than one queue AND 1649 * the file descriptor is bound to all of them. If so, we sleep on 1650 * the "global" selinfo, otherwise we sleep on individual selinfo 1651 * (FreeBSD only allows two selinfo's per file descriptor). 1652 * The interrupt routine in the driver wake one or the other 1653 * (or both) depending on which clients are active. 1654 * 1655 * rxsync() is only called if we run out of buffers on a POLLIN. 1656 * txsync() is called if we run out of buffers on POLLOUT, or 1657 * there are pending packets to send. The latter can be disabled 1658 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 
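	 *
	 * E.g. (a hedged sketch, flag names from net/netmap.h): an
	 * application that polls only for POLLIN and does not want every
	 * poll() to also flush the tx rings can register with
	 *
	 *	req.nr_ringid = NETMAP_NO_TX_POLL;	// plus a ring selector
	 *	ioctl(fd, NIOCREGIF, &req);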
1659 */ 1660 check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); 1661 check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); 1662 1663 if (priv->np_qlast != NETMAP_HW_RING) { 1664 lim_tx = lim_rx = priv->np_qlast; 1665 } 1666 1667 /* 1668 * We start with a lock free round which is cheap if we have 1669 * slots available. If this fails, then lock and call the sync 1670 * routines. 1671 */ 1672 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 1673 kring = &na->rx_rings[i]; 1674 if (kring->ring->avail > 0) { 1675 revents |= want_rx; 1676 want_rx = 0; /* also breaks the loop */ 1677 } 1678 } 1679 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 1680 kring = &na->tx_rings[i]; 1681 if (kring->ring->avail > 0) { 1682 revents |= want_tx; 1683 want_tx = 0; /* also breaks the loop */ 1684 } 1685 } 1686 1687 /* 1688 * If we to push packets out (priv->np_txpoll) or want_tx is 1689 * still set, we do need to run the txsync calls (on all rings, 1690 * to avoid that the tx rings stall). 1691 * XXX should also check cur != hwcur on the tx rings. 1692 * Fortunately, normal tx mode has np_txpoll set. 1693 */ 1694 if (priv->np_txpoll || want_tx) { 1695 /* If we really want to be woken up (want_tx), 1696 * do a selrecord, either on the global or on 1697 * the private structure. Then issue the txsync 1698 * so there is no race in the selrecord/selwait 1699 */ 1700 flush_tx: 1701 for (i = priv->np_qfirst; i < lim_tx; i++) { 1702 kring = &na->tx_rings[i]; 1703 /* 1704 * Skip this ring if want_tx == 0 1705 * (we have already done a successful sync on 1706 * a previous ring) AND kring->cur == kring->hwcur 1707 * (there are no pending transmissions for this ring). 1708 */ 1709 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 1710 continue; 1711 /* make sure only one user thread is doing this */ 1712 if (nm_kr_tryget(kring)) { 1713 ND("ring %p busy is %d", 1714 kring, (int)kring->nr_busy); 1715 revents |= POLLERR; 1716 goto out; 1717 } 1718 1719 if (netmap_verbose & NM_VERB_TXSYNC) 1720 D("send %d on %s %d", 1721 kring->ring->cur, NM_IFPNAME(ifp), i); 1722 if (na->nm_txsync(na, i, 0)) 1723 revents |= POLLERR; 1724 1725 /* Check avail/call selrecord only if called with POLLOUT */ 1726 if (want_tx) { 1727 if (kring->ring->avail > 0) { 1728 /* stop at the first ring. We don't risk 1729 * starvation. 1730 */ 1731 revents |= want_tx; 1732 want_tx = 0; 1733 } 1734 } 1735 nm_kr_put(kring); 1736 } 1737 if (want_tx && retry_tx) { 1738 KNOTE(check_all_tx ? &na->tx_si.ki_note : 1739 &na->tx_rings[priv->np_qfirst].si.ki_note, 0); 1740 retry_tx = 0; 1741 goto flush_tx; 1742 } 1743 } 1744 1745 /* 1746 * now if want_rx is still set we need to lock and rxsync. 1747 * Do it on all rings because otherwise we starve. 
1748 */ 1749 if (want_rx) { 1750 int retry_rx = 1; 1751 do_retry_rx: 1752 for (i = priv->np_qfirst; i < lim_rx; i++) { 1753 kring = &na->rx_rings[i]; 1754 1755 if (nm_kr_tryget(kring)) { 1756 revents |= POLLERR; 1757 goto out; 1758 } 1759 1760 /* XXX NR_FORWARD should only be read on 1761 * physical or NIC ports 1762 */ 1763 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 1764 ND(10, "forwarding some buffers up %d to %d", 1765 kring->nr_hwcur, kring->ring->cur); 1766 netmap_grab_packets(kring, &q, netmap_fwd); 1767 } 1768 1769 if (na->nm_rxsync(na, i, 0)) 1770 revents |= POLLERR; 1771 if (netmap_no_timestamp == 0 || 1772 kring->ring->flags & NR_TIMESTAMP) { 1773 microtime(&kring->ring->ts); 1774 } 1775 1776 if (kring->ring->avail > 0) { 1777 revents |= want_rx; 1778 retry_rx = 0; 1779 } 1780 nm_kr_put(kring); 1781 } 1782 if (retry_rx) { 1783 retry_rx = 0; 1784 KNOTE(check_all_rx ? &na->rx_si.ki_note : 1785 &na->rx_rings[priv->np_qfirst].si.ki_note, 0); 1786 goto do_retry_rx; 1787 } 1788 } 1789 1790 /* forward host to the netmap ring. 1791 * I am accessing nr_hwavail without lock, but netmap_transmit 1792 * can only increment it, so the operation is safe. 1793 */ 1794 kring = &na->rx_rings[lim_rx]; 1795 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 1796 && (netmap_fwd || kring->ring->flags & NR_FORWARD) 1797 && kring->nr_hwavail > 0 && !host_forwarded) { 1798 netmap_sw_to_nic(na); 1799 host_forwarded = 1; /* prevent another pass */ 1800 want_rx = 0; 1801 goto flush_tx; 1802 } 1803 1804 if (q.head) 1805 netmap_send_up(na->ifp, &q); 1806 1807 out: 1808 1809 return (revents); 1810 } 1811 1812 /*------- driver support routines ------*/ 1813 1814 static int netmap_hw_krings_create(struct netmap_adapter *); 1815 1816 static int 1817 netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags) 1818 { 1819 struct netmap_kring *kring; 1820 1821 if (tx == NR_TX) { 1822 kring = na->tx_rings + n_ring; 1823 KNOTE(&kring->si.ki_note, 0); 1824 wakeup(&kring->si.ki_note); 1825 if (flags & NAF_GLOBAL_NOTIFY) 1826 wakeup(&na->tx_si.ki_note); 1827 } else { 1828 kring = na->rx_rings + n_ring; 1829 KNOTE(&kring->si.ki_note, 0); 1830 wakeup(&kring->si.ki_note); 1831 if (flags & NAF_GLOBAL_NOTIFY) 1832 wakeup(&na->rx_si.ki_note); 1833 } 1834 return 0; 1835 } 1836 1837 1838 // XXX check handling of failures 1839 int 1840 netmap_attach_common(struct netmap_adapter *na) 1841 { 1842 struct ifnet *ifp = na->ifp; 1843 1844 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { 1845 D("%s: invalid rings tx %d rx %d", 1846 ifp->if_xname, na->num_tx_rings, na->num_rx_rings); 1847 return EINVAL; 1848 } 1849 WNA(ifp) = na; 1850 NETMAP_SET_CAPABLE(ifp); 1851 if (na->nm_krings_create == NULL) { 1852 na->nm_krings_create = netmap_hw_krings_create; 1853 na->nm_krings_delete = netmap_krings_delete; 1854 } 1855 if (na->nm_notify == NULL) 1856 na->nm_notify = netmap_notify; 1857 na->active_fds = 0; 1858 1859 if (na->nm_mem == NULL) 1860 na->nm_mem = &nm_mem; 1861 return 0; 1862 } 1863 1864 1865 void 1866 netmap_detach_common(struct netmap_adapter *na) 1867 { 1868 if (na->ifp) 1869 WNA(na->ifp) = NULL; /* XXX do we need this? */ 1870 1871 if (na->tx_rings) { /* XXX should not happen */ 1872 D("freeing leftover tx_rings"); 1873 na->nm_krings_delete(na); 1874 } 1875 if (na->na_flags & NAF_MEM_OWNER) 1876 netmap_mem_private_delete(na->nm_mem); 1877 bzero(na, sizeof(*na)); 1878 kfree(na, M_DEVBUF); 1879 } 1880 1881 1882 /* 1883 * Initialize a ``netmap_adapter`` object created by driver on attach. 
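 *
 * A hedged driver-side sketch of the expected call, with made-up softc
 * fields and handler names (real drivers invoke this once from their
 * attach routine):
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	netmap_attach(&na);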
 * We allocate a block of memory with room for a struct netmap_adapter
 * plus two sets of N+2 struct netmap_kring (where N is the number
 * of hardware rings):
 *   krings 0..N-1 are for the hardware queues,
 *   kring N is for the host stack queue,
 *   kring N+1 is only used for the selinfo for all queues.
 * Return 0 on success, EINVAL or ENOMEM on failure.
 *
 * By default the receive and transmit adapter ring counts are both
 * initialized to num_queues. na->num_tx_rings can be set for cards with
 * different tx/rx setups.
 */
int
netmap_attach(struct netmap_adapter *arg)
{
	struct netmap_hw_adapter *hwna = NULL;
	// XXX when is arg == NULL ?
	struct ifnet *ifp = arg ? arg->ifp : NULL;

	if (arg == NULL || ifp == NULL)
		goto fail;
	hwna = kmalloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (hwna == NULL)
		goto fail;
	hwna->up = *arg;
	if (netmap_attach_common(&hwna->up)) {
		kfree(hwna, M_DEVBUF);
		goto fail;
	}
	netmap_adapter_get(&hwna->up);

	D("success for %s", NM_IFPNAME(ifp));
	return 0;

fail:
	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
	netmap_detach(ifp);
	return (hwna ? EINVAL : ENOMEM);
}


void
NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
{
	if (!na) {
		return;
	}

	refcount_acquire(&na->na_refcount);
}


/* returns 1 iff the netmap_adapter is destroyed */
int
NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
{
	if (!na)
		return 1;

	if (!refcount_release(&na->na_refcount))
		return 0;

	if (na->nm_dtor)
		na->nm_dtor(na);

	netmap_detach_common(na);

	return 1;
}


int
netmap_hw_krings_create(struct netmap_adapter *na)
{
	return netmap_krings_create(na,
	    na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
}


/*
 * Free the allocated memory linked to the given ``netmap_adapter``
 * object.
 */
void
netmap_detach(struct ifnet *ifp)
{
	struct netmap_adapter *na = NA(ifp);

	if (!na)
		return;

	NMG_LOCK();
	netmap_disable_all_rings(ifp);
	netmap_adapter_put(na);
	na->ifp = NULL;
	netmap_enable_all_rings(ifp);
	NMG_UNLOCK();
}
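

/*
 * Illustrative sketch (guarded out, not part of the build): how a
 * hypothetical driver would typically use netmap_attach()/netmap_detach()
 * above. All example_/EXAMPLE_ names are made up, and the use of the
 * num_tx_desc/num_rx_desc counters and of the nm_register callback is
 * recalled from the usual driver glue of this netmap generation rather
 * than taken from this file.
 */
#if 0
#define	EXAMPLE_NTXDESC	512	/* hypothetical ring geometry */
#define	EXAMPLE_NRXDESC	512
#define	EXAMPLE_NQUEUES	1

struct example_softc {			/* hypothetical driver context */
	struct ifnet *ifp;
};

static int example_netmap_reg(struct netmap_adapter *, int);
static int example_netmap_txsync(struct netmap_adapter *, u_int, int);
static int example_netmap_rxsync(struct netmap_adapter *, u_int, int);

static void
example_netmap_attach(struct example_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = EXAMPLE_NTXDESC;
	na.num_rx_desc = EXAMPLE_NRXDESC;
	na.num_tx_rings = na.num_rx_rings = EXAMPLE_NQUEUES;
	na.nm_register = example_netmap_reg;	/* switch netmap mode on/off */
	na.nm_txsync = example_netmap_txsync;	/* reconcile TX kring and hw ring */
	na.nm_rxsync = example_netmap_rxsync;	/* reconcile RX kring and hw ring */
	netmap_attach(&na);	/* netmap copies na into a netmap_hw_adapter */
}

static void
example_netmap_detach(struct example_softc *sc)
{
	netmap_detach(sc->ifp);	/* drops the reference taken by netmap_attach() */
}
#endif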


/*
 * Intercept packets from the network stack and pass them
 * to netmap as incoming packets on the 'software' ring.
 * We rely on the OS to make sure that the ifp and na do not go
 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
 * In nm_register() or whenever there is a reinitialization,
 * we make sure to access the core lock and per-ring locks
 * so that IFCAP_NETMAP is visible here.
 */
int
netmap_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;
	u_int i, len = MBUF_LEN(m);
	u_int error = EBUSY, lim;
	struct netmap_slot *slot;

	// XXX [Linux] we do not need this lock
	// if we follow the down/configure/up protocol -gl
	// mtx_lock(&na->core_lock);
	if ((ifp->if_capenable & IFCAP_NETMAP) == 0) {
		/* interface not in netmap mode anymore */
		error = ENXIO;
		goto done;
	}

	kring = &na->rx_rings[na->num_rx_rings];
	lim = kring->nkr_num_slots - 1;
	if (netmap_verbose & NM_VERB_HOST)
		D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
		    kring->nr_hwcur + kring->nr_hwavail, len);
	// XXX reconsider long packets if we handle fragments
	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
		    len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
		goto done;
	}
	/* protect against other instances of netmap_transmit,
	 * and userspace invocations of rxsync().
	 */
	// XXX [Linux] there can be no other instances of netmap_transmit
	// on this same ring, but we still need this lock to protect
	// concurrent access from netmap_sw_to_nic() -gl
	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
	if (kring->nr_hwavail >= lim) {
		if (netmap_verbose)
			D("stack ring %s full\n", NM_IFPNAME(ifp));
	} else {
		/* compute the insert position */
		i = nm_kr_rxpos(kring);
		slot = &kring->ring->slot[i];
		m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
		slot->len = len;
		slot->flags = kring->nkr_slot_flags;
		kring->nr_hwavail++;
		if (netmap_verbose & NM_VERB_HOST)
			D("wake up host ring %s %d",
			    NM_IFPNAME(na->ifp), na->num_rx_rings);
		na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
		error = 0;
	}
	lockmgr(&kring->q_lock, LK_RELEASE);

done:
	// mtx_unlock(&na->core_lock);

	/* release the mbuf in either case, success or failure. As an
	 * alternative, put the mbuf in a free list and free the list
	 * only when really necessary.
	 */
	m_freem(m);

	return (error);
}
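

/*
 * Illustrative sketch (guarded out, all example_* names hypothetical):
 * netmap_transmit() above is the entry point for packets coming from the
 * host stack while the interface is in netmap mode, so the output path
 * is typically diverted to it before the mbufs reach the hardware
 * transmit routine. Which hook gets replaced (if_transmit, if_start, ...)
 * is OS-specific and not shown in this file.
 */
#if 0
static int example_hw_transmit(struct ifnet *, struct mbuf *);	/* normal path */

static int
example_transmit(struct ifnet *ifp, struct mbuf *m)
{
	if (ifp->if_capenable & IFCAP_NETMAP) {
		/* netmap owns the rings: hand the mbuf to the host
		 * (software) RX ring; netmap_transmit() also frees it. */
		return netmap_transmit(ifp, m);
	}
	return example_hw_transmit(ifp, m);	/* regular driver path */
}
#endif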


/*
 * netmap_reset() is called by the driver routines when reinitializing
 * a ring. The driver is in charge of locking to protect the kring.
 * If native netmap mode is not set just return NULL.
 */
struct netmap_slot *
netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
	u_int new_cur)
{
	struct netmap_kring *kring;
	int new_hwofs, lim;

	if (na == NULL) {
		D("NULL na, should not happen");
		return NULL;	/* no netmap support here */
	}
	if (!(na->ifp->if_capenable & IFCAP_NETMAP) || nma_is_generic(na)) {
		ND("interface not in netmap mode");
		return NULL;	/* nothing to reinitialize */
	}

	/* XXX note - in the new scheme, we are not guaranteed to be
	 * under lock (e.g. when called on a device reset).
	 * In this case, we should set a flag and do not trust too
	 * much the values. In practice: TODO
	 * - set a RESET flag somewhere in the kring
	 * - do the processing in a conservative way
	 * - let the *sync() fixup at the end.
	 */
	if (tx == NR_TX) {
		if (n >= na->num_tx_rings)
			return NULL;
		kring = na->tx_rings + n;
		new_hwofs = kring->nr_hwcur - new_cur;
	} else {
		if (n >= na->num_rx_rings)
			return NULL;
		kring = na->rx_rings + n;
		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
	}
	lim = kring->nkr_num_slots - 1;
	if (new_hwofs > lim)
		new_hwofs -= lim + 1;

	/* Always set the new offset value and realign the ring. */
	D("%s hwofs %d -> %d, hwavail %d -> %d",
	    tx == NR_TX ? "TX" : "RX",
	    kring->nkr_hwofs, new_hwofs,
	    kring->nr_hwavail,
	    tx == NR_TX ? lim : kring->nr_hwavail);
	kring->nkr_hwofs = new_hwofs;
	if (tx == NR_TX)
		kring->nr_hwavail = lim;
	kring->nr_hwreserved = 0;

	/*
	 * Wakeup on the individual and global selwait.
	 * We do the wakeup here, but the ring is not yet reconfigured.
	 * However, we are under lock so there are no races.
	 */
	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
	return kring->ring->slot;
}
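

/*
 * Illustrative sketch (guarded out, example_* names hypothetical): the
 * typical caller of netmap_reset() above is a driver ring-init routine,
 * which uses the returned slot array to point the hardware descriptors
 * at netmap buffers when the ring is in native netmap mode. PNMB() and
 * na->num_rx_desc are assumed here to behave as in the FreeBSD driver
 * glue of this netmap generation.
 */
#if 0
static void example_load_rx_desc(struct ifnet *, u_int, u_int,
	void *, uint64_t);		/* hypothetical hardware helper */

static void
example_rxring_init(struct ifnet *ifp, u_int ring_nr)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_slot *slot;
	u_int j;

	slot = netmap_reset(na, NR_RX, ring_nr, 0);
	if (slot == NULL)
		return;		/* not in native netmap mode, use mbufs */
	for (j = 0; j < na->num_rx_desc; j++) {
		uint64_t paddr;
		void *vaddr = PNMB(slot + j, &paddr);

		/* program descriptor j of ring ring_nr with the netmap
		 * buffer at (vaddr, paddr); hardware specific, omitted. */
		example_load_rx_desc(ifp, ring_nr, j, vaddr, paddr);
	}
}
#endif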


/*
 * Default functions to handle rx/tx interrupts from a physical device.
 * "work_done" is non-null on the RX path, NULL for the TX path.
 * "generic" is 0 when we are called by a device driver, and 1 when we
 * are called by the generic netmap adapter layer.
 * We rely on the OS to make sure that there is only one active
 * instance per queue, and that there is appropriate locking.
 *
 * If the card is not in netmap mode, simply return 0,
 * so that the caller proceeds with regular processing.
 *
 * We return 0 also when the card is in netmap mode but the current
 * netmap adapter is the generic one, because this function will be
 * called by the generic layer.
 *
 * If the card is connected to a netmap file descriptor,
 * do a selwakeup on the individual queue, plus one on the global one
 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return 1.
 *
 * Finally, if called on rx from an interface connected to a switch,
 * calls the proper forwarding routine, and return 1.
 */
int
netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;

	q &= NETMAP_RING_MASK;

	if (netmap_verbose) {
		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
	}

	if (work_done) { /* RX path */
		if (q >= na->num_rx_rings)
			return 0;	// not a physical queue
		kring = na->rx_rings + q;
		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
		na->nm_notify(na, q, NR_RX,
		    (na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
		*work_done = 1; /* do not fire napi again */
	} else { /* TX path */
		if (q >= na->num_tx_rings)
			return 0;	// not a physical queue
		kring = na->tx_rings + q;
		na->nm_notify(na, q, NR_TX,
		    (na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
	}
	return 1;
}


/*
 * Wrapper called by device drivers from their rx/tx interrupt handlers.
 * It only filters out the cases that must be handled by the regular
 * driver path (interface not in netmap mode, or NAF_SKIP_INTR set) and
 * otherwise dispatches to netmap_common_irq() above; see that comment
 * for the meaning of "work_done" and of the return values.
 */
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	// XXX could we check NAF_NATIVE_ON ?
	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
		ND("use regular interrupt");
		return 0;
	}

	return netmap_common_irq(ifp, q, work_done);
}


static struct cdev *netmap_dev; /* /dev/netmap character device. */


/*
 * Module loader.
 *
 * Create the /dev/netmap device and initialize all global
 * variables.
 *
 * Return 0 on success, errno on failure.
 */
int
netmap_init(void)
{
	int error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0) {
		kprintf("netmap: unable to initialize the memory allocator.\n");
		return (error);
	}
	kprintf("netmap: loaded module\n");
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
	    "netmap");

	netmap_init_bridges();
	return (error);
}


/*
 * Module unloader.
 *
 * Free all the memory, and destroy the ``/dev/netmap`` device.
 */
void
netmap_fini(void)
{
	destroy_dev(netmap_dev);
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	kprintf("netmap: unloaded module.\n");
}
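

/*
 * Illustrative sketch (guarded out): how netmap_init() and netmap_fini()
 * above are typically wired to kernel module load/unload events. The
 * handler name is hypothetical and the DEV_MODULE() registration follows
 * the FreeBSD convention; the macro actually used by this port may differ.
 */
#if 0
static int
example_netmap_loader(module_t mod __unused, int type, void *data __unused)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		error = netmap_init();	/* allocator, /dev/netmap, bridges */
		break;
	case MOD_UNLOAD:
		netmap_fini();		/* destroy the device, free the memory */
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

DEV_MODULE(netmap, example_netmap_loader, NULL);
#endif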