1 /* 2 * Copyright (c) 2003, 2004 Matthew Dillon. All rights reserved. 3 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 4 * Copyright (c) 2003 Jonathan Lemon. All rights reserved. 5 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 6 * 7 * This code is derived from software contributed to The DragonFly Project 8 * by Jonathan Lemon, Jeffrey M. Hsu, and Matthew Dillon. 9 * 10 * Jonathan Lemon gave Jeffrey Hsu permission to combine his copyright 11 * into this one around July 8 2004. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of The DragonFly Project nor the names of its 22 * contributors may be used to endorse or promote products derived 23 * from this software without specific, prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 28 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 29 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 30 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 32 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 33 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 34 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 35 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/kernel.h> 42 #include <sys/malloc.h> 43 #include <sys/msgport.h> 44 #include <sys/proc.h> 45 #include <sys/interrupt.h> 46 #include <sys/socket.h> 47 #include <sys/sysctl.h> 48 #include <sys/socketvar.h> 49 #include <net/if.h> 50 #include <net/if_var.h> 51 #include <net/netisr.h> 52 #include <machine/cpufunc.h> 53 #include <machine/smp.h> 54 55 #include <sys/thread2.h> 56 #include <sys/msgport2.h> 57 #include <net/netmsg2.h> 58 #include <sys/mplock2.h> 59 60 static void netmsg_sync_func(netmsg_t msg); 61 static void netmsg_service_loop(void *arg); 62 static void cpu0_cpufn(struct mbuf **mp, int hoff); 63 64 struct netmsg_port_registration { 65 TAILQ_ENTRY(netmsg_port_registration) npr_entry; 66 lwkt_port_t npr_port; 67 }; 68 69 struct netmsg_rollup { 70 TAILQ_ENTRY(netmsg_rollup) ru_entry; 71 netisr_ru_t ru_func; 72 }; 73 74 struct netmsg_barrier { 75 struct netmsg_base base; 76 volatile cpumask_t *br_cpumask; 77 volatile uint32_t br_done; 78 }; 79 80 #define NETISR_BR_NOTDONE 0x1 81 #define NETISR_BR_WAITDONE 0x80000000 82 83 struct netisr_barrier { 84 struct netmsg_barrier *br_msgs[MAXCPU]; 85 int br_isset; 86 }; 87 88 static struct netisr netisrs[NETISR_MAX]; 89 static TAILQ_HEAD(,netmsg_port_registration) netreglist; 90 static TAILQ_HEAD(,netmsg_rollup) netrulist; 91 92 /* Per-CPU thread to handle any protocol. */ 93 static struct thread netisr_cpu[MAXCPU]; 94 lwkt_port netisr_afree_rport; 95 lwkt_port netisr_afree_free_so_rport; 96 lwkt_port netisr_adone_rport; 97 lwkt_port netisr_apanic_rport; 98 lwkt_port netisr_sync_port; 99 100 static int (*netmsg_fwd_port_fn)(lwkt_port_t, lwkt_msg_t); 101 102 SYSCTL_NODE(_net, OID_AUTO, netisr, CTLFLAG_RW, 0, "netisr"); 103 104 /* 105 * netisr_afree_rport replymsg function, only used to handle async 106 * messages which the sender has abandoned to their fate. 107 */ 108 static void 109 netisr_autofree_reply(lwkt_port_t port, lwkt_msg_t msg) 110 { 111 kfree(msg, M_LWKTMSG); 112 } 113 114 static void 115 netisr_autofree_free_so_reply(lwkt_port_t port, lwkt_msg_t msg) 116 { 117 sofree(((netmsg_t)msg)->base.nm_so); 118 kfree(msg, M_LWKTMSG); 119 } 120 121 /* 122 * We need a custom putport function to handle the case where the 123 * message target is the current thread's message port. This case 124 * can occur when the TCP or UDP stack does a direct callback to NFS and NFS 125 * then turns around and executes a network operation synchronously. 126 * 127 * To prevent deadlocking, we must execute these self-referential messages 128 * synchronously, effectively turning the message into a glorified direct 129 * procedure call back into the protocol stack. The operation must be 130 * complete on return or we will deadlock, so panic if it isn't. 131 * 132 * However, the target function is under no obligation to immediately 133 * reply the message. It may forward it elsewhere. 134 */ 135 static int 136 netmsg_put_port(lwkt_port_t port, lwkt_msg_t lmsg) 137 { 138 netmsg_base_t nmsg = (void *)lmsg; 139 140 if ((lmsg->ms_flags & MSGF_SYNC) && port == &curthread->td_msgport) { 141 nmsg->nm_dispatch((netmsg_t)nmsg); 142 return(EASYNC); 143 } else { 144 return(netmsg_fwd_port_fn(port, lmsg)); 145 } 146 } 147 148 /* 149 * UNIX DOMAIN sockets still have to run their uipc functions synchronously, 150 * because they depend on the user proc context for a number of things 151 * (like creds) which we have not yet incorporated into the message structure. 152 * 153 * However, we maintain or message/port abstraction. Having a special 154 * synchronous port which runs the commands synchronously gives us the 155 * ability to serialize operations in one place later on when we start 156 * removing the BGL. 157 */ 158 static int 159 netmsg_sync_putport(lwkt_port_t port, lwkt_msg_t lmsg) 160 { 161 netmsg_base_t nmsg = (void *)lmsg; 162 163 KKASSERT((lmsg->ms_flags & MSGF_DONE) == 0); 164 165 lmsg->ms_target_port = port; /* required for abort */ 166 nmsg->nm_dispatch((netmsg_t)nmsg); 167 return(EASYNC); 168 } 169 170 static void 171 netisr_init(void) 172 { 173 int i; 174 175 TAILQ_INIT(&netreglist); 176 TAILQ_INIT(&netrulist); 177 178 /* 179 * Create default per-cpu threads for generic protocol handling. 180 */ 181 for (i = 0; i < ncpus; ++i) { 182 lwkt_create(netmsg_service_loop, NULL, NULL, 183 &netisr_cpu[i], TDF_STOPREQ, i, 184 "netisr_cpu %d", i); 185 netmsg_service_port_init(&netisr_cpu[i].td_msgport); 186 lwkt_schedule(&netisr_cpu[i]); 187 } 188 189 /* 190 * The netisr_afree_rport is a special reply port which automatically 191 * frees the replied message. The netisr_adone_rport simply marks 192 * the message as being done. The netisr_apanic_rport panics if 193 * the message is replied to. 194 */ 195 lwkt_initport_replyonly(&netisr_afree_rport, netisr_autofree_reply); 196 lwkt_initport_replyonly(&netisr_afree_free_so_rport, 197 netisr_autofree_free_so_reply); 198 lwkt_initport_replyonly_null(&netisr_adone_rport); 199 lwkt_initport_panic(&netisr_apanic_rport); 200 201 /* 202 * The netisr_syncport is a special port which executes the message 203 * synchronously and waits for it if EASYNC is returned. 204 */ 205 lwkt_initport_putonly(&netisr_sync_port, netmsg_sync_putport); 206 } 207 208 SYSINIT(netisr, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, netisr_init, NULL); 209 210 /* 211 * Finish initializing the message port for a netmsg service. This also 212 * registers the port for synchronous cleanup operations such as when an 213 * ifnet is being destroyed. There is no deregistration API yet. 214 */ 215 void 216 netmsg_service_port_init(lwkt_port_t port) 217 { 218 struct netmsg_port_registration *reg; 219 220 /* 221 * Override the putport function. Our custom function checks for 222 * self-references and executes such commands synchronously. 223 */ 224 if (netmsg_fwd_port_fn == NULL) 225 netmsg_fwd_port_fn = port->mp_putport; 226 KKASSERT(netmsg_fwd_port_fn == port->mp_putport); 227 port->mp_putport = netmsg_put_port; 228 229 /* 230 * Keep track of ports using the netmsg API so we can synchronize 231 * certain operations (such as freeing an ifnet structure) across all 232 * consumers. 233 */ 234 reg = kmalloc(sizeof(*reg), M_TEMP, M_WAITOK|M_ZERO); 235 reg->npr_port = port; 236 TAILQ_INSERT_TAIL(&netreglist, reg, npr_entry); 237 } 238 239 /* 240 * This function synchronizes the caller with all netmsg services. For 241 * example, if an interface is being removed we must make sure that all 242 * packets related to that interface complete processing before the structure 243 * can actually be freed. This sort of synchronization is an alternative to 244 * ref-counting the netif, removing the ref counting overhead in favor of 245 * placing additional overhead in the netif freeing sequence (where it is 246 * inconsequential). 247 */ 248 void 249 netmsg_service_sync(void) 250 { 251 struct netmsg_port_registration *reg; 252 struct netmsg_base smsg; 253 254 netmsg_init(&smsg, NULL, &curthread->td_msgport, 0, netmsg_sync_func); 255 256 TAILQ_FOREACH(reg, &netreglist, npr_entry) { 257 lwkt_domsg(reg->npr_port, &smsg.lmsg, 0); 258 } 259 } 260 261 /* 262 * The netmsg function simply replies the message. API semantics require 263 * EASYNC to be returned if the netmsg function disposes of the message. 264 */ 265 static void 266 netmsg_sync_func(netmsg_t msg) 267 { 268 lwkt_replymsg(&msg->lmsg, 0); 269 } 270 271 /* 272 * Generic netmsg service loop. Some protocols may roll their own but all 273 * must do the basic command dispatch function call done here. 274 */ 275 static void 276 netmsg_service_loop(void *arg) 277 { 278 struct netmsg_rollup *ru; 279 netmsg_base_t msg; 280 thread_t td = curthread;; 281 int limit; 282 283 while ((msg = lwkt_waitport(&td->td_msgport, 0))) { 284 /* 285 * Run up to 512 pending netmsgs. 286 */ 287 limit = 512; 288 do { 289 KASSERT(msg->nm_dispatch != NULL, 290 ("netmsg_service isr %d badmsg\n", 291 msg->lmsg.u.ms_result)); 292 if (msg->nm_so && 293 msg->nm_so->so_port != &td->td_msgport) { 294 /* 295 * Sockets undergoing connect or disconnect 296 * ops can change ports on us. Chase the 297 * port. 298 */ 299 kprintf("netmsg_service_loop: Warning, " 300 "port changed so=%p\n", msg->nm_so); 301 lwkt_forwardmsg(msg->nm_so->so_port, 302 &msg->lmsg); 303 } else { 304 /* 305 * We are on the correct port, dispatch it. 306 */ 307 msg->nm_dispatch((netmsg_t)msg); 308 } 309 if (--limit == 0) 310 break; 311 } while ((msg = lwkt_getport(&td->td_msgport)) != NULL); 312 313 /* 314 * Run all registered rollup functions for this cpu 315 * (e.g. tcp_willblock()). 316 */ 317 TAILQ_FOREACH(ru, &netrulist, ru_entry) 318 ru->ru_func(); 319 } 320 } 321 322 /* 323 * Forward a packet to a netisr service function. 324 * 325 * If the packet has not been assigned to a protocol thread we call 326 * the port characterization function to assign it. The caller must 327 * clear M_HASH (or not have set it in the first place) if the caller 328 * wishes the packet to be recharacterized. 329 */ 330 int 331 netisr_queue(int num, struct mbuf *m) 332 { 333 struct netisr *ni; 334 struct netmsg_packet *pmsg; 335 lwkt_port_t port; 336 337 KASSERT((num > 0 && num <= NELEM(netisrs)), 338 ("Bad isr %d", num)); 339 340 ni = &netisrs[num]; 341 if (ni->ni_handler == NULL) { 342 kprintf("Unregistered isr %d\n", num); 343 m_freem(m); 344 return (EIO); 345 } 346 347 /* 348 * Figure out which protocol thread to send to. This does not 349 * have to be perfect but performance will be really good if it 350 * is correct. Major protocol inputs such as ip_input() will 351 * re-characterize the packet as necessary. 352 */ 353 if ((m->m_flags & M_HASH) == 0) { 354 ni->ni_cpufn(&m, 0); 355 if (m == NULL) { 356 m_freem(m); 357 return (EIO); 358 } 359 if ((m->m_flags & M_HASH) == 0) { 360 kprintf("netisr_queue(%d): packet hash failed\n", num); 361 m_freem(m); 362 return (EIO); 363 } 364 } 365 366 /* 367 * Get the protocol port based on the packet hash, initialize 368 * the netmsg, and send it off. 369 */ 370 port = cpu_portfn(m->m_pkthdr.hash); 371 pmsg = &m->m_hdr.mh_netmsg; 372 netmsg_init(&pmsg->base, NULL, &netisr_apanic_rport, 373 0, ni->ni_handler); 374 pmsg->nm_packet = m; 375 pmsg->base.lmsg.u.ms_result = num; 376 lwkt_sendmsg(port, &pmsg->base.lmsg); 377 378 return (0); 379 } 380 381 /* 382 * Pre-characterization of a deeper portion of the packet for the 383 * requested isr. 384 * 385 * The base of the ISR type (e.g. IP) that we want to characterize is 386 * at (hoff) relative to the beginning of the mbuf. This allows 387 * e.g. ether_input_chain() to not have to adjust the m_data/m_len. 388 */ 389 void 390 netisr_characterize(int num, struct mbuf **mp, int hoff) 391 { 392 struct netisr *ni; 393 struct mbuf *m; 394 395 /* 396 * Validation 397 */ 398 m = *mp; 399 KKASSERT(m != NULL); 400 401 if (num < 0 || num >= NETISR_MAX) { 402 if (num == NETISR_MAX) { 403 m->m_flags |= M_HASH; 404 m->m_pkthdr.hash = 0; 405 return; 406 } 407 panic("Bad isr %d", num); 408 } 409 410 /* 411 * Valid netisr? 412 */ 413 ni = &netisrs[num]; 414 if (ni->ni_handler == NULL) { 415 kprintf("Unregistered isr %d\n", num); 416 m_freem(m); 417 *mp = NULL; 418 } 419 420 /* 421 * Characterize the packet 422 */ 423 if ((m->m_flags & M_HASH) == 0) { 424 ni->ni_cpufn(mp, hoff); 425 m = *mp; 426 if (m && (m->m_flags & M_HASH) == 0) 427 kprintf("netisr_queue(%d): packet hash failed\n", num); 428 } 429 } 430 431 void 432 netisr_register(int num, netisr_fn_t handler, netisr_cpufn_t cpufn) 433 { 434 struct netisr *ni; 435 436 KASSERT((num > 0 && num <= NELEM(netisrs)), 437 ("netisr_register: bad isr %d", num)); 438 KKASSERT(handler != NULL); 439 440 if (cpufn == NULL) 441 cpufn = cpu0_cpufn; 442 443 ni = &netisrs[num]; 444 445 ni->ni_handler = handler; 446 ni->ni_cpufn = cpufn; 447 netmsg_init(&ni->ni_netmsg, NULL, &netisr_adone_rport, 0, NULL); 448 } 449 450 void 451 netisr_register_rollup(netisr_ru_t ru_func) 452 { 453 struct netmsg_rollup *ru; 454 455 ru = kmalloc(sizeof(*ru), M_TEMP, M_WAITOK|M_ZERO); 456 ru->ru_func = ru_func; 457 TAILQ_INSERT_TAIL(&netrulist, ru, ru_entry); 458 } 459 460 /* 461 * Return the message port for the general protocol message servicing 462 * thread for a particular cpu. 463 */ 464 lwkt_port_t 465 cpu_portfn(int cpu) 466 { 467 KKASSERT(cpu >= 0 && cpu < ncpus); 468 return (&netisr_cpu[cpu].td_msgport); 469 } 470 471 /* 472 * Return the current cpu's network protocol thread. 473 */ 474 lwkt_port_t 475 cur_netport(void) 476 { 477 return(cpu_portfn(mycpu->gd_cpuid)); 478 } 479 480 /* 481 * Return a default protocol control message processing thread port 482 */ 483 lwkt_port_t 484 cpu0_ctlport(int cmd __unused, struct sockaddr *sa __unused, 485 void *extra __unused) 486 { 487 return (&netisr_cpu[0].td_msgport); 488 } 489 490 /* 491 * This is a default netisr packet characterization function which 492 * sets M_HASH. If a netisr is registered with a NULL cpufn function 493 * this one is assigned. 494 * 495 * This function makes no attempt to validate the packet. 496 */ 497 static void 498 cpu0_cpufn(struct mbuf **mp, int hoff __unused) 499 { 500 struct mbuf *m = *mp; 501 502 m->m_flags |= M_HASH; 503 m->m_pkthdr.hash = 0; 504 } 505 506 /* 507 * schednetisr() is used to call the netisr handler from the appropriate 508 * netisr thread for polling and other purposes. 509 * 510 * This function may be called from a hard interrupt or IPI and must be 511 * MP SAFE and non-blocking. We use a fixed per-cpu message instead of 512 * trying to allocate one. We must get ourselves onto the target cpu 513 * to safely check the MSGF_DONE bit on the message but since the message 514 * will be sent to that cpu anyway this does not add any extra work beyond 515 * what lwkt_sendmsg() would have already had to do to schedule the target 516 * thread. 517 */ 518 static void 519 schednetisr_remote(void *data) 520 { 521 int num = (int)(intptr_t)data; 522 struct netisr *ni = &netisrs[num]; 523 lwkt_port_t port = &netisr_cpu[0].td_msgport; 524 netmsg_base_t pmsg; 525 526 pmsg = &netisrs[num].ni_netmsg; 527 if (pmsg->lmsg.ms_flags & MSGF_DONE) { 528 netmsg_init(pmsg, NULL, &netisr_adone_rport, 0, ni->ni_handler); 529 pmsg->lmsg.u.ms_result = num; 530 lwkt_sendmsg(port, &pmsg->lmsg); 531 } 532 } 533 534 void 535 schednetisr(int num) 536 { 537 KASSERT((num > 0 && num <= NELEM(netisrs)), 538 ("schednetisr: bad isr %d", num)); 539 KKASSERT(netisrs[num].ni_handler != NULL); 540 #ifdef SMP 541 if (mycpu->gd_cpuid != 0) { 542 lwkt_send_ipiq(globaldata_find(0), 543 schednetisr_remote, (void *)(intptr_t)num); 544 } else { 545 crit_enter(); 546 schednetisr_remote((void *)(intptr_t)num); 547 crit_exit(); 548 } 549 #else 550 crit_enter(); 551 schednetisr_remote((void *)(intptr_t)num); 552 crit_exit(); 553 #endif 554 } 555 556 #ifdef SMP 557 558 static void 559 netisr_barrier_dispatch(netmsg_t nmsg) 560 { 561 struct netmsg_barrier *msg = (struct netmsg_barrier *)nmsg; 562 563 atomic_clear_cpumask(msg->br_cpumask, mycpu->gd_cpumask); 564 if (*msg->br_cpumask == 0) 565 wakeup(msg->br_cpumask); 566 567 for (;;) { 568 uint32_t done = msg->br_done; 569 570 cpu_ccfence(); 571 if ((done & NETISR_BR_NOTDONE) == 0) 572 break; 573 574 tsleep_interlock(&msg->br_done, 0); 575 if (atomic_cmpset_int(&msg->br_done, 576 done, done | NETISR_BR_WAITDONE)) 577 tsleep(&msg->br_done, PINTERLOCKED, "nbrdsp", 0); 578 } 579 580 lwkt_replymsg(&nmsg->lmsg, 0); 581 } 582 583 #endif 584 585 struct netisr_barrier * 586 netisr_barrier_create(void) 587 { 588 struct netisr_barrier *br; 589 590 br = kmalloc(sizeof(*br), M_LWKTMSG, M_WAITOK | M_ZERO); 591 return br; 592 } 593 594 void 595 netisr_barrier_set(struct netisr_barrier *br) 596 { 597 #ifdef SMP 598 volatile cpumask_t other_cpumask; 599 int i, cur_cpuid; 600 601 KKASSERT(&curthread->td_msgport == cpu_portfn(0)); 602 KKASSERT(!br->br_isset); 603 604 other_cpumask = mycpu->gd_other_cpus & smp_active_mask; 605 cur_cpuid = mycpuid; 606 607 for (i = 0; i < ncpus; ++i) { 608 struct netmsg_barrier *msg; 609 610 if (i == cur_cpuid) 611 continue; 612 613 msg = kmalloc(sizeof(struct netmsg_barrier), 614 M_LWKTMSG, M_WAITOK); 615 netmsg_init(&msg->base, NULL, &netisr_afree_rport, 616 MSGF_PRIORITY, netisr_barrier_dispatch); 617 msg->br_cpumask = &other_cpumask; 618 msg->br_done = NETISR_BR_NOTDONE; 619 620 KKASSERT(br->br_msgs[i] == NULL); 621 br->br_msgs[i] = msg; 622 } 623 624 for (i = 0; i < ncpus; ++i) { 625 if (i == cur_cpuid) 626 continue; 627 lwkt_sendmsg(cpu_portfn(i), &br->br_msgs[i]->base.lmsg); 628 } 629 630 while (other_cpumask != 0) { 631 tsleep_interlock(&other_cpumask, 0); 632 if (other_cpumask != 0) 633 tsleep(&other_cpumask, PINTERLOCKED, "nbrset", 0); 634 } 635 #endif 636 br->br_isset = 1; 637 } 638 639 void 640 netisr_barrier_rem(struct netisr_barrier *br) 641 { 642 #ifdef SMP 643 int i, cur_cpuid; 644 645 KKASSERT(&curthread->td_msgport == cpu_portfn(0)); 646 KKASSERT(br->br_isset); 647 648 cur_cpuid = mycpuid; 649 for (i = 0; i < ncpus; ++i) { 650 struct netmsg_barrier *msg = br->br_msgs[i]; 651 uint32_t done; 652 653 msg = br->br_msgs[i]; 654 br->br_msgs[i] = NULL; 655 656 if (i == cur_cpuid) 657 continue; 658 659 done = atomic_swap_int(&msg->br_done, 0); 660 if (done & NETISR_BR_WAITDONE) 661 wakeup(&msg->br_done); 662 } 663 #endif 664 br->br_isset = 0; 665 } 666