1 /* 2 * Copyright (c) 2003, 2004 Matthew Dillon. All rights reserved. 3 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 4 * Copyright (c) 2003 Jonathan Lemon. All rights reserved. 5 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 6 * 7 * This code is derived from software contributed to The DragonFly Project 8 * by Jonathan Lemon, Jeffrey M. Hsu, and Matthew Dillon. 9 * 10 * Jonathan Lemon gave Jeffrey Hsu permission to combine his copyright 11 * into this one around July 8 2004. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of The DragonFly Project nor the names of its 22 * contributors may be used to endorse or promote products derived 23 * from this software without specific, prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 28 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/net/netisr.c,v 1.49 2008/11/01 10:29:31 sephe Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/msgport.h>
#include <sys/proc.h>
#include <sys/interrupt.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/socketvar.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/netisr.h>
#include <machine/cpufunc.h>

#include <sys/thread2.h>
#include <sys/msgport2.h>
#include <net/netmsg2.h>
#include <sys/mplock2.h>

static void netmsg_sync_func(netmsg_t msg);
static void netmsg_service_loop(void *arg);
static void cpu0_cpufn(struct mbuf **mp, int hoff);

/*
 * One entry per message port handed to netmsg_service_port_init().
 * Entries are linked on netreglist and walked by netmsg_service_sync().
 */
struct netmsg_port_registration {
	TAILQ_ENTRY(netmsg_port_registration) npr_entry;
	lwkt_port_t	npr_port;
};

/*
 * One entry per rollup function handed to netisr_register_rollup().
 * Entries are linked on netrulist; netmsg_service_loop() calls every
 * ru_func after it has processed a batch of messages.
 */
struct netmsg_rollup {
	TAILQ_ENTRY(netmsg_rollup) ru_entry;
	netisr_ru_t	ru_func;
};

/* Registered netisrs, indexed by isr number (see netisr_register()). */
static struct netisr netisrs[NETISR_MAX];
/* Ports recorded by netmsg_service_port_init(). */
static TAILQ_HEAD(,netmsg_port_registration) netreglist;
/* Rollup functions recorded by netisr_register_rollup(). */
static TAILQ_HEAD(,netmsg_rollup) netrulist;

/* Per-CPU thread to handle any protocol.
*/ 80 static struct thread netisr_cpu[MAXCPU]; 81 lwkt_port netisr_afree_rport; 82 lwkt_port netisr_afree_free_so_rport; 83 lwkt_port netisr_adone_rport; 84 lwkt_port netisr_apanic_rport; 85 lwkt_port netisr_sync_port; 86 87 static int (*netmsg_fwd_port_fn)(lwkt_port_t, lwkt_msg_t); 88 89 SYSCTL_NODE(_net, OID_AUTO, netisr, CTLFLAG_RW, 0, "netisr"); 90 91 /* 92 * netisr_afree_rport replymsg function, only used to handle async 93 * messages which the sender has abandoned to their fate. 94 */ 95 static void 96 netisr_autofree_reply(lwkt_port_t port, lwkt_msg_t msg) 97 { 98 kfree(msg, M_LWKTMSG); 99 } 100 101 static void 102 netisr_autofree_free_so_reply(lwkt_port_t port, lwkt_msg_t msg) 103 { 104 sofree(((netmsg_t)msg)->base.nm_so); 105 kfree(msg, M_LWKTMSG); 106 } 107 108 /* 109 * We need a custom putport function to handle the case where the 110 * message target is the current thread's message port. This case 111 * can occur when the TCP or UDP stack does a direct callback to NFS and NFS 112 * then turns around and executes a network operation synchronously. 113 * 114 * To prevent deadlocking, we must execute these self-referential messages 115 * synchronously, effectively turning the message into a glorified direct 116 * procedure call back into the protocol stack. The operation must be 117 * complete on return or we will deadlock, so panic if it isn't. 118 * 119 * However, the target function is under no obligation to immediately 120 * reply the message. It may forward it elsewhere. 
121 */ 122 static int 123 netmsg_put_port(lwkt_port_t port, lwkt_msg_t lmsg) 124 { 125 netmsg_base_t nmsg = (void *)lmsg; 126 127 if ((lmsg->ms_flags & MSGF_SYNC) && port == &curthread->td_msgport) { 128 nmsg->nm_dispatch((netmsg_t)nmsg); 129 return(EASYNC); 130 } else { 131 return(netmsg_fwd_port_fn(port, lmsg)); 132 } 133 } 134 135 /* 136 * UNIX DOMAIN sockets still have to run their uipc functions synchronously, 137 * because they depend on the user proc context for a number of things 138 * (like creds) which we have not yet incorporated into the message structure. 139 * 140 * However, we maintain or message/port abstraction. Having a special 141 * synchronous port which runs the commands synchronously gives us the 142 * ability to serialize operations in one place later on when we start 143 * removing the BGL. 144 */ 145 static int 146 netmsg_sync_putport(lwkt_port_t port, lwkt_msg_t lmsg) 147 { 148 netmsg_base_t nmsg = (void *)lmsg; 149 150 KKASSERT((lmsg->ms_flags & MSGF_DONE) == 0); 151 152 lmsg->ms_target_port = port; /* required for abort */ 153 nmsg->nm_dispatch((netmsg_t)nmsg); 154 return(EASYNC); 155 } 156 157 static void 158 netisr_init(void) 159 { 160 int i; 161 162 TAILQ_INIT(&netreglist); 163 TAILQ_INIT(&netrulist); 164 165 /* 166 * Create default per-cpu threads for generic protocol handling. 167 */ 168 for (i = 0; i < ncpus; ++i) { 169 lwkt_create(netmsg_service_loop, NULL, NULL, 170 &netisr_cpu[i], TDF_STOPREQ, i, 171 "netisr_cpu %d", i); 172 netmsg_service_port_init(&netisr_cpu[i].td_msgport); 173 lwkt_schedule(&netisr_cpu[i]); 174 } 175 176 /* 177 * The netisr_afree_rport is a special reply port which automatically 178 * frees the replied message. The netisr_adone_rport simply marks 179 * the message as being done. The netisr_apanic_rport panics if 180 * the message is replied to. 
181 */ 182 lwkt_initport_replyonly(&netisr_afree_rport, netisr_autofree_reply); 183 lwkt_initport_replyonly(&netisr_afree_free_so_rport, 184 netisr_autofree_free_so_reply); 185 lwkt_initport_replyonly_null(&netisr_adone_rport); 186 lwkt_initport_panic(&netisr_apanic_rport); 187 188 /* 189 * The netisr_syncport is a special port which executes the message 190 * synchronously and waits for it if EASYNC is returned. 191 */ 192 lwkt_initport_putonly(&netisr_sync_port, netmsg_sync_putport); 193 } 194 195 SYSINIT(netisr, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, netisr_init, NULL); 196 197 /* 198 * Finish initializing the message port for a netmsg service. This also 199 * registers the port for synchronous cleanup operations such as when an 200 * ifnet is being destroyed. There is no deregistration API yet. 201 */ 202 void 203 netmsg_service_port_init(lwkt_port_t port) 204 { 205 struct netmsg_port_registration *reg; 206 207 /* 208 * Override the putport function. Our custom function checks for 209 * self-references and executes such commands synchronously. 210 */ 211 if (netmsg_fwd_port_fn == NULL) 212 netmsg_fwd_port_fn = port->mp_putport; 213 KKASSERT(netmsg_fwd_port_fn == port->mp_putport); 214 port->mp_putport = netmsg_put_port; 215 216 /* 217 * Keep track of ports using the netmsg API so we can synchronize 218 * certain operations (such as freeing an ifnet structure) across all 219 * consumers. 220 */ 221 reg = kmalloc(sizeof(*reg), M_TEMP, M_WAITOK|M_ZERO); 222 reg->npr_port = port; 223 TAILQ_INSERT_TAIL(&netreglist, reg, npr_entry); 224 } 225 226 /* 227 * This function synchronizes the caller with all netmsg services. For 228 * example, if an interface is being removed we must make sure that all 229 * packets related to that interface complete processing before the structure 230 * can actually be freed. 
This sort of synchronization is an alternative to 231 * ref-counting the netif, removing the ref counting overhead in favor of 232 * placing additional overhead in the netif freeing sequence (where it is 233 * inconsequential). 234 */ 235 void 236 netmsg_service_sync(void) 237 { 238 struct netmsg_port_registration *reg; 239 struct netmsg_base smsg; 240 241 netmsg_init(&smsg, NULL, &curthread->td_msgport, 0, netmsg_sync_func); 242 243 TAILQ_FOREACH(reg, &netreglist, npr_entry) { 244 lwkt_domsg(reg->npr_port, &smsg.lmsg, 0); 245 } 246 } 247 248 /* 249 * The netmsg function simply replies the message. API semantics require 250 * EASYNC to be returned if the netmsg function disposes of the message. 251 */ 252 static void 253 netmsg_sync_func(netmsg_t msg) 254 { 255 lwkt_replymsg(&msg->lmsg, 0); 256 } 257 258 /* 259 * Generic netmsg service loop. Some protocols may roll their own but all 260 * must do the basic command dispatch function call done here. 261 */ 262 static void 263 netmsg_service_loop(void *arg) 264 { 265 struct netmsg_rollup *ru; 266 netmsg_base_t msg; 267 thread_t td = curthread;; 268 int limit; 269 270 while ((msg = lwkt_waitport(&td->td_msgport, 0))) { 271 /* 272 * Run up to 512 pending netmsgs. 273 */ 274 limit = 512; 275 do { 276 KASSERT(msg->nm_dispatch != NULL, 277 ("netmsg_service isr %d badmsg\n", 278 msg->lmsg.u.ms_result)); 279 if (msg->nm_so && 280 msg->nm_so->so_port != &td->td_msgport) { 281 /* 282 * Sockets undergoing connect or disconnect 283 * ops can change ports on us. Chase the 284 * port. 285 */ 286 kprintf("netmsg_service_loop: Warning, " 287 "port changed so=%p\n", msg->nm_so); 288 lwkt_forwardmsg(msg->nm_so->so_port, 289 &msg->lmsg); 290 } else { 291 /* 292 * We are on the correct port, dispatch it. 293 */ 294 msg->nm_dispatch((netmsg_t)msg); 295 } 296 if (--limit == 0) 297 break; 298 } while ((msg = lwkt_getport(&td->td_msgport)) != NULL); 299 300 /* 301 * Run all registered rollup functions for this cpu 302 * (e.g. 
tcp_willblock()). 303 */ 304 TAILQ_FOREACH(ru, &netrulist, ru_entry) 305 ru->ru_func(); 306 } 307 } 308 309 /* 310 * Forward a packet to a netisr service function. 311 * 312 * If the packet has not been assigned to a protocol thread we call 313 * the port characterization function to assign it. The caller must 314 * clear M_HASH (or not have set it in the first place) if the caller 315 * wishes the packet to be recharacterized. 316 */ 317 int 318 netisr_queue(int num, struct mbuf *m) 319 { 320 struct netisr *ni; 321 struct netmsg_packet *pmsg; 322 lwkt_port_t port; 323 324 KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), 325 ("Bad isr %d", num)); 326 327 ni = &netisrs[num]; 328 if (ni->ni_handler == NULL) { 329 kprintf("Unregistered isr %d\n", num); 330 m_freem(m); 331 return (EIO); 332 } 333 334 /* 335 * Figure out which protocol thread to send to. This does not 336 * have to be perfect but performance will be really good if it 337 * is correct. Major protocol inputs such as ip_input() will 338 * re-characterize the packet as necessary. 339 */ 340 if ((m->m_flags & M_HASH) == 0) { 341 ni->ni_cpufn(&m, 0); 342 if (m == NULL) { 343 m_freem(m); 344 return (EIO); 345 } 346 if ((m->m_flags & M_HASH) == 0) { 347 kprintf("netisr_queue(%d): packet hash failed\n", num); 348 m_freem(m); 349 return (EIO); 350 } 351 } 352 353 /* 354 * Get the protocol port based on the packet hash, initialize 355 * the netmsg, and send it off. 356 */ 357 port = cpu_portfn(m->m_pkthdr.hash); 358 pmsg = &m->m_hdr.mh_netmsg; 359 netmsg_init(&pmsg->base, NULL, &netisr_apanic_rport, 360 0, ni->ni_handler); 361 pmsg->nm_packet = m; 362 pmsg->base.lmsg.u.ms_result = num; 363 lwkt_sendmsg(port, &pmsg->base.lmsg); 364 365 return (0); 366 } 367 368 /* 369 * Pre-characterization of a deeper portion of the packet for the 370 * requested isr. 371 * 372 * The base of the ISR type (e.g. IP) that we want to characterize is 373 * at (hoff) relative to the beginning of the mbuf. 
This allows 374 * e.g. ether_input_chain() to not have to adjust the m_data/m_len. 375 */ 376 void 377 netisr_characterize(int num, struct mbuf **mp, int hoff) 378 { 379 struct netisr *ni; 380 struct mbuf *m; 381 382 /* 383 * Validation 384 */ 385 m = *mp; 386 KKASSERT(m != NULL); 387 388 if (num < 0 || num >= NETISR_MAX) { 389 if (num == NETISR_MAX) { 390 m->m_flags |= M_HASH; 391 m->m_pkthdr.hash = 0; 392 return; 393 } 394 panic("Bad isr %d", num); 395 } 396 397 /* 398 * Valid netisr? 399 */ 400 ni = &netisrs[num]; 401 if (ni->ni_handler == NULL) { 402 kprintf("Unregistered isr %d\n", num); 403 m_freem(m); 404 *mp = NULL; 405 } 406 407 /* 408 * Characterize the packet 409 */ 410 if ((m->m_flags & M_HASH) == 0) { 411 ni->ni_cpufn(mp, hoff); 412 m = *mp; 413 if (m && (m->m_flags & M_HASH) == 0) 414 kprintf("netisr_queue(%d): packet hash failed\n", num); 415 } 416 } 417 418 void 419 netisr_register(int num, netisr_fn_t handler, netisr_cpufn_t cpufn) 420 { 421 struct netisr *ni; 422 423 KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), 424 ("netisr_register: bad isr %d", num)); 425 KKASSERT(handler != NULL); 426 427 if (cpufn == NULL) 428 cpufn = cpu0_cpufn; 429 430 ni = &netisrs[num]; 431 432 ni->ni_handler = handler; 433 ni->ni_cpufn = cpufn; 434 netmsg_init(&ni->ni_netmsg, NULL, &netisr_adone_rport, 0, NULL); 435 } 436 437 void 438 netisr_register_rollup(netisr_ru_t ru_func) 439 { 440 struct netmsg_rollup *ru; 441 442 ru = kmalloc(sizeof(*ru), M_TEMP, M_WAITOK|M_ZERO); 443 ru->ru_func = ru_func; 444 TAILQ_INSERT_TAIL(&netrulist, ru, ru_entry); 445 } 446 447 /* 448 * Return the message port for the general protocol message servicing 449 * thread for a particular cpu. 450 */ 451 lwkt_port_t 452 cpu_portfn(int cpu) 453 { 454 KKASSERT(cpu >= 0 && cpu < ncpus); 455 return (&netisr_cpu[cpu].td_msgport); 456 } 457 458 /* 459 * Return the current cpu's network protocol thread. 
460 */ 461 lwkt_port_t 462 cur_netport(void) 463 { 464 return(cpu_portfn(mycpu->gd_cpuid)); 465 } 466 467 /* 468 * Return a default protocol control message processing thread port 469 */ 470 lwkt_port_t 471 cpu0_ctlport(int cmd __unused, struct sockaddr *sa __unused, 472 void *extra __unused) 473 { 474 return (&netisr_cpu[0].td_msgport); 475 } 476 477 /* 478 * This is a default netisr packet characterization function which 479 * sets M_HASH. If a netisr is registered with a NULL cpufn function 480 * this one is assigned. 481 * 482 * This function makes no attempt to validate the packet. 483 */ 484 static void 485 cpu0_cpufn(struct mbuf **mp, int hoff __unused) 486 { 487 struct mbuf *m = *mp; 488 489 m->m_flags |= M_HASH; 490 m->m_pkthdr.hash = 0; 491 } 492 493 /* 494 * schednetisr() is used to call the netisr handler from the appropriate 495 * netisr thread for polling and other purposes. 496 * 497 * This function may be called from a hard interrupt or IPI and must be 498 * MP SAFE and non-blocking. We use a fixed per-cpu message instead of 499 * trying to allocate one. We must get ourselves onto the target cpu 500 * to safely check the MSGF_DONE bit on the message but since the message 501 * will be sent to that cpu anyway this does not add any extra work beyond 502 * what lwkt_sendmsg() would have already had to do to schedule the target 503 * thread. 
504 */ 505 static void 506 schednetisr_remote(void *data) 507 { 508 int num = (int)(intptr_t)data; 509 struct netisr *ni = &netisrs[num]; 510 lwkt_port_t port = &netisr_cpu[0].td_msgport; 511 netmsg_base_t pmsg; 512 513 pmsg = &netisrs[num].ni_netmsg; 514 if (pmsg->lmsg.ms_flags & MSGF_DONE) { 515 netmsg_init(pmsg, NULL, &netisr_adone_rport, 0, ni->ni_handler); 516 pmsg->lmsg.u.ms_result = num; 517 lwkt_sendmsg(port, &pmsg->lmsg); 518 } 519 } 520 521 void 522 schednetisr(int num) 523 { 524 KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), 525 ("schednetisr: bad isr %d", num)); 526 KKASSERT(netisrs[num].ni_handler != NULL); 527 #ifdef SMP 528 if (mycpu->gd_cpuid != 0) { 529 lwkt_send_ipiq(globaldata_find(0), 530 schednetisr_remote, (void *)(intptr_t)num); 531 } else { 532 crit_enter(); 533 schednetisr_remote((void *)(intptr_t)num); 534 crit_exit(); 535 } 536 #else 537 crit_enter(); 538 schednetisr_remote((void *)(intptr_t)num); 539 crit_exit(); 540 #endif 541 } 542