1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * $DragonFly: src/sys/netinet/ip_demux.c,v 1.45 2008/11/11 10:46:58 sephe Exp $ 34 */ 35 36 #include "opt_inet.h" 37 #include "opt_rss.h" 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/kernel.h> 42 #include <sys/socket.h> 43 #include <sys/socketvar.h> 44 #include <sys/thread.h> 45 #include <sys/sysctl.h> 46 #include <sys/globaldata.h> 47 48 #include <net/if.h> 49 #include <net/netisr.h> 50 #ifdef RSS 51 #include <net/toeplitz2.h> 52 #endif 53 54 #include <netinet/in_systm.h> 55 #include <netinet/in.h> 56 #include <netinet/in_var.h> 57 #include <netinet/in_pcb.h> 58 #include <netinet/ip.h> 59 #include <netinet/ip_var.h> 60 #include <netinet/tcp.h> 61 #include <netinet/tcpip.h> 62 #include <netinet/tcp_var.h> 63 #include <netinet/udp.h> 64 #include <netinet/udp_var.h> 65 66 extern struct thread netisr_cpu[]; 67 extern int udp_mpsafe_thread; 68 69 static struct thread tcp_thread[MAXCPU]; 70 static struct thread udp_thread[MAXCPU]; 71 72 #ifndef RSS 73 74 static __inline int 75 INP_MPORT_HASH(in_addr_t faddr, in_addr_t laddr, 76 in_port_t fport, in_port_t lport) 77 { 78 /* 79 * Use low order bytes. 80 */ 81 82 #if (BYTE_ORDER == LITTLE_ENDIAN) 83 KASSERT(ncpus2 < 256, ("need different hash function")); /* XXX JH */ 84 return (((faddr >> 24) ^ (fport >> 8) ^ (laddr >> 24) ^ (lport >> 8)) & 85 ncpus2_mask); 86 #else 87 return ((faddr ^ fport ^ laddr ^ lport) & ncpus2_mask); 88 #endif 89 } 90 91 #endif /* !RSS */ 92 93 static __inline int 94 INP_MPORT_HASH_UDP(in_addr_t faddr, in_addr_t laddr, 95 in_port_t fport, in_port_t lport) 96 { 97 #ifndef RSS 98 return INP_MPORT_HASH(faddr, laddr, fport, lport); 99 #else 100 return toeplitz_hash(toeplitz_rawhash_addr(faddr, laddr)); 101 #endif 102 } 103 104 static __inline int 105 INP_MPORT_HASH_TCP(in_addr_t faddr, in_addr_t laddr, 106 in_port_t fport, in_port_t lport) 107 { 108 #ifndef RSS 109 return INP_MPORT_HASH(faddr, laddr, fport, lport); 110 #else 111 return toeplitz_hash( 112 toeplitz_rawhash_addrport(faddr, laddr, fport, lport)); 113 #endif 114 } 115 116 /* 117 * If the packet is a valid IP datagram, upon returning of this function 118 * following things are promised: 119 * 120 * o IP header (including any possible IP options) is in one mbuf (m_len). 121 * o IP header length is not less than the minimum (sizeof(struct ip)). 122 * o IP total length is not less than IP header length. 123 * o IP datagram resides completely in the mbuf chain, 124 * i.e. pkthdr.len >= IP total length. 125 * 126 * If the packet is a UDP datagram, 127 * o IP header (including any possible IP options) and UDP header are in 128 * one mbuf (m_len). 129 * o IP total length is not less than (IP header length + UDP header length). 130 * 131 * If the packet is a TCP segment, 132 * o IP header (including any possible IP options) and TCP header (including 133 * any possible TCP options) are in one mbuf (m_len). 134 * o TCP header length is not less than the minimum (sizeof(struct tcphdr)). 135 * o IP total length is not less than (IP header length + TCP header length). 136 */ 137 boolean_t 138 ip_lengthcheck(struct mbuf **mp) 139 { 140 struct mbuf *m = *mp; 141 struct ip *ip; 142 int iphlen, iplen; 143 struct tcphdr *th; 144 int thoff; /* TCP data offset */ 145 146 /* The packet must be at least the size of an IP header. */ 147 if (m->m_pkthdr.len < sizeof(struct ip)) { 148 ipstat.ips_tooshort++; 149 goto fail; 150 } 151 152 /* The fixed IP header must reside completely in the first mbuf. */ 153 if (m->m_len < sizeof(struct ip)) { 154 m = m_pullup(m, sizeof(struct ip)); 155 if (m == NULL) { 156 ipstat.ips_toosmall++; 157 goto fail; 158 } 159 } 160 161 ip = mtod(m, struct ip *); 162 163 /* Bound check the packet's stated IP header length. */ 164 iphlen = ip->ip_hl << 2; 165 if (iphlen < sizeof(struct ip)) { /* minimum header length */ 166 ipstat.ips_badhlen++; 167 goto fail; 168 } 169 170 /* The full IP header must reside completely in the one mbuf. */ 171 if (m->m_len < iphlen) { 172 m = m_pullup(m, iphlen); 173 if (m == NULL) { 174 ipstat.ips_badhlen++; 175 goto fail; 176 } 177 ip = mtod(m, struct ip *); 178 } 179 180 iplen = ntohs(ip->ip_len); 181 182 /* 183 * Check that the amount of data in the buffers is as 184 * at least much as the IP header would have us expect. 185 */ 186 if (m->m_pkthdr.len < iplen) { 187 ipstat.ips_tooshort++; 188 goto fail; 189 } 190 191 /* 192 * Fragments other than the first fragment don't have much 193 * length information. 194 */ 195 if (ntohs(ip->ip_off) & IP_OFFMASK) 196 goto ipcheckonly; 197 198 /* 199 * The TCP/IP or UDP/IP header must be entirely contained within 200 * the first fragment of a packet. Packet filters will break if they 201 * aren't. 202 * 203 * Since the packet will be trimmed to ip_len we must also make sure 204 * the potentially trimmed down length is still sufficient to hold 205 * the header(s). 206 */ 207 switch (ip->ip_p) { 208 case IPPROTO_TCP: 209 if (iplen < iphlen + sizeof(struct tcphdr)) { 210 ++tcpstat.tcps_rcvshort; 211 goto fail; 212 } 213 if (m->m_len < iphlen + sizeof(struct tcphdr)) { 214 m = m_pullup(m, iphlen + sizeof(struct tcphdr)); 215 if (m == NULL) { 216 tcpstat.tcps_rcvshort++; 217 goto fail; 218 } 219 ip = mtod(m, struct ip *); 220 } 221 th = (struct tcphdr *)((caddr_t)ip + iphlen); 222 thoff = th->th_off << 2; 223 if (thoff < sizeof(struct tcphdr) || 224 thoff + iphlen > ntohs(ip->ip_len)) { 225 tcpstat.tcps_rcvbadoff++; 226 goto fail; 227 } 228 if (m->m_len < iphlen + thoff) { 229 m = m_pullup(m, iphlen + thoff); 230 if (m == NULL) { 231 tcpstat.tcps_rcvshort++; 232 goto fail; 233 } 234 } 235 break; 236 case IPPROTO_UDP: 237 if (iplen < iphlen + sizeof(struct udphdr)) { 238 ++udpstat.udps_hdrops; 239 goto fail; 240 } 241 if (m->m_len < iphlen + sizeof(struct udphdr)) { 242 m = m_pullup(m, iphlen + sizeof(struct udphdr)); 243 if (m == NULL) { 244 udpstat.udps_hdrops++; 245 goto fail; 246 } 247 } 248 break; 249 default: 250 ipcheckonly: 251 if (iplen < iphlen) { 252 ++ipstat.ips_badlen; 253 goto fail; 254 } 255 break; 256 } 257 258 m->m_flags |= M_LENCHECKED; 259 *mp = m; 260 return TRUE; 261 262 fail: 263 if (m != NULL) 264 m_freem(m); 265 *mp = NULL; 266 return FALSE; 267 } 268 269 /* 270 * Map a packet to a protocol processing thread and return the thread's port. 271 * If an error occurs, the passed mbuf will be freed, *mptr will be set 272 * to NULL, and NULL will be returned. If no error occurs, the passed mbuf 273 * may be modified and a port pointer will be returned. 274 */ 275 lwkt_port_t 276 ip_mport(struct mbuf **mptr, int dir) 277 { 278 struct ip *ip; 279 int iphlen; 280 struct tcphdr *th; 281 struct udphdr *uh; 282 struct mbuf *m; 283 int thoff; /* TCP data offset */ 284 lwkt_port_t port; 285 int cpu; 286 287 if (!ip_lengthcheck(mptr)) 288 return (NULL); 289 290 m = *mptr; 291 ip = mtod(m, struct ip *); 292 iphlen = ip->ip_hl << 2; 293 294 /* 295 * XXX generic packet handling defrag on CPU 0 for now. 296 */ 297 if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { 298 cpu = 0; 299 port = &netisr_cpu[cpu].td_msgport; 300 goto back; 301 } 302 303 switch (ip->ip_p) { 304 case IPPROTO_TCP: 305 th = (struct tcphdr *)((caddr_t)ip + iphlen); 306 thoff = th->th_off << 2; 307 cpu = INP_MPORT_HASH_TCP(ip->ip_src.s_addr, ip->ip_dst.s_addr, 308 th->th_sport, th->th_dport); 309 port = &tcp_thread[cpu].td_msgport; 310 break; 311 312 case IPPROTO_UDP: 313 uh = (struct udphdr *)((caddr_t)ip + iphlen); 314 315 #ifndef RSS 316 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 317 (dir == IP_MPORT_IN && 318 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))) { 319 cpu = 0; 320 } else 321 #endif 322 { 323 cpu = INP_MPORT_HASH_UDP(ip->ip_src.s_addr, 324 ip->ip_dst.s_addr, uh->uh_sport, uh->uh_dport); 325 } 326 port = &udp_thread[cpu].td_msgport; 327 break; 328 329 default: 330 cpu = 0; 331 port = &netisr_cpu[cpu].td_msgport; 332 break; 333 } 334 back: 335 m->m_flags |= M_HASH; 336 m->m_pkthdr.hash = cpu; 337 return (port); 338 } 339 340 lwkt_port_t 341 ip_mport_in(struct mbuf **mptr) 342 { 343 return ip_mport(mptr, IP_MPORT_IN); 344 } 345 346 /* 347 * Map a packet to a protocol processing thread and return the thread's port. 348 * Unlike ip_mport(), the packet content is not accessed. The packet info 349 * (pi) and the hash of the packet (m_pkthdr.hash) is used instead. NULL is 350 * returned if the packet info does not contain enough information. 351 * 352 * Caller has already made sure that m_pkthdr.hash is valid, i.e. m_flags 353 * has M_HASH set. 354 */ 355 lwkt_port_t 356 ip_mport_pktinfo(const struct pktinfo *pi, struct mbuf *m) 357 { 358 lwkt_port_t port; 359 360 KASSERT(m->m_pkthdr.hash < ncpus2, 361 ("invalid packet hash %#x\n", m->m_pkthdr.hash)); 362 363 /* 364 * XXX generic packet handling defrag on CPU 0 for now. 365 */ 366 if (pi->pi_flags & PKTINFO_FLAG_FRAG) { 367 m->m_pkthdr.hash = 0; 368 return &netisr_cpu[0].td_msgport; 369 } 370 371 switch (pi->pi_l3proto) { 372 case IPPROTO_TCP: 373 port = &tcp_thread[m->m_pkthdr.hash].td_msgport; 374 break; 375 376 case IPPROTO_UDP: 377 port = &udp_thread[m->m_pkthdr.hash].td_msgport; 378 break; 379 380 default: 381 port = NULL; 382 break; 383 } 384 return port; 385 } 386 387 /* 388 * Map a TCP socket to a protocol processing thread. 389 */ 390 lwkt_port_t 391 tcp_soport(struct socket *so, struct sockaddr *nam __unused, 392 struct mbuf **dummy __unused, int req) 393 { 394 struct inpcb *inp; 395 396 /* The following processing all take place on Protocol Thread 0. */ 397 if (req == PRU_BIND || req == PRU_CONNECT || req == PRU_ATTACH || 398 req == PRU_LISTEN) 399 return (&tcp_thread[0].td_msgport); 400 401 inp = so->so_pcb; 402 if (!inp) /* connection reset by peer */ 403 return (&tcp_thread[0].td_msgport); 404 405 /* 406 * Already bound and connected or listening. For TCP connections, 407 * the (faddr, fport, laddr, lport) association cannot change now. 408 * 409 * Note: T/TCP code needs some reorganization to fit into 410 * this model. XXX JH 411 * 412 * Rely on type-stable memory and check in protocol handler 413 * to fix race condition here w/ deallocation of inp. XXX JH 414 */ 415 return (&tcp_thread[INP_MPORT_HASH_TCP(inp->inp_faddr.s_addr, 416 inp->inp_laddr.s_addr, inp->inp_fport, inp->inp_lport)].td_msgport); 417 } 418 419 /* 420 * Used to route icmp messages to the proper protocol thread for ctlinput 421 * operation. 422 */ 423 lwkt_port_t 424 tcp_ctlport(int cmd, struct sockaddr *sa, void *vip) 425 { 426 struct ip *ip = vip; 427 struct tcphdr *th; 428 struct in_addr faddr; 429 int cpu; 430 431 faddr = ((struct sockaddr_in *)sa)->sin_addr; 432 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 433 return(NULL); 434 if (ip == NULL || PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { 435 /* 436 * Message will be forwarded to all TCP protocol threads 437 * in following way: 438 * 439 * netisr0 (the msgport we return here) 440 * | 441 * | 442 * | domsg <----------------------------+ 443 * | | 444 * | | replymsg 445 * | | 446 * V forwardmsg forwardmsg | 447 * tcp0 ------------> tcp1 ------------> tcpN 448 */ 449 return cpu0_ctlport(cmd, sa, vip); 450 } else { 451 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 452 cpu = tcp_addrcpu(faddr.s_addr, th->th_dport, 453 ip->ip_src.s_addr, th->th_sport); 454 } 455 return(&tcp_thread[cpu].td_msgport); 456 } 457 458 lwkt_port_t 459 tcp_addrport(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) 460 { 461 return (&tcp_thread[tcp_addrcpu(faddr, fport, 462 laddr, lport)].td_msgport); 463 } 464 465 /* 466 * Map a UDP socket to a protocol processing thread. 467 */ 468 lwkt_port_t 469 udp_soport(struct socket *so, struct sockaddr *nam __unused, 470 struct mbuf **dummy __unused, int req) 471 { 472 struct inpcb *inp; 473 474 /* 475 * The following processing all take place on Protocol Thread 0: 476 * bind() 477 * attach() has a null socket parameter 478 * Fast and slow timeouts pass in null socket parameter 479 */ 480 if (req == PRU_BIND || so == NULL) 481 return (&udp_thread[0].td_msgport); 482 483 inp = so->so_pcb; 484 485 #ifndef RSS 486 if (IN_MULTICAST(ntohl(inp->inp_laddr.s_addr))) 487 return (&udp_thread[0].td_msgport); 488 #endif 489 490 /* 491 * Rely on type-stable memory and check in protocol handler 492 * to fix race condition here w/ deallocation of inp. XXX JH 493 */ 494 495 return (&udp_thread[INP_MPORT_HASH_UDP(inp->inp_faddr.s_addr, 496 inp->inp_laddr.s_addr, inp->inp_fport, inp->inp_lport)].td_msgport); 497 } 498 499 /* 500 * Used to route icmp messages to the proper protocol thread for ctlinput 501 * operation. 502 */ 503 lwkt_port_t 504 udp_ctlport(int cmd, struct sockaddr *sa, void *vip) 505 { 506 struct ip *ip = vip; 507 struct udphdr *uh; 508 struct in_addr faddr; 509 int cpu; 510 511 faddr = ((struct sockaddr_in *)sa)->sin_addr; 512 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 513 return(NULL); 514 if (PRC_IS_REDIRECT(cmd)) { 515 /* 516 * See the comment in tcp_ctlport; the only difference 517 * is that message is forwarded to UDP protocol theads. 518 */ 519 return cpu0_ctlport(cmd, sa, vip); 520 } else if (ip == NULL || cmd == PRC_HOSTDEAD) { 521 /* 522 * XXX 523 * Once UDP inpcbs are CPU localized, we should do 524 * the same forwarding as PRC_IS_REDIRECT(cmd) 525 */ 526 cpu = 0; 527 } else { 528 uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 529 530 cpu = INP_MPORT_HASH_UDP(faddr.s_addr, ip->ip_src.s_addr, 531 uh->uh_dport, uh->uh_sport); 532 } 533 return (&udp_thread[cpu].td_msgport); 534 } 535 536 /* 537 * Map a network address to a processor. 538 */ 539 int 540 tcp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) 541 { 542 return (INP_MPORT_HASH_TCP(faddr, laddr, fport, lport)); 543 } 544 545 int 546 udp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) 547 { 548 #ifndef RSS 549 if (IN_MULTICAST(ntohl(laddr))) 550 return (0); 551 else 552 #endif 553 return (INP_MPORT_HASH_UDP(faddr, laddr, fport, lport)); 554 } 555 556 /* 557 * Return LWKT port for cpu. 558 */ 559 lwkt_port_t 560 tcp_cport(int cpu) 561 { 562 return (&tcp_thread[cpu].td_msgport); 563 } 564 565 lwkt_port_t 566 udp_cport(int cpu) 567 { 568 return (&udp_thread[cpu].td_msgport); 569 } 570 571 void 572 tcp_thread_init(void) 573 { 574 int cpu; 575 576 for (cpu = 0; cpu < ncpus2; cpu++) { 577 lwkt_create(tcpmsg_service_loop, NULL, NULL, 578 &tcp_thread[cpu], TDF_NETWORK | TDF_MPSAFE, cpu, 579 "tcp_thread %d", cpu); 580 netmsg_service_port_init(&tcp_thread[cpu].td_msgport); 581 } 582 } 583 584 void 585 udp_thread_init(void) 586 { 587 int cpu; 588 589 for (cpu = 0; cpu < ncpus2; cpu++) { 590 lwkt_create(netmsg_service_loop, &udp_mpsafe_thread, NULL, 591 &udp_thread[cpu], TDF_NETWORK | TDF_MPSAFE, cpu, 592 "udp_thread %d", cpu); 593 netmsg_service_port_init(&udp_thread[cpu].td_msgport); 594 } 595 } 596