1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 #include "opt_inet.h" 35 #include "opt_rss.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/socket.h> 41 #include <sys/socketvar.h> 42 #include <sys/thread.h> 43 #include <sys/sysctl.h> 44 #include <sys/globaldata.h> 45 46 #include <net/if.h> 47 #include <net/netisr2.h> 48 #include <net/toeplitz2.h> 49 50 #include <netinet/in_systm.h> 51 #include <netinet/in.h> 52 #include <netinet/in_var.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/ip.h> 55 #include <netinet/ip_var.h> 56 #include <netinet/tcp.h> 57 #include <netinet/tcpip.h> 58 #include <netinet/tcp_var.h> 59 #include <netinet/udp.h> 60 #include <netinet/udp_var.h> 61 62 /* 63 * Toeplitz hash functions - the idea is to match the hardware. 64 */ 65 static __inline int 66 INP_MPORT_HASH_UDP(in_addr_t faddr, in_addr_t laddr, 67 in_port_t fport, in_port_t lport) 68 { 69 return toeplitz_hash(toeplitz_rawhash_addr(faddr, laddr)); 70 } 71 72 static __inline int 73 INP_MPORT_HASH_TCP(in_addr_t faddr, in_addr_t laddr, 74 in_port_t fport, in_port_t lport) 75 { 76 return toeplitz_hash( 77 toeplitz_rawhash_addrport(faddr, laddr, fport, lport)); 78 } 79 80 /* 81 * Map a network address to a processor. 82 */ 83 int 84 tcp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) 85 { 86 return (netisr_hashcpu(INP_MPORT_HASH_TCP(faddr, laddr, fport, lport))); 87 } 88 89 /* 90 * Not implemented yet, use protocol thread 0 91 */ 92 int 93 udp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) 94 { 95 #ifdef notyet 96 return (netisr_hashcpu(INP_MPORT_HASH_UDP(faddr, laddr, fport, lport))); 97 #else 98 return 0; 99 #endif 100 } 101 102 int 103 udp_addrcpu_pkt(in_addr_t faddr, in_port_t fport, in_addr_t laddr, 104 in_port_t lport) 105 { 106 if (IN_MULTICAST(ntohl(faddr))) { 107 /* XXX handle multicast on CPU0 for now */ 108 return 0; 109 } 110 return (netisr_hashcpu(INP_MPORT_HASH_UDP(faddr, laddr, fport, lport))); 111 } 112 113 /* 114 * If the packet is a valid IP datagram, upon returning of this function 115 * following things are promised: 116 * 117 * o IP header (including any possible IP options) and any data preceding 118 * IP header (usually linker layer header) are in one mbuf (m_len). 119 * o IP header length is not less than the minimum (sizeof(struct ip)). 120 * o IP total length is not less than IP header length. 121 * o IP datagram resides completely in the mbuf chain, 122 * i.e. pkthdr.len >= IP total length. 123 * 124 * If the packet is a UDP datagram, 125 * o IP header (including any possible IP options) and UDP header are in 126 * one mbuf (m_len). 127 * o IP total length is not less than (IP header length + UDP header length). 128 * 129 * If the packet is a TCP segment, 130 * o IP header (including any possible IP options) and TCP header (including 131 * any possible TCP options) are in one mbuf (m_len). 132 * o TCP header length is not less than the minimum (sizeof(struct tcphdr)). 133 * o IP total length is not less than (IP header length + TCP header length). 134 */ 135 boolean_t 136 ip_lengthcheck(struct mbuf **mp, int hoff) 137 { 138 struct mbuf *m = *mp; 139 struct ip *ip; 140 int len, iphlen, iplen; 141 struct tcphdr *th; 142 int thoff; /* TCP data offset */ 143 144 len = hoff + sizeof(struct ip); 145 146 /* The packet must be at least the size of an IP header. */ 147 if (m->m_pkthdr.len < len) { 148 ipstat.ips_tooshort++; 149 goto fail; 150 } 151 152 /* The fixed IP header must reside completely in the first mbuf. */ 153 if (m->m_len < len) { 154 m = m_pullup(m, len); 155 if (m == NULL) { 156 ipstat.ips_toosmall++; 157 goto fail; 158 } 159 } 160 161 ip = mtodoff(m, struct ip *, hoff); 162 163 /* Bound check the packet's stated IP header length. */ 164 iphlen = ip->ip_hl << 2; 165 if (iphlen < sizeof(struct ip)) { /* minimum header length */ 166 ipstat.ips_badhlen++; 167 goto fail; 168 } 169 170 /* The full IP header must reside completely in the one mbuf. */ 171 if (m->m_len < hoff + iphlen) { 172 m = m_pullup(m, hoff + iphlen); 173 if (m == NULL) { 174 ipstat.ips_badhlen++; 175 goto fail; 176 } 177 ip = mtodoff(m, struct ip *, hoff); 178 } 179 180 iplen = ntohs(ip->ip_len); 181 182 /* 183 * Check that the amount of data in the buffers is as 184 * at least much as the IP header would have us expect. 185 */ 186 if (m->m_pkthdr.len < hoff + iplen) { 187 ipstat.ips_tooshort++; 188 goto fail; 189 } 190 191 /* 192 * Fragments other than the first fragment don't have much 193 * length information. 194 */ 195 if (ntohs(ip->ip_off) & IP_OFFMASK) 196 goto ipcheckonly; 197 198 /* 199 * The TCP/IP or UDP/IP header must be entirely contained within 200 * the first fragment of a packet. Packet filters will break if they 201 * aren't. 202 * 203 * Since the packet will be trimmed to ip_len we must also make sure 204 * the potentially trimmed down length is still sufficient to hold 205 * the header(s). 206 */ 207 switch (ip->ip_p) { 208 case IPPROTO_TCP: 209 if (iplen < iphlen + sizeof(struct tcphdr)) { 210 ++tcpstat.tcps_rcvshort; 211 goto fail; 212 } 213 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) { 214 m = m_pullup(m, hoff + iphlen + sizeof(struct tcphdr)); 215 if (m == NULL) { 216 tcpstat.tcps_rcvshort++; 217 goto fail; 218 } 219 ip = mtodoff(m, struct ip *, hoff); 220 } 221 th = (struct tcphdr *)((caddr_t)ip + iphlen); 222 thoff = th->th_off << 2; 223 if (thoff < sizeof(struct tcphdr) || 224 thoff + iphlen > ntohs(ip->ip_len)) { 225 tcpstat.tcps_rcvbadoff++; 226 goto fail; 227 } 228 if (m->m_len < hoff + iphlen + thoff) { 229 m = m_pullup(m, hoff + iphlen + thoff); 230 if (m == NULL) { 231 tcpstat.tcps_rcvshort++; 232 goto fail; 233 } 234 } 235 break; 236 case IPPROTO_UDP: 237 if (iplen < iphlen + sizeof(struct udphdr)) { 238 ++udp_stat.udps_hdrops; 239 goto fail; 240 } 241 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) { 242 m = m_pullup(m, hoff + iphlen + sizeof(struct udphdr)); 243 if (m == NULL) { 244 udp_stat.udps_hdrops++; 245 goto fail; 246 } 247 } 248 break; 249 default: 250 ipcheckonly: 251 if (iplen < iphlen) { 252 ++ipstat.ips_badlen; 253 goto fail; 254 } 255 break; 256 } 257 258 m->m_flags |= M_LENCHECKED; 259 *mp = m; 260 return TRUE; 261 262 fail: 263 if (m != NULL) 264 m_freem(m); 265 *mp = NULL; 266 return FALSE; 267 } 268 269 /* 270 * Assign a protocol processing thread to a packet. The IP header is at 271 * offset (hoff) in the packet (i.e. the mac header might still be intact). 272 * 273 * This function can blow away the mbuf if the packet is malformed. 274 */ 275 void 276 ip_hashfn(struct mbuf **mptr, int hoff, int dir) 277 { 278 struct ip *ip; 279 int iphlen; 280 struct tcphdr *th; 281 struct udphdr *uh; 282 struct mbuf *m; 283 int hash; 284 285 if (!ip_lengthcheck(mptr, hoff)) 286 return; 287 288 m = *mptr; 289 ip = mtodoff(m, struct ip *, hoff); 290 iphlen = ip->ip_hl << 2; 291 292 if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { 293 hash = toeplitz_hash(toeplitz_rawhash_addr( 294 ip->ip_src.s_addr, ip->ip_dst.s_addr)); 295 goto back; 296 } 297 298 switch (ip->ip_p) { 299 case IPPROTO_TCP: 300 th = (struct tcphdr *)((caddr_t)ip + iphlen); 301 hash = INP_MPORT_HASH_TCP(ip->ip_src.s_addr, ip->ip_dst.s_addr, 302 th->th_sport, th->th_dport); 303 break; 304 305 case IPPROTO_UDP: 306 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 307 /* XXX handle multicast on CPU0 for now */ 308 hash = 0; 309 break; 310 } 311 uh = (struct udphdr *)((caddr_t)ip + iphlen); 312 hash = INP_MPORT_HASH_UDP(ip->ip_src.s_addr, ip->ip_dst.s_addr, 313 uh->uh_sport, uh->uh_dport); 314 break; 315 316 default: 317 hash = 0; 318 break; 319 } 320 back: 321 m->m_flags |= M_HASH; 322 m->m_pkthdr.hash = hash; 323 } 324 325 void 326 ip_hashfn_in(struct mbuf **mptr, int hoff) 327 { 328 ip_hashfn(mptr, hoff, IP_MPORT_IN); 329 } 330 331 /* 332 * Verify and adjust the hash value of the packet. 333 * 334 * Unlike ip_hashfn(), the packet content is not accessed. The packet info 335 * (pi) and the hash of the packet (m_pkthdr.hash) is used instead. 336 * 337 * Caller has already made sure that m_pkthdr.hash is valid, i.e. m_flags 338 * has M_HASH set. 339 */ 340 void 341 ip_hashcheck(struct mbuf *m, const struct pktinfo *pi) 342 { 343 KASSERT((m->m_flags & M_HASH), ("no valid packet hash")); 344 345 /* 346 * XXX generic packet handling defrag on CPU 0 for now. 347 */ 348 if (pi->pi_flags & PKTINFO_FLAG_FRAG) { 349 m->m_pkthdr.hash = 0; 350 return; 351 } 352 353 switch (pi->pi_l3proto) { 354 case IPPROTO_TCP: 355 case IPPROTO_UDP: 356 break; 357 358 default: 359 /* Let software calculate the hash */ 360 m->m_flags &= ~M_HASH; 361 break; 362 } 363 } 364 365 /* 366 * This is used to map a socket to a message port for sendmsg() and friends. 367 * It is not called for any other purpose. In the case of TCP we just return 368 * the port already installed in the socket. 369 */ 370 lwkt_port_t 371 tcp_soport(struct socket *so, struct sockaddr *nam, 372 struct mbuf **dummy __unused) 373 { 374 return(so->so_port); 375 } 376 377 /* 378 * Used to route icmp messages to the proper protocol thread for ctlinput 379 * operation. 380 */ 381 lwkt_port_t 382 tcp_ctlport(int cmd, struct sockaddr *sa, void *vip) 383 { 384 struct ip *ip = vip; 385 struct tcphdr *th; 386 struct in_addr faddr; 387 int cpu; 388 389 faddr = ((struct sockaddr_in *)sa)->sin_addr; 390 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 391 return(NULL); 392 if (ip == NULL || PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { 393 /* 394 * A new message will be allocated later to save necessary 395 * information and will be forwarded to all network protocol 396 * threads in the following way: 397 * 398 * (the the thread owns the msgport that we return here) 399 * netisr0 <--+ 400 * | | 401 * | | 402 * | | 403 * +-------+ 404 * sendmsg 405 * [msg is kmalloc()ed] 406 * 407 * 408 * Later on, when the msg is received by netisr0: 409 * 410 * forwardmsg forwardmsg 411 * netisr0 ---------> netisr1 ---------> netisrN 412 * [msg is kfree()ed] 413 */ 414 return cpu0_ctlport(cmd, sa, vip); 415 } else { 416 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 417 cpu = tcp_addrcpu(faddr.s_addr, th->th_dport, 418 ip->ip_src.s_addr, th->th_sport); 419 } 420 return(netisr_cpuport(cpu)); 421 } 422 423 lwkt_port_t 424 tcp_addrport(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) 425 { 426 return(netisr_cpuport(tcp_addrcpu(faddr, fport, laddr, lport))); 427 } 428 429 lwkt_port_t 430 tcp_addrport0(void) 431 { 432 return(netisr_cpuport(0)); 433 } 434 435 lwkt_port_t 436 udp_addrport(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) 437 { 438 return(netisr_cpuport(udp_addrcpu(faddr, fport, laddr, lport))); 439 } 440 441 /* 442 * Used to route icmp messages to the proper protocol thread for ctlinput 443 * operation. 444 */ 445 lwkt_port_t 446 udp_ctlport(int cmd, struct sockaddr *sa, void *vip) 447 { 448 struct ip *ip = vip; 449 struct udphdr *uh; 450 struct in_addr faddr; 451 int cpu; 452 453 faddr = ((struct sockaddr_in *)sa)->sin_addr; 454 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 455 return(NULL); 456 if (ip == NULL || PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { 457 /* 458 * See the comment in tcp_ctlport. 459 */ 460 return cpu0_ctlport(cmd, sa, vip); 461 } else { 462 uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 463 464 cpu = udp_addrcpu(faddr.s_addr, ip->ip_src.s_addr, 465 uh->uh_dport, uh->uh_sport); 466 } 467 return (netisr_cpuport(cpu)); 468 } 469 470 struct lwkt_port * 471 tcp_initport(void) 472 { 473 return netisr_cpuport(mycpuid & ncpus2_mask); 474 } 475