1 /* $OpenBSD: raw_ip.c,v 1.159 2024/04/17 20:48:51 bluhm Exp $ */ 2 /* $NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/protosw.h> 76 #include <sys/socketvar.h> 77 78 #include <net/if.h> 79 #include <net/if_var.h> 80 #include <net/route.h> 81 82 #include <netinet/in.h> 83 #include <netinet/ip.h> 84 #include <netinet/ip_mroute.h> 85 #include <netinet/ip_var.h> 86 #include <netinet/in_pcb.h> 87 #include <netinet/in_var.h> 88 #include <netinet/ip_icmp.h> 89 90 #include <net/pfvar.h> 91 92 #include "pf.h" 93 94 struct inpcbtable rawcbtable; 95 96 /* 97 * Nominal space allocated to a raw ip socket. 98 */ 99 #define RIPSNDQ 8192 100 #define RIPRCVQ 8192 101 102 /* 103 * Raw interface to IP protocol. 104 */ 105 106 const struct pr_usrreqs rip_usrreqs = { 107 .pru_attach = rip_attach, 108 .pru_detach = rip_detach, 109 .pru_lock = rip_lock, 110 .pru_unlock = rip_unlock, 111 .pru_locked = rip_locked, 112 .pru_bind = rip_bind, 113 .pru_connect = rip_connect, 114 .pru_disconnect = rip_disconnect, 115 .pru_shutdown = rip_shutdown, 116 .pru_send = rip_send, 117 .pru_control = in_control, 118 .pru_sockaddr = in_sockaddr, 119 .pru_peeraddr = in_peeraddr, 120 }; 121 122 /* 123 * Initialize raw connection block q. 124 */ 125 void 126 rip_init(void) 127 { 128 in_pcbinit(&rawcbtable, 1); 129 } 130 131 int 132 rip_input(struct mbuf **mp, int *offp, int proto, int af) 133 { 134 struct mbuf *m = *mp; 135 struct ip *ip = mtod(m, struct ip *); 136 struct inpcb *inp; 137 SIMPLEQ_HEAD(, inpcb) inpcblist; 138 struct in_addr *key; 139 struct counters_ref ref; 140 uint64_t *counters; 141 struct sockaddr_in ripsrc; 142 143 KASSERT(af == AF_INET); 144 145 memset(&ripsrc, 0, sizeof(ripsrc)); 146 ripsrc.sin_family = AF_INET; 147 ripsrc.sin_len = sizeof(ripsrc); 148 ripsrc.sin_addr = ip->ip_src; 149 150 key = &ip->ip_dst; 151 #if NPF > 0 152 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 153 struct pf_divert *divert; 154 155 divert = pf_find_divert(m); 156 KASSERT(divert != NULL); 157 switch (divert->type) { 158 case PF_DIVERT_TO: 159 key = &divert->addr.v4; 160 break; 161 case PF_DIVERT_REPLY: 162 break; 163 default: 164 panic("%s: unknown divert type %d, mbuf %p, divert %p", 165 __func__, divert->type, m, divert); 166 } 167 } 168 #endif 169 SIMPLEQ_INIT(&inpcblist); 170 rw_enter_write(&rawcbtable.inpt_notify); 171 mtx_enter(&rawcbtable.inpt_mtx); 172 TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { 173 KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); 174 175 /* 176 * Packet must not be inserted after disconnected wakeup 177 * call. To avoid race, check again when holding receive 178 * buffer mutex. 179 */ 180 if (ISSET(READ_ONCE(inp->inp_socket->so_rcv.sb_state), 181 SS_CANTRCVMORE)) 182 continue; 183 if (rtable_l2(inp->inp_rtableid) != 184 rtable_l2(m->m_pkthdr.ph_rtableid)) 185 continue; 186 187 if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p) 188 continue; 189 if (inp->inp_laddr.s_addr && 190 inp->inp_laddr.s_addr != key->s_addr) 191 continue; 192 if (inp->inp_faddr.s_addr && 193 inp->inp_faddr.s_addr != ip->ip_src.s_addr) 194 continue; 195 196 in_pcbref(inp); 197 SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); 198 } 199 mtx_leave(&rawcbtable.inpt_mtx); 200 201 if (SIMPLEQ_EMPTY(&inpcblist)) { 202 rw_exit_write(&rawcbtable.inpt_notify); 203 204 if (ip->ip_p != IPPROTO_ICMP) 205 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 206 0, 0); 207 else 208 m_freem(m); 209 210 counters = counters_enter(&ref, ipcounters); 211 counters[ips_noproto]++; 212 counters[ips_delivered]--; 213 counters_leave(&ref, ipcounters); 214 215 return IPPROTO_DONE; 216 } 217 218 while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { 219 struct mbuf *n, *opts = NULL; 220 221 SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); 222 if (SIMPLEQ_EMPTY(&inpcblist)) 223 n = m; 224 else 225 n = m_copym(m, 0, M_COPYALL, M_NOWAIT); 226 if (n != NULL) { 227 struct socket *so = inp->inp_socket; 228 int ret = 0; 229 230 if (inp->inp_flags & INP_CONTROLOPTS || 231 so->so_options & SO_TIMESTAMP) 232 ip_savecontrol(inp, &opts, ip, n); 233 234 mtx_enter(&so->so_rcv.sb_mtx); 235 if (!ISSET(inp->inp_socket->so_rcv.sb_state, 236 SS_CANTRCVMORE)) { 237 ret = sbappendaddr(so, &so->so_rcv, 238 sintosa(&ripsrc), n, opts); 239 } 240 mtx_leave(&so->so_rcv.sb_mtx); 241 242 if (ret == 0) { 243 m_freem(n); 244 m_freem(opts); 245 ipstat_inc(ips_noproto); 246 } else 247 sorwakeup(so); 248 } 249 in_pcbunref(inp); 250 } 251 rw_exit_write(&rawcbtable.inpt_notify); 252 253 return IPPROTO_DONE; 254 } 255 256 /* 257 * Generate IP header and pass packet to ip_output. 258 * Tack on options user may have setup with control call. 259 */ 260 int 261 rip_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr, 262 struct mbuf *control) 263 { 264 struct sockaddr_in *dst = satosin(dstaddr); 265 struct ip *ip; 266 struct inpcb *inp; 267 int flags, error; 268 269 inp = sotoinpcb(so); 270 flags = IP_ALLOWBROADCAST; 271 272 /* 273 * If the user handed us a complete IP packet, use it. 274 * Otherwise, allocate an mbuf for a header and fill it in. 275 */ 276 if ((inp->inp_flags & INP_HDRINCL) == 0) { 277 if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) { 278 m_freem(m); 279 return (EMSGSIZE); 280 } 281 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); 282 if (!m) 283 return (ENOBUFS); 284 ip = mtod(m, struct ip *); 285 ip->ip_tos = inp->inp_ip.ip_tos; 286 ip->ip_off = htons(0); 287 ip->ip_p = inp->inp_ip.ip_p; 288 ip->ip_len = htons(m->m_pkthdr.len); 289 ip->ip_src.s_addr = INADDR_ANY; 290 ip->ip_dst = dst->sin_addr; 291 ip->ip_ttl = inp->inp_ip.ip_ttl ? inp->inp_ip.ip_ttl : MAXTTL; 292 } else { 293 if (m->m_pkthdr.len > IP_MAXPACKET) { 294 m_freem(m); 295 return (EMSGSIZE); 296 } 297 298 m = rip_chkhdr(m, inp->inp_options); 299 if (m == NULL) 300 return (EINVAL); 301 302 ip = mtod(m, struct ip *); 303 if (ip->ip_id == 0) 304 ip->ip_id = htons(ip_randomid()); 305 dst->sin_addr = ip->ip_dst; 306 307 /* XXX prevent ip_output from overwriting header fields */ 308 flags |= IP_RAWOUTPUT; 309 ipstat_inc(ips_rawout); 310 } 311 312 if (ip->ip_src.s_addr == INADDR_ANY) { 313 error = in_pcbselsrc(&ip->ip_src, dst, inp); 314 if (error != 0) 315 return (error); 316 } 317 318 #ifdef INET6 319 /* 320 * A thought: Even though raw IP shouldn't be able to set IPv6 321 * multicast options, if it does, the last parameter to 322 * ip_output should be guarded against v6/v4 problems. 323 */ 324 #endif 325 /* force routing table */ 326 m->m_pkthdr.ph_rtableid = inp->inp_rtableid; 327 328 #if NPF > 0 329 if (inp->inp_socket->so_state & SS_ISCONNECTED && 330 ip->ip_p != IPPROTO_ICMP) 331 pf_mbuf_link_inpcb(m, inp); 332 #endif 333 334 error = ip_output(m, inp->inp_options, &inp->inp_route, flags, 335 inp->inp_moptions, &inp->inp_seclevel, 0); 336 return (error); 337 } 338 339 struct mbuf * 340 rip_chkhdr(struct mbuf *m, struct mbuf *options) 341 { 342 struct ip *ip; 343 int hlen, opt, optlen, cnt; 344 u_char *cp; 345 346 if (m->m_pkthdr.len < sizeof(struct ip)) { 347 m_freem(m); 348 return NULL; 349 } 350 351 m = m_pullup(m, sizeof (struct ip)); 352 if (m == NULL) 353 return NULL; 354 355 ip = mtod(m, struct ip *); 356 hlen = ip->ip_hl << 2; 357 358 /* Don't allow packet length sizes that will crash. */ 359 if (hlen < sizeof (struct ip) || 360 ntohs(ip->ip_len) < hlen || 361 ntohs(ip->ip_len) != m->m_pkthdr.len) { 362 m_freem(m); 363 return NULL; 364 } 365 m = m_pullup(m, hlen); 366 if (m == NULL) 367 return NULL; 368 369 ip = mtod(m, struct ip *); 370 371 if (ip->ip_v != IPVERSION) { 372 m_freem(m); 373 return NULL; 374 } 375 376 /* 377 * Don't allow both user specified and setsockopt options. 378 * If options are present verify them. 379 */ 380 if (hlen != sizeof(struct ip)) { 381 if (options) { 382 m_freem(m); 383 return NULL; 384 } else { 385 cp = (u_char *)(ip + 1); 386 cnt = hlen - sizeof(struct ip); 387 for (; cnt > 0; cnt -= optlen, cp += optlen) { 388 opt = cp[IPOPT_OPTVAL]; 389 if (opt == IPOPT_EOL) 390 break; 391 if (opt == IPOPT_NOP) 392 optlen = 1; 393 else { 394 if (cnt < IPOPT_OLEN + sizeof(*cp)) { 395 m_freem(m); 396 return NULL; 397 } 398 optlen = cp[IPOPT_OLEN]; 399 if (optlen < IPOPT_OLEN + sizeof(*cp) || 400 optlen > cnt) { 401 m_freem(m); 402 return NULL; 403 } 404 } 405 } 406 } 407 } 408 409 return m; 410 } 411 412 /* 413 * Raw IP socket option processing. 414 */ 415 int 416 rip_ctloutput(int op, struct socket *so, int level, int optname, 417 struct mbuf *m) 418 { 419 struct inpcb *inp = sotoinpcb(so); 420 int error; 421 422 if (level != IPPROTO_IP) 423 return (EINVAL); 424 425 switch (optname) { 426 427 case IP_HDRINCL: 428 error = 0; 429 if (op == PRCO_SETOPT) { 430 if (m == NULL || m->m_len < sizeof (int)) 431 error = EINVAL; 432 else if (*mtod(m, int *)) 433 inp->inp_flags |= INP_HDRINCL; 434 else 435 inp->inp_flags &= ~INP_HDRINCL; 436 } else { 437 m->m_len = sizeof(int); 438 *mtod(m, int *) = inp->inp_flags & INP_HDRINCL; 439 } 440 return (error); 441 442 case MRT_INIT: 443 case MRT_DONE: 444 case MRT_ADD_VIF: 445 case MRT_DEL_VIF: 446 case MRT_ADD_MFC: 447 case MRT_DEL_MFC: 448 case MRT_VERSION: 449 case MRT_ASSERT: 450 case MRT_API_SUPPORT: 451 case MRT_API_CONFIG: 452 #ifdef MROUTING 453 switch (op) { 454 case PRCO_SETOPT: 455 error = ip_mrouter_set(so, optname, m); 456 break; 457 case PRCO_GETOPT: 458 error = ip_mrouter_get(so, optname, m); 459 break; 460 default: 461 error = EINVAL; 462 break; 463 } 464 return (error); 465 #else 466 return (EOPNOTSUPP); 467 #endif 468 } 469 return (ip_ctloutput(op, so, level, optname, m)); 470 } 471 472 u_long rip_sendspace = RIPSNDQ; 473 u_long rip_recvspace = RIPRCVQ; 474 475 int 476 rip_attach(struct socket *so, int proto, int wait) 477 { 478 struct inpcb *inp; 479 int error; 480 481 if (so->so_pcb) 482 panic("rip_attach"); 483 if ((so->so_state & SS_PRIV) == 0) 484 return EACCES; 485 if (proto < 0 || proto >= IPPROTO_MAX) 486 return EPROTONOSUPPORT; 487 488 if ((error = soreserve(so, rip_sendspace, rip_recvspace))) 489 return error; 490 NET_ASSERT_LOCKED(); 491 if ((error = in_pcballoc(so, &rawcbtable, wait))) 492 return error; 493 inp = sotoinpcb(so); 494 inp->inp_ip.ip_p = proto; 495 return 0; 496 } 497 498 int 499 rip_detach(struct socket *so) 500 { 501 struct inpcb *inp = sotoinpcb(so); 502 503 soassertlocked(so); 504 505 if (inp == NULL) 506 return (EINVAL); 507 508 #ifdef MROUTING 509 if (so == ip_mrouter[inp->inp_rtableid]) 510 ip_mrouter_done(so); 511 #endif 512 in_pcbdetach(inp); 513 514 return (0); 515 } 516 517 void 518 rip_lock(struct socket *so) 519 { 520 struct inpcb *inp = sotoinpcb(so); 521 522 NET_ASSERT_LOCKED(); 523 mtx_enter(&inp->inp_mtx); 524 } 525 526 void 527 rip_unlock(struct socket *so) 528 { 529 struct inpcb *inp = sotoinpcb(so); 530 531 NET_ASSERT_LOCKED(); 532 mtx_leave(&inp->inp_mtx); 533 } 534 535 int 536 rip_locked(struct socket *so) 537 { 538 struct inpcb *inp = sotoinpcb(so); 539 540 return mtx_owned(&inp->inp_mtx); 541 } 542 543 int 544 rip_bind(struct socket *so, struct mbuf *nam, struct proc *p) 545 { 546 struct inpcb *inp = sotoinpcb(so); 547 struct sockaddr_in *addr; 548 int error; 549 550 soassertlocked(so); 551 552 if ((error = in_nam2sin(nam, &addr))) 553 return (error); 554 555 if (!((so->so_options & SO_BINDANY) || 556 addr->sin_addr.s_addr == INADDR_ANY || 557 addr->sin_addr.s_addr == INADDR_BROADCAST || 558 in_broadcast(addr->sin_addr, inp->inp_rtableid) || 559 ifa_ifwithaddr(sintosa(addr), inp->inp_rtableid))) 560 return (EADDRNOTAVAIL); 561 562 mtx_enter(&rawcbtable.inpt_mtx); 563 inp->inp_laddr = addr->sin_addr; 564 mtx_leave(&rawcbtable.inpt_mtx); 565 566 return (0); 567 } 568 569 int 570 rip_connect(struct socket *so, struct mbuf *nam) 571 { 572 struct inpcb *inp = sotoinpcb(so); 573 struct sockaddr_in *addr; 574 int error; 575 576 soassertlocked(so); 577 578 if ((error = in_nam2sin(nam, &addr))) 579 return (error); 580 581 mtx_enter(&rawcbtable.inpt_mtx); 582 inp->inp_faddr = addr->sin_addr; 583 mtx_leave(&rawcbtable.inpt_mtx); 584 soisconnected(so); 585 586 return (0); 587 } 588 589 int 590 rip_disconnect(struct socket *so) 591 { 592 struct inpcb *inp = sotoinpcb(so); 593 594 soassertlocked(so); 595 596 if ((so->so_state & SS_ISCONNECTED) == 0) 597 return (ENOTCONN); 598 599 soisdisconnected(so); 600 mtx_enter(&rawcbtable.inpt_mtx); 601 inp->inp_faddr.s_addr = INADDR_ANY; 602 mtx_leave(&rawcbtable.inpt_mtx); 603 604 return (0); 605 } 606 607 int 608 rip_shutdown(struct socket *so) 609 { 610 /* 611 * Mark the connection as being incapable of further input. 612 */ 613 614 soassertlocked(so); 615 socantsendmore(so); 616 617 return (0); 618 } 619 620 int 621 rip_send(struct socket *so, struct mbuf *m, struct mbuf *nam, 622 struct mbuf *control) 623 { 624 struct inpcb *inp = sotoinpcb(so); 625 struct sockaddr_in dst; 626 int error; 627 628 soassertlocked(so); 629 630 /* 631 * Ship a packet out. The appropriate raw output 632 * routine handles any massaging necessary. 633 */ 634 memset(&dst, 0, sizeof(dst)); 635 dst.sin_family = AF_INET; 636 dst.sin_len = sizeof(dst); 637 if (so->so_state & SS_ISCONNECTED) { 638 if (nam) { 639 error = EISCONN; 640 goto out; 641 } 642 dst.sin_addr = inp->inp_faddr; 643 } else { 644 struct sockaddr_in *addr; 645 646 if (nam == NULL) { 647 error = ENOTCONN; 648 goto out; 649 } 650 if ((error = in_nam2sin(nam, &addr))) 651 goto out; 652 dst.sin_addr = addr->sin_addr; 653 } 654 #ifdef IPSEC 655 /* XXX Find an IPsec TDB */ 656 #endif 657 error = rip_output(m, so, sintosa(&dst), NULL); 658 m = NULL; 659 660 out: 661 m_freem(control); 662 m_freem(m); 663 664 return (error); 665 } 666