1 /* $OpenBSD: raw_ip.c,v 1.163 2025/01/01 13:44:22 bluhm Exp $ */
2 /* $NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
33 *
34 * NRL grants permission for redistribution and use in source and binary
35 * forms, with or without modification, of the software and documentation
36 * created at NRL provided that the following conditions are met:
37 *
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgements:
45 * This product includes software developed by the University of
46 * California, Berkeley and its contributors.
47 * This product includes software developed at the Information
48 * Technology Division, US Naval Research Laboratory.
49 * 4. Neither the name of the NRL nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64 *
65 * The views and conclusions contained in the software and documentation
66 * are those of the authors and should not be interpreted as representing
67 * official policies, either expressed or implied, of the US Naval
68 * Research Laboratory (NRL).
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/protosw.h>
76 #include <sys/socketvar.h>
77
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/route.h>
81
82 #include <netinet/in.h>
83 #include <netinet/ip.h>
84 #include <netinet/ip_mroute.h>
85 #include <netinet/ip_var.h>
86 #include <netinet/in_pcb.h>
87 #include <netinet/in_var.h>
88 #include <netinet/ip_icmp.h>
89
90 #include <net/pfvar.h>
91
92 #include "pf.h"
93
94 struct inpcbtable rawcbtable;
95
/*
 * Nominal send and receive queue space given to a raw IP socket.
 * These values seed rip_sendspace and rip_recvspace.
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192
101
102 /*
103 * Raw interface to IP protocol.
104 */
105
106 const struct pr_usrreqs rip_usrreqs = {
107 .pru_attach = rip_attach,
108 .pru_detach = rip_detach,
109 .pru_bind = rip_bind,
110 .pru_connect = rip_connect,
111 .pru_disconnect = rip_disconnect,
112 .pru_shutdown = rip_shutdown,
113 .pru_send = rip_send,
114 .pru_control = in_control,
115 .pru_sockaddr = in_sockaddr,
116 .pru_peeraddr = in_peeraddr,
117 };
118
/* Queue one packet on a raw socket's receive buffer; defined below. */
void	rip_sbappend(struct inpcb *inp, struct mbuf *m, struct ip *ip,
	    struct sockaddr_in *ripsrc);
121
122 /*
123 * Initialize raw connection block q.
124 */
125 void
rip_init(void)126 rip_init(void)
127 {
128 in_pcbinit(&rawcbtable, 1);
129 }
130
131 int
rip_input(struct mbuf ** mp,int * offp,int proto,int af)132 rip_input(struct mbuf **mp, int *offp, int proto, int af)
133 {
134 struct mbuf *m = *mp;
135 struct ip *ip = mtod(m, struct ip *);
136 struct inpcb_iterator iter = { .inp_table = NULL };
137 struct inpcb *inp, *last;
138 struct in_addr *key;
139 struct sockaddr_in ripsrc;
140
141 KASSERT(af == AF_INET);
142
143 memset(&ripsrc, 0, sizeof(ripsrc));
144 ripsrc.sin_family = AF_INET;
145 ripsrc.sin_len = sizeof(ripsrc);
146 ripsrc.sin_addr = ip->ip_src;
147
148 key = &ip->ip_dst;
149 #if NPF > 0
150 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
151 struct pf_divert *divert;
152
153 divert = pf_find_divert(m);
154 KASSERT(divert != NULL);
155 switch (divert->type) {
156 case PF_DIVERT_TO:
157 key = &divert->addr.v4;
158 break;
159 case PF_DIVERT_REPLY:
160 break;
161 default:
162 panic("%s: unknown divert type %d, mbuf %p, divert %p",
163 __func__, divert->type, m, divert);
164 }
165 }
166 #endif
167 mtx_enter(&rawcbtable.inpt_mtx);
168 last = inp = NULL;
169 while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) {
170 KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
171
172 /*
173 * Packet must not be inserted after disconnected wakeup
174 * call. To avoid race, check again when holding receive
175 * buffer mutex.
176 */
177 if (ISSET(READ_ONCE(inp->inp_socket->so_rcv.sb_state),
178 SS_CANTRCVMORE))
179 continue;
180 if (rtable_l2(inp->inp_rtableid) !=
181 rtable_l2(m->m_pkthdr.ph_rtableid))
182 continue;
183
184 if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p)
185 continue;
186 if (inp->inp_laddr.s_addr &&
187 inp->inp_laddr.s_addr != key->s_addr)
188 continue;
189 if (inp->inp_faddr.s_addr &&
190 inp->inp_faddr.s_addr != ip->ip_src.s_addr)
191 continue;
192
193 if (last != NULL) {
194 struct mbuf *n;
195
196 mtx_leave(&rawcbtable.inpt_mtx);
197
198 n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
199 if (n != NULL)
200 rip_sbappend(last, n, ip, &ripsrc);
201 in_pcbunref(last);
202
203 mtx_enter(&rawcbtable.inpt_mtx);
204 }
205 last = in_pcbref(inp);
206 }
207 mtx_leave(&rawcbtable.inpt_mtx);
208
209 if (last == NULL) {
210 struct counters_ref ref;
211 uint64_t *counters;
212
213 if (ip->ip_p == IPPROTO_ICMP) {
214 m_freem(m);
215 } else {
216 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
217 0, 0);
218 }
219 counters = counters_enter(&ref, ipcounters);
220 counters[ips_noproto]++;
221 counters[ips_delivered]--;
222 counters_leave(&ref, ipcounters);
223
224 return IPPROTO_DONE;
225 }
226
227 rip_sbappend(last, m, ip, &ripsrc);
228 in_pcbunref(last);
229
230 return IPPROTO_DONE;
231 }
232
233 void
rip_sbappend(struct inpcb * inp,struct mbuf * m,struct ip * ip,struct sockaddr_in * ripsrc)234 rip_sbappend(struct inpcb *inp, struct mbuf *m, struct ip *ip,
235 struct sockaddr_in *ripsrc)
236 {
237 struct socket *so = inp->inp_socket;
238 struct mbuf *opts = NULL;
239 int ret = 0;
240
241 if (inp->inp_flags & INP_CONTROLOPTS || so->so_options & SO_TIMESTAMP)
242 ip_savecontrol(inp, &opts, ip, m);
243
244 mtx_enter(&so->so_rcv.sb_mtx);
245 if (!ISSET(inp->inp_socket->so_rcv.sb_state, SS_CANTRCVMORE))
246 ret = sbappendaddr(so, &so->so_rcv, sintosa(ripsrc), m, opts);
247 mtx_leave(&so->so_rcv.sb_mtx);
248
249 if (ret == 0) {
250 m_freem(m);
251 m_freem(opts);
252 ipstat_inc(ips_noproto);
253 } else
254 sorwakeup(so);
255 }
256
257 /*
258 * Generate IP header and pass packet to ip_output.
259 * Tack on options user may have setup with control call.
260 */
261 int
rip_output(struct mbuf * m,struct socket * so,struct sockaddr * dstaddr,struct mbuf * control)262 rip_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr,
263 struct mbuf *control)
264 {
265 struct sockaddr_in *dst = satosin(dstaddr);
266 struct ip *ip;
267 struct inpcb *inp;
268 int flags, error;
269
270 inp = sotoinpcb(so);
271 flags = IP_ALLOWBROADCAST;
272
273 /*
274 * If the user handed us a complete IP packet, use it.
275 * Otherwise, allocate an mbuf for a header and fill it in.
276 */
277 if ((inp->inp_flags & INP_HDRINCL) == 0) {
278 if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) {
279 m_freem(m);
280 return (EMSGSIZE);
281 }
282 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
283 if (!m)
284 return (ENOBUFS);
285 ip = mtod(m, struct ip *);
286 ip->ip_tos = inp->inp_ip.ip_tos;
287 ip->ip_off = htons(0);
288 ip->ip_p = inp->inp_ip.ip_p;
289 ip->ip_len = htons(m->m_pkthdr.len);
290 ip->ip_src.s_addr = INADDR_ANY;
291 ip->ip_dst = dst->sin_addr;
292 ip->ip_ttl = inp->inp_ip.ip_ttl ? inp->inp_ip.ip_ttl : MAXTTL;
293 } else {
294 if (m->m_pkthdr.len > IP_MAXPACKET) {
295 m_freem(m);
296 return (EMSGSIZE);
297 }
298
299 m = rip_chkhdr(m, inp->inp_options);
300 if (m == NULL)
301 return (EINVAL);
302
303 ip = mtod(m, struct ip *);
304 if (ip->ip_id == 0)
305 ip->ip_id = htons(ip_randomid());
306 dst->sin_addr = ip->ip_dst;
307
308 /* XXX prevent ip_output from overwriting header fields */
309 flags |= IP_RAWOUTPUT;
310 ipstat_inc(ips_rawout);
311 }
312
313 if (ip->ip_src.s_addr == INADDR_ANY) {
314 error = in_pcbselsrc(&ip->ip_src, dst, inp);
315 if (error != 0)
316 return (error);
317 }
318
319 #ifdef INET6
320 /*
321 * A thought: Even though raw IP shouldn't be able to set IPv6
322 * multicast options, if it does, the last parameter to
323 * ip_output should be guarded against v6/v4 problems.
324 */
325 #endif
326 /* force routing table */
327 m->m_pkthdr.ph_rtableid = inp->inp_rtableid;
328
329 #if NPF > 0
330 if (inp->inp_socket->so_state & SS_ISCONNECTED &&
331 ip->ip_p != IPPROTO_ICMP)
332 pf_mbuf_link_inpcb(m, inp);
333 #endif
334
335 error = ip_output(m, inp->inp_options, &inp->inp_route, flags,
336 inp->inp_moptions, &inp->inp_seclevel, 0);
337 return (error);
338 }
339
340 struct mbuf *
rip_chkhdr(struct mbuf * m,struct mbuf * options)341 rip_chkhdr(struct mbuf *m, struct mbuf *options)
342 {
343 struct ip *ip;
344 int hlen, opt, optlen, cnt;
345 u_char *cp;
346
347 if (m->m_pkthdr.len < sizeof(struct ip)) {
348 m_freem(m);
349 return NULL;
350 }
351
352 m = m_pullup(m, sizeof (struct ip));
353 if (m == NULL)
354 return NULL;
355
356 ip = mtod(m, struct ip *);
357 hlen = ip->ip_hl << 2;
358
359 /* Don't allow packet length sizes that will crash. */
360 if (hlen < sizeof (struct ip) ||
361 ntohs(ip->ip_len) < hlen ||
362 ntohs(ip->ip_len) != m->m_pkthdr.len) {
363 m_freem(m);
364 return NULL;
365 }
366 m = m_pullup(m, hlen);
367 if (m == NULL)
368 return NULL;
369
370 ip = mtod(m, struct ip *);
371
372 if (ip->ip_v != IPVERSION) {
373 m_freem(m);
374 return NULL;
375 }
376
377 /*
378 * Don't allow both user specified and setsockopt options.
379 * If options are present verify them.
380 */
381 if (hlen != sizeof(struct ip)) {
382 if (options) {
383 m_freem(m);
384 return NULL;
385 } else {
386 cp = (u_char *)(ip + 1);
387 cnt = hlen - sizeof(struct ip);
388 for (; cnt > 0; cnt -= optlen, cp += optlen) {
389 opt = cp[IPOPT_OPTVAL];
390 if (opt == IPOPT_EOL)
391 break;
392 if (opt == IPOPT_NOP)
393 optlen = 1;
394 else {
395 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
396 m_freem(m);
397 return NULL;
398 }
399 optlen = cp[IPOPT_OLEN];
400 if (optlen < IPOPT_OLEN + sizeof(*cp) ||
401 optlen > cnt) {
402 m_freem(m);
403 return NULL;
404 }
405 }
406 }
407 }
408 }
409
410 return m;
411 }
412
413 /*
414 * Raw IP socket option processing.
415 */
416 int
rip_ctloutput(int op,struct socket * so,int level,int optname,struct mbuf * m)417 rip_ctloutput(int op, struct socket *so, int level, int optname,
418 struct mbuf *m)
419 {
420 struct inpcb *inp = sotoinpcb(so);
421 int error;
422
423 if (level != IPPROTO_IP)
424 return (EINVAL);
425
426 switch (optname) {
427
428 case IP_HDRINCL:
429 error = 0;
430 if (op == PRCO_SETOPT) {
431 if (m == NULL || m->m_len < sizeof (int))
432 error = EINVAL;
433 else if (*mtod(m, int *))
434 inp->inp_flags |= INP_HDRINCL;
435 else
436 inp->inp_flags &= ~INP_HDRINCL;
437 } else {
438 m->m_len = sizeof(int);
439 *mtod(m, int *) = inp->inp_flags & INP_HDRINCL;
440 }
441 return (error);
442
443 case MRT_INIT:
444 case MRT_DONE:
445 case MRT_ADD_VIF:
446 case MRT_DEL_VIF:
447 case MRT_ADD_MFC:
448 case MRT_DEL_MFC:
449 case MRT_VERSION:
450 case MRT_ASSERT:
451 case MRT_API_SUPPORT:
452 case MRT_API_CONFIG:
453 #ifdef MROUTING
454 switch (op) {
455 case PRCO_SETOPT:
456 error = ip_mrouter_set(so, optname, m);
457 break;
458 case PRCO_GETOPT:
459 error = ip_mrouter_get(so, optname, m);
460 break;
461 default:
462 error = EINVAL;
463 break;
464 }
465 return (error);
466 #else
467 return (EOPNOTSUPP);
468 #endif
469 }
470 return (ip_ctloutput(op, so, level, optname, m));
471 }
472
473 u_long rip_sendspace = RIPSNDQ;
474 u_long rip_recvspace = RIPRCVQ;
475
476 int
rip_attach(struct socket * so,int proto,int wait)477 rip_attach(struct socket *so, int proto, int wait)
478 {
479 struct inpcb *inp;
480 int error;
481
482 if (so->so_pcb)
483 panic("rip_attach");
484 if ((so->so_state & SS_PRIV) == 0)
485 return EACCES;
486 if (proto < 0 || proto >= IPPROTO_MAX)
487 return EPROTONOSUPPORT;
488
489 if ((error = soreserve(so, rip_sendspace, rip_recvspace)))
490 return error;
491 NET_ASSERT_LOCKED();
492 if ((error = in_pcballoc(so, &rawcbtable, wait)))
493 return error;
494 inp = sotoinpcb(so);
495 inp->inp_ip.ip_p = proto;
496 return 0;
497 }
498
/*
 * Detach the PCB from a raw IP socket.  The socket must be locked.
 *
 * With MROUTING, ip_mrouter_done() is called first when this socket is
 * the one recorded in ip_mrouter[] for the PCB's routing table.
 *
 * Returns 0, or EINVAL when the socket has no PCB.
 */
int
rip_detach(struct socket *so)
{
	struct inpcb *inp;

	soassertlocked(so);

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (EINVAL);

#ifdef MROUTING
	if (ip_mrouter[inp->inp_rtableid] == so)
		ip_mrouter_done(so);
#endif
	in_pcbdetach(inp);

	return (0);
}
517
518 int
rip_bind(struct socket * so,struct mbuf * nam,struct proc * p)519 rip_bind(struct socket *so, struct mbuf *nam, struct proc *p)
520 {
521 struct inpcb *inp = sotoinpcb(so);
522 struct sockaddr_in *addr;
523 int error;
524
525 soassertlocked(so);
526
527 if ((error = in_nam2sin(nam, &addr)))
528 return (error);
529
530 if (!((so->so_options & SO_BINDANY) ||
531 addr->sin_addr.s_addr == INADDR_ANY ||
532 addr->sin_addr.s_addr == INADDR_BROADCAST ||
533 in_broadcast(addr->sin_addr, inp->inp_rtableid) ||
534 ifa_ifwithaddr(sintosa(addr), inp->inp_rtableid)))
535 return (EADDRNOTAVAIL);
536
537 mtx_enter(&rawcbtable.inpt_mtx);
538 inp->inp_laddr = addr->sin_addr;
539 mtx_leave(&rawcbtable.inpt_mtx);
540
541 return (0);
542 }
543
544 int
rip_connect(struct socket * so,struct mbuf * nam)545 rip_connect(struct socket *so, struct mbuf *nam)
546 {
547 struct inpcb *inp = sotoinpcb(so);
548 struct sockaddr_in *addr;
549 int error;
550
551 soassertlocked(so);
552
553 if ((error = in_nam2sin(nam, &addr)))
554 return (error);
555
556 mtx_enter(&rawcbtable.inpt_mtx);
557 inp->inp_faddr = addr->sin_addr;
558 mtx_leave(&rawcbtable.inpt_mtx);
559 soisconnected(so);
560
561 return (0);
562 }
563
564 int
rip_disconnect(struct socket * so)565 rip_disconnect(struct socket *so)
566 {
567 struct inpcb *inp = sotoinpcb(so);
568
569 soassertlocked(so);
570
571 if ((so->so_state & SS_ISCONNECTED) == 0)
572 return (ENOTCONN);
573
574 soisdisconnected(so);
575 mtx_enter(&rawcbtable.inpt_mtx);
576 inp->inp_faddr.s_addr = INADDR_ANY;
577 mtx_leave(&rawcbtable.inpt_mtx);
578
579 return (0);
580 }
581
/*
 * Shut down the sending side of a raw IP socket by calling
 * socantsendmore().  The socket must be locked.
 *
 * Always returns 0.
 */
int
rip_shutdown(struct socket *so)
{
	soassertlocked(so);

	/* Mark the socket as being incapable of further output. */
	socantsendmore(so);

	return (0);
}
594
595 int
rip_send(struct socket * so,struct mbuf * m,struct mbuf * nam,struct mbuf * control)596 rip_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
597 struct mbuf *control)
598 {
599 struct inpcb *inp = sotoinpcb(so);
600 struct sockaddr_in dst;
601 int error;
602
603 soassertlocked(so);
604
605 /*
606 * Ship a packet out. The appropriate raw output
607 * routine handles any massaging necessary.
608 */
609 memset(&dst, 0, sizeof(dst));
610 dst.sin_family = AF_INET;
611 dst.sin_len = sizeof(dst);
612 if (so->so_state & SS_ISCONNECTED) {
613 if (nam) {
614 error = EISCONN;
615 goto out;
616 }
617 dst.sin_addr = inp->inp_faddr;
618 } else {
619 struct sockaddr_in *addr;
620
621 if (nam == NULL) {
622 error = ENOTCONN;
623 goto out;
624 }
625 if ((error = in_nam2sin(nam, &addr)))
626 goto out;
627 dst.sin_addr = addr->sin_addr;
628 }
629 #ifdef IPSEC
630 /* XXX Find an IPsec TDB */
631 #endif
632 error = rip_output(m, so, sintosa(&dst), NULL);
633 m = NULL;
634
635 out:
636 m_freem(control);
637 m_freem(m);
638
639 return (error);
640 }
641