xref: /openbsd/sys/netinet/raw_ip.c (revision 261a77c2)
1 /*	$OpenBSD: raw_ip.c,v 1.156 2024/02/11 18:14:26 mvs Exp $	*/
2 /*	$NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  *	This product includes software developed by the University of
46  *	California, Berkeley and its contributors.
47  *	This product includes software developed at the Information
48  *	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/protosw.h>
76 #include <sys/socketvar.h>
77 
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/route.h>
81 
82 #include <netinet/in.h>
83 #include <netinet/ip.h>
84 #include <netinet/ip_mroute.h>
85 #include <netinet/ip_var.h>
86 #include <netinet/in_pcb.h>
87 #include <netinet/in_var.h>
88 #include <netinet/ip_icmp.h>
89 
90 #include <net/pfvar.h>
91 
92 #include "pf.h"
93 
94 struct inpcbtable rawcbtable;
95 
/*
 * Default send and receive buffer reservations, in bytes, for a raw
 * IP socket.  They seed rip_sendspace and rip_recvspace.
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192
101 
102 /*
103  * Raw interface to IP protocol.
104  */
105 
106 const struct pr_usrreqs rip_usrreqs = {
107 	.pru_attach	= rip_attach,
108 	.pru_detach	= rip_detach,
109 	.pru_lock	= rip_lock,
110 	.pru_unlock	= rip_unlock,
111 	.pru_locked	= rip_locked,
112 	.pru_bind	= rip_bind,
113 	.pru_connect	= rip_connect,
114 	.pru_disconnect	= rip_disconnect,
115 	.pru_shutdown	= rip_shutdown,
116 	.pru_send	= rip_send,
117 	.pru_control	= in_control,
118 	.pru_sockaddr	= in_sockaddr,
119 	.pru_peeraddr	= in_peeraddr,
120 };
121 
122 /*
123  * Initialize raw connection block q.
124  */
125 void
126 rip_init(void)
127 {
128 	in_pcbinit(&rawcbtable, 1);
129 }
130 
/* Validates a user-built IP header; defined after rip_output() below. */
struct mbuf	*rip_chkhdr(struct mbuf *, struct mbuf *);
132 
133 int
134 rip_input(struct mbuf **mp, int *offp, int proto, int af)
135 {
136 	struct mbuf *m = *mp;
137 	struct ip *ip = mtod(m, struct ip *);
138 	struct inpcb *inp;
139 	SIMPLEQ_HEAD(, inpcb) inpcblist;
140 	struct in_addr *key;
141 	struct counters_ref ref;
142 	uint64_t *counters;
143 	struct sockaddr_in ripsrc;
144 
145 	KASSERT(af == AF_INET);
146 
147 	memset(&ripsrc, 0, sizeof(ripsrc));
148 	ripsrc.sin_family = AF_INET;
149 	ripsrc.sin_len = sizeof(ripsrc);
150 	ripsrc.sin_addr = ip->ip_src;
151 
152 	key = &ip->ip_dst;
153 #if NPF > 0
154 	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
155 		struct pf_divert *divert;
156 
157 		divert = pf_find_divert(m);
158 		KASSERT(divert != NULL);
159 		switch (divert->type) {
160 		case PF_DIVERT_TO:
161 			key = &divert->addr.v4;
162 			break;
163 		case PF_DIVERT_REPLY:
164 			break;
165 		default:
166 			panic("%s: unknown divert type %d, mbuf %p, divert %p",
167 			    __func__, divert->type, m, divert);
168 		}
169 	}
170 #endif
171 	SIMPLEQ_INIT(&inpcblist);
172 	rw_enter_write(&rawcbtable.inpt_notify);
173 	mtx_enter(&rawcbtable.inpt_mtx);
174 	TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
175 		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
176 
177 		if (inp->inp_socket->so_rcv.sb_state & SS_CANTRCVMORE)
178 			continue;
179 		if (rtable_l2(inp->inp_rtableid) !=
180 		    rtable_l2(m->m_pkthdr.ph_rtableid))
181 			continue;
182 
183 		if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p)
184 			continue;
185 		if (inp->inp_laddr.s_addr &&
186 		    inp->inp_laddr.s_addr != key->s_addr)
187 			continue;
188 		if (inp->inp_faddr.s_addr &&
189 		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
190 			continue;
191 
192 		in_pcbref(inp);
193 		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
194 	}
195 	mtx_leave(&rawcbtable.inpt_mtx);
196 
197 	if (SIMPLEQ_EMPTY(&inpcblist)) {
198 		rw_exit_write(&rawcbtable.inpt_notify);
199 
200 		if (ip->ip_p != IPPROTO_ICMP)
201 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
202 			    0, 0);
203 		else
204 			m_freem(m);
205 
206 		counters = counters_enter(&ref, ipcounters);
207 		counters[ips_noproto]++;
208 		counters[ips_delivered]--;
209 		counters_leave(&ref, ipcounters);
210 
211 		return IPPROTO_DONE;
212 	}
213 
214 	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
215 		struct mbuf *n, *opts = NULL;
216 
217 		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
218 		if (SIMPLEQ_EMPTY(&inpcblist))
219 			n = m;
220 		else
221 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
222 		if (n != NULL) {
223 			struct socket *so = inp->inp_socket;
224 			int ret;
225 
226 			if (inp->inp_flags & INP_CONTROLOPTS ||
227 			    so->so_options & SO_TIMESTAMP)
228 				ip_savecontrol(inp, &opts, ip, n);
229 
230 			mtx_enter(&so->so_rcv.sb_mtx);
231 			ret = sbappendaddr(so, &so->so_rcv,
232 			    sintosa(&ripsrc), n, opts);
233 			mtx_leave(&so->so_rcv.sb_mtx);
234 
235 			if (ret == 0) {
236 				/* should notify about lost packet */
237 				m_freem(n);
238 				m_freem(opts);
239 			} else
240 				sorwakeup(so);
241 		}
242 		in_pcbunref(inp);
243 	}
244 	rw_exit_write(&rawcbtable.inpt_notify);
245 
246 	return IPPROTO_DONE;
247 }
248 
249 /*
250  * Generate IP header and pass packet to ip_output.
251  * Tack on options user may have setup with control call.
252  */
253 int
254 rip_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr,
255     struct mbuf *control)
256 {
257 	struct sockaddr_in *dst = satosin(dstaddr);
258 	struct ip *ip;
259 	struct inpcb *inp;
260 	int flags, error;
261 
262 	inp = sotoinpcb(so);
263 	flags = IP_ALLOWBROADCAST;
264 
265 	/*
266 	 * If the user handed us a complete IP packet, use it.
267 	 * Otherwise, allocate an mbuf for a header and fill it in.
268 	 */
269 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
270 		if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) {
271 			m_freem(m);
272 			return (EMSGSIZE);
273 		}
274 		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
275 		if (!m)
276 			return (ENOBUFS);
277 		ip = mtod(m, struct ip *);
278 		ip->ip_tos = inp->inp_ip.ip_tos;
279 		ip->ip_off = htons(0);
280 		ip->ip_p = inp->inp_ip.ip_p;
281 		ip->ip_len = htons(m->m_pkthdr.len);
282 		ip->ip_src.s_addr = INADDR_ANY;
283 		ip->ip_dst = dst->sin_addr;
284 		ip->ip_ttl = inp->inp_ip.ip_ttl ? inp->inp_ip.ip_ttl : MAXTTL;
285 	} else {
286 		if (m->m_pkthdr.len > IP_MAXPACKET) {
287 			m_freem(m);
288 			return (EMSGSIZE);
289 		}
290 
291 		m = rip_chkhdr(m, inp->inp_options);
292 		if (m == NULL)
293 			return (EINVAL);
294 
295 		ip = mtod(m, struct ip *);
296 		if (ip->ip_id == 0)
297 			ip->ip_id = htons(ip_randomid());
298 		dst->sin_addr = ip->ip_dst;
299 
300 		/* XXX prevent ip_output from overwriting header fields */
301 		flags |= IP_RAWOUTPUT;
302 		ipstat_inc(ips_rawout);
303 	}
304 
305 	if (ip->ip_src.s_addr == INADDR_ANY) {
306 		error = in_pcbselsrc(&ip->ip_src, dst, inp);
307 		if (error != 0)
308 			return (error);
309 	}
310 
311 #ifdef INET6
312 	/*
313 	 * A thought:  Even though raw IP shouldn't be able to set IPv6
314 	 *             multicast options, if it does, the last parameter to
315 	 *             ip_output should be guarded against v6/v4 problems.
316 	 */
317 #endif
318 	/* force routing table */
319 	m->m_pkthdr.ph_rtableid = inp->inp_rtableid;
320 
321 #if NPF > 0
322 	if (inp->inp_socket->so_state & SS_ISCONNECTED &&
323 	    ip->ip_p != IPPROTO_ICMP)
324 		pf_mbuf_link_inpcb(m, inp);
325 #endif
326 
327 	error = ip_output(m, inp->inp_options, &inp->inp_route, flags,
328 	    inp->inp_moptions, inp->inp_seclevel, 0);
329 	return (error);
330 }
331 
332 struct mbuf *
333 rip_chkhdr(struct mbuf *m, struct mbuf *options)
334 {
335 	struct ip *ip;
336 	int hlen, opt, optlen, cnt;
337 	u_char *cp;
338 
339 	if (m->m_pkthdr.len < sizeof(struct ip)) {
340 		m_freem(m);
341 		return NULL;
342 	}
343 
344 	m = m_pullup(m, sizeof (struct ip));
345 	if (m == NULL)
346 		return NULL;
347 
348 	ip = mtod(m, struct ip *);
349 	hlen = ip->ip_hl << 2;
350 
351 	/* Don't allow packet length sizes that will crash. */
352 	if (hlen < sizeof (struct ip) ||
353 	    ntohs(ip->ip_len) < hlen ||
354 	    ntohs(ip->ip_len) != m->m_pkthdr.len) {
355 		m_freem(m);
356 		return NULL;
357 	}
358 	m = m_pullup(m, hlen);
359 	if (m == NULL)
360 		return NULL;
361 
362 	ip = mtod(m, struct ip *);
363 
364 	if (ip->ip_v != IPVERSION) {
365 		m_freem(m);
366 		return NULL;
367 	}
368 
369 	/*
370 	 * Don't allow both user specified and setsockopt options.
371 	 * If options are present verify them.
372 	 */
373 	if (hlen != sizeof(struct ip)) {
374 		if (options) {
375 			m_freem(m);
376 			return NULL;
377 		} else {
378 			cp = (u_char *)(ip + 1);
379 			cnt = hlen - sizeof(struct ip);
380 			for (; cnt > 0; cnt -= optlen, cp += optlen) {
381 				opt = cp[IPOPT_OPTVAL];
382 				if (opt == IPOPT_EOL)
383 					break;
384 				if (opt == IPOPT_NOP)
385 					optlen = 1;
386 				else {
387 					if (cnt < IPOPT_OLEN + sizeof(*cp)) {
388 						m_freem(m);
389 						return NULL;
390 					}
391 					optlen = cp[IPOPT_OLEN];
392 					if (optlen < IPOPT_OLEN + sizeof(*cp) ||
393 					    optlen > cnt) {
394 						m_freem(m);
395 						return NULL;
396 					}
397 				}
398 			}
399 		}
400 	}
401 
402 	return m;
403 }
404 
405 /*
406  * Raw IP socket option processing.
407  */
408 int
409 rip_ctloutput(int op, struct socket *so, int level, int optname,
410     struct mbuf *m)
411 {
412 	struct inpcb *inp = sotoinpcb(so);
413 	int error;
414 
415 	if (level != IPPROTO_IP)
416 		return (EINVAL);
417 
418 	switch (optname) {
419 
420 	case IP_HDRINCL:
421 		error = 0;
422 		if (op == PRCO_SETOPT) {
423 			if (m == NULL || m->m_len < sizeof (int))
424 				error = EINVAL;
425 			else if (*mtod(m, int *))
426 				inp->inp_flags |= INP_HDRINCL;
427 			else
428 				inp->inp_flags &= ~INP_HDRINCL;
429 		} else {
430 			m->m_len = sizeof(int);
431 			*mtod(m, int *) = inp->inp_flags & INP_HDRINCL;
432 		}
433 		return (error);
434 
435 	case MRT_INIT:
436 	case MRT_DONE:
437 	case MRT_ADD_VIF:
438 	case MRT_DEL_VIF:
439 	case MRT_ADD_MFC:
440 	case MRT_DEL_MFC:
441 	case MRT_VERSION:
442 	case MRT_ASSERT:
443 	case MRT_API_SUPPORT:
444 	case MRT_API_CONFIG:
445 #ifdef MROUTING
446 		switch (op) {
447 		case PRCO_SETOPT:
448 			error = ip_mrouter_set(so, optname, m);
449 			break;
450 		case PRCO_GETOPT:
451 			error = ip_mrouter_get(so, optname, m);
452 			break;
453 		default:
454 			error = EINVAL;
455 			break;
456 		}
457 		return (error);
458 #else
459 		return (EOPNOTSUPP);
460 #endif
461 	}
462 	return (ip_ctloutput(op, so, level, optname, m));
463 }
464 
465 u_long	rip_sendspace = RIPSNDQ;
466 u_long	rip_recvspace = RIPRCVQ;
467 
468 int
469 rip_attach(struct socket *so, int proto, int wait)
470 {
471 	struct inpcb *inp;
472 	int error;
473 
474 	if (so->so_pcb)
475 		panic("rip_attach");
476 	if ((so->so_state & SS_PRIV) == 0)
477 		return EACCES;
478 	if (proto < 0 || proto >= IPPROTO_MAX)
479 		return EPROTONOSUPPORT;
480 
481 	if ((error = soreserve(so, rip_sendspace, rip_recvspace)))
482 		return error;
483 	NET_ASSERT_LOCKED();
484 	if ((error = in_pcballoc(so, &rawcbtable, wait)))
485 		return error;
486 	inp = sotoinpcb(so);
487 	inp->inp_ip.ip_p = proto;
488 	return 0;
489 }
490 
491 int
492 rip_detach(struct socket *so)
493 {
494 	struct inpcb *inp = sotoinpcb(so);
495 
496 	soassertlocked(so);
497 
498 	if (inp == NULL)
499 		return (EINVAL);
500 
501 #ifdef MROUTING
502 	if (so == ip_mrouter[inp->inp_rtableid])
503 		ip_mrouter_done(so);
504 #endif
505 	in_pcbdetach(inp);
506 
507 	return (0);
508 }
509 
510 void
511 rip_lock(struct socket *so)
512 {
513 	struct inpcb *inp = sotoinpcb(so);
514 
515 	NET_ASSERT_LOCKED();
516 	mtx_enter(&inp->inp_mtx);
517 }
518 
519 void
520 rip_unlock(struct socket *so)
521 {
522 	struct inpcb *inp = sotoinpcb(so);
523 
524 	NET_ASSERT_LOCKED();
525 	mtx_leave(&inp->inp_mtx);
526 }
527 
528 int
529 rip_locked(struct socket *so)
530 {
531 	struct inpcb *inp = sotoinpcb(so);
532 
533 	return mtx_owned(&inp->inp_mtx);
534 }
535 
536 int
537 rip_bind(struct socket *so, struct mbuf *nam, struct proc *p)
538 {
539 	struct inpcb *inp = sotoinpcb(so);
540 	struct sockaddr_in *addr;
541 	int error;
542 
543 	soassertlocked(so);
544 
545 	if ((error = in_nam2sin(nam, &addr)))
546 		return (error);
547 
548 	if (!((so->so_options & SO_BINDANY) ||
549 	    addr->sin_addr.s_addr == INADDR_ANY ||
550 	    addr->sin_addr.s_addr == INADDR_BROADCAST ||
551 	    in_broadcast(addr->sin_addr, inp->inp_rtableid) ||
552 	    ifa_ifwithaddr(sintosa(addr), inp->inp_rtableid)))
553 		return (EADDRNOTAVAIL);
554 
555 	mtx_enter(&rawcbtable.inpt_mtx);
556 	inp->inp_laddr = addr->sin_addr;
557 	mtx_leave(&rawcbtable.inpt_mtx);
558 
559 	return (0);
560 }
561 
562 int
563 rip_connect(struct socket *so, struct mbuf *nam)
564 {
565 	struct inpcb *inp = sotoinpcb(so);
566 	struct sockaddr_in *addr;
567 	int error;
568 
569 	soassertlocked(so);
570 
571 	if ((error = in_nam2sin(nam, &addr)))
572 		return (error);
573 
574 	mtx_enter(&rawcbtable.inpt_mtx);
575 	inp->inp_faddr = addr->sin_addr;
576 	mtx_leave(&rawcbtable.inpt_mtx);
577 	soisconnected(so);
578 
579 	return (0);
580 }
581 
582 int
583 rip_disconnect(struct socket *so)
584 {
585 	struct inpcb *inp = sotoinpcb(so);
586 
587 	soassertlocked(so);
588 
589 	if ((so->so_state & SS_ISCONNECTED) == 0)
590 		return (ENOTCONN);
591 
592 	soisdisconnected(so);
593 	mtx_enter(&rawcbtable.inpt_mtx);
594 	inp->inp_faddr.s_addr = INADDR_ANY;
595 	mtx_leave(&rawcbtable.inpt_mtx);
596 
597 	return (0);
598 }
599 
/*
 * Mark the socket as being incapable of further output.
 * Always returns 0.
 */
int
rip_shutdown(struct socket *so)
{
	soassertlocked(so);

	socantsendmore(so);

	return (0);
}
612 
613 int
614 rip_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
615     struct mbuf *control)
616 {
617 	struct inpcb *inp = sotoinpcb(so);
618 	struct sockaddr_in dst;
619 	int error;
620 
621 	soassertlocked(so);
622 
623 	/*
624 	 * Ship a packet out.  The appropriate raw output
625 	 * routine handles any massaging necessary.
626 	 */
627 	memset(&dst, 0, sizeof(dst));
628 	dst.sin_family = AF_INET;
629 	dst.sin_len = sizeof(dst);
630 	if (so->so_state & SS_ISCONNECTED) {
631 		if (nam) {
632 			error = EISCONN;
633 			goto out;
634 		}
635 		dst.sin_addr = inp->inp_faddr;
636 	} else {
637 		struct sockaddr_in *addr;
638 
639 		if (nam == NULL) {
640 			error = ENOTCONN;
641 			goto out;
642 		}
643 		if ((error = in_nam2sin(nam, &addr)))
644 			goto out;
645 		dst.sin_addr = addr->sin_addr;
646 	}
647 #ifdef IPSEC
648 	/* XXX Find an IPsec TDB */
649 #endif
650 	error = rip_output(m, so, sintosa(&dst), NULL);
651 	m = NULL;
652 
653 out:
654 	m_freem(control);
655 	m_freem(m);
656 
657 	return (error);
658 }
659