xref: /freebsd/sys/netinet/raw_ip.c (revision 2b833162)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1993
5  *	The Regents of the University of California.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include "opt_inet.h"
39 #include "opt_inet6.h"
40 #include "opt_ipsec.h"
41 #include "opt_route.h"
42 
43 #include <sys/param.h>
44 #include <sys/jail.h>
45 #include <sys/kernel.h>
46 #include <sys/eventhandler.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/priv.h>
51 #include <sys/proc.h>
52 #include <sys/protosw.h>
53 #include <sys/rwlock.h>
54 #include <sys/signalvar.h>
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <sys/sx.h>
58 #include <sys/sysctl.h>
59 #include <sys/systm.h>
60 
61 #include <vm/uma.h>
62 
63 #include <net/if.h>
64 #include <net/if_var.h>
65 #include <net/route.h>
66 #include <net/route/route_ctl.h>
67 #include <net/vnet.h>
68 
69 #include <netinet/in.h>
70 #include <netinet/in_systm.h>
71 #include <netinet/in_fib.h>
72 #include <netinet/in_pcb.h>
73 #include <netinet/in_var.h>
74 #include <netinet/if_ether.h>
75 #include <netinet/ip.h>
76 #include <netinet/ip_var.h>
77 #include <netinet/ip_mroute.h>
78 #include <netinet/ip_icmp.h>
79 
80 #include <netipsec/ipsec_support.h>
81 
82 #include <machine/stdarg.h>
83 #include <security/mac/mac_framework.h>
84 
/*
 * Per-protocol IPv4 input handler table, defined elsewhere.  rip_input()
 * consults it to see whether it is itself the registered handler for the
 * protocol of the packet being processed.
 */
85 extern ipproto_input_t *ip_protox[];
86 
/*
 * Per-vnet TTL value, exported read/write through sysctl.  In this file it is
 * copied into inp_ip_ttl of every new raw socket by rip_attach().
 */
87 VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
88 SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
89     &VNET_NAME(ip_defttl), 0,
90     "Maximum TTL on IP packets");
91 
/*
 * Per-vnet pcb database for raw IP sockets.  It is set up by rip_init() and
 * all pcbs allocated by rip_attach() live in it.
 */
92 VNET_DEFINE(struct inpcbinfo, ripcbinfo);
93 #define	V_ripcbinfo		VNET(ripcbinfo)
94 
95 /*
96  * Control and data hooks for ipfw, dummynet, divert and so on.
97  * The data hooks are not used here but it is convenient
98  * to keep them all in one place.
 *
 * Within this file only the two control hooks are called: rip_ctloutput()
 * forwards the IP_FW* options to V_ip_fw_ctl_ptr and the IP_DUMMYNET*
 * options to ip_dn_ctl_ptr, returning ENOPROTOOPT while a hook is NULL.
99  */
100 VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;
101 
102 int	(*ip_dn_ctl_ptr)(struct sockopt *);
103 int	(*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
104 void	(*ip_divert_ptr)(struct mbuf *, bool);
105 int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
106 
107 #ifdef INET
108 /*
109  * Hooks for multicast routing. They all default to NULL, so leave them not
110  * initialized and rely on BSS being set to 0.
 *
 * Callers in this file test a hook for NULL before invoking it
 * (see rip_ctloutput() and rip_detach()).
111  */
112 
113 /*
114  * The socket used to communicate with the multicast routing daemon.
 * rip_detach() compares the socket being torn down against it.
115  */
116 VNET_DEFINE(struct socket *, ip_mrouter);
117 
118 /*
119  * The various mrouter and rsvp functions.
 *
 * Used in this file: ip_mrouter_set/ip_mrouter_get (MRT_* socket options),
 * ip_mrouter_done (socket detach), ip_rsvp_vif (IP_RSVP_VIF_ON/OFF) and
 * ip_rsvp_force_done (socket detach).  The remaining pointers are only
 * defined here.
120  */
121 int (*ip_mrouter_set)(struct socket *, struct sockopt *);
122 int (*ip_mrouter_get)(struct socket *, struct sockopt *);
123 int (*ip_mrouter_done)(void);
124 int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
125 		   struct ip_moptions *);
126 int (*mrt_ioctl)(u_long, caddr_t, int);
127 int (*legal_vif_num)(int);
128 u_long (*ip_mcast_src)(int);
129 
130 int (*rsvp_input_p)(struct mbuf **, int *, int);
131 int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
132 void (*ip_rsvp_force_done)(struct socket *);
133 #endif /* INET */
134 
/*
 * Send and receive buffer reservations for raw IP sockets.  Both values are
 * passed to soreserve() by rip_attach() and can be changed at run time
 * through the net.inet.raw sysctl nodes declared below.
 */
135 u_long	rip_sendspace = 9216;
136 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
137     &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
138 
139 u_long	rip_recvspace = 9216;
140 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
141     &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
142 
143 /*
144  * Hash functions
 *
 * INP_PCBHASH_RAW_SIZE is the hash size handed to in_pcbinfo_init() by
 * rip_init().
 *
 * INP_PCBHASH_RAW() sums the protocol and both addresses, reduces the sum
 * modulo 'mask' and adds one, so it never evaluates to 0.  Bucket 0 is used
 * by rip_inshash() for pcbs whose protocol, local address or foreign
 * address is unset, and is walked separately by rip_input().  Because the
 * hash is a plain sum, swapping the two address arguments gives the same
 * bucket.
145  */
146 
147 #define INP_PCBHASH_RAW_SIZE	256
148 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
149         (((proto) + (laddr) + (faddr)) % (mask) + 1)
150 
151 #ifdef INET
152 static void
153 rip_inshash(struct inpcb *inp)
154 {
155 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
156 	struct inpcbhead *pcbhash;
157 	int hash;
158 
159 	INP_HASH_WLOCK_ASSERT(pcbinfo);
160 	INP_WLOCK_ASSERT(inp);
161 
162 	if (inp->inp_ip_p != 0 &&
163 	    inp->inp_laddr.s_addr != INADDR_ANY &&
164 	    inp->inp_faddr.s_addr != INADDR_ANY) {
165 		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
166 		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
167 	} else
168 		hash = 0;
169 	pcbhash = &pcbinfo->ipi_hashbase[hash];
170 	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
171 }
172 
173 static void
174 rip_delhash(struct inpcb *inp)
175 {
176 
177 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
178 	INP_WLOCK_ASSERT(inp);
179 
180 	CK_LIST_REMOVE(inp, inp_hash);
181 }
182 #endif /* INET */
183 
/* Storage description for raw IP pcbs; handed to in_pcbinfo_init() below. */
184 INPCBSTORAGE_DEFINE(ripcbstor, inpcb, "rawinp", "ripcb", "rip", "riphash");
185 
/*
 * Initialize the per-vnet raw IP pcb database with INP_PCBHASH_RAW_SIZE hash
 * buckets.  Registered through VNET_SYSINIT below; the argument is unused.
 */
186 static void
187 rip_init(void *arg __unused)
188 {
189 
190 	in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1);
191 }
192 VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL);
193 
194 #ifdef VIMAGE
/*
 * Tear down the per-vnet raw IP pcb database set up by rip_init().  Only
 * compiled with VIMAGE and registered through VNET_SYSUNINIT below; the
 * argument is unused.
 */
195 static void
196 rip_destroy(void *unused __unused)
197 {
198 
199 	in_pcbinfo_destroy(&V_ripcbinfo);
200 }
201 VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL);
202 #endif
203 
204 #ifdef INET
205 static int
206 rip_append(struct inpcb *inp, struct ip *ip, struct mbuf *m,
207     struct sockaddr_in *ripsrc)
208 {
209 	struct socket *so = inp->inp_socket;
210 	struct mbuf *n, *opts = NULL;
211 
212 	INP_LOCK_ASSERT(inp);
213 
214 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
215 	/* check AH/ESP integrity. */
216 	if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0)
217 		return (0);
218 #endif /* IPSEC */
219 #ifdef MAC
220 	if (mac_inpcb_check_deliver(inp, m) != 0)
221 		return (0);
222 #endif
223 	/* Check the minimum TTL for socket. */
224 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
225 		return (0);
226 
227 	if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
228 		return (0);
229 
230 	if ((inp->inp_flags & INP_CONTROLOPTS) ||
231 	    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
232 		ip_savecontrol(inp, &opts, ip, n);
233 	SOCKBUF_LOCK(&so->so_rcv);
234 	if (sbappendaddr_locked(&so->so_rcv,
235 	    (struct sockaddr *)ripsrc, n, opts) == 0) {
236 		soroverflow_locked(so);
237 		m_freem(n);
238 		if (opts)
239 			m_freem(opts);
240 		return (0);
241 	}
242 	sorwakeup_locked(so);
243 
244 	return (1);
245 }
246 
/*
 * Lookup key passed by rip_input() to the pcb match callbacks
 * rip_inp_match1() and rip_inp_match2().
 */
247 struct rip_inp_match_ctx {
	/* IP header of the packet being delivered. */
248 	struct ip *ip;
	/* Protocol number rip_input() was called with. */
249 	int proto;
250 };
251 
252 static bool
253 rip_inp_match1(const struct inpcb *inp, void *v)
254 {
255 	struct rip_inp_match_ctx *ctx = v;
256 
257 	if (inp->inp_ip_p != ctx->proto)
258 		return (false);
259 #ifdef INET6
260 	/* XXX inp locking */
261 	if ((inp->inp_vflag & INP_IPV4) == 0)
262 		return (false);
263 #endif
264 	if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr)
265 		return (false);
266 	if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr)
267 		return (false);
268 	return (true);
269 }
270 
271 static bool
272 rip_inp_match2(const struct inpcb *inp, void *v)
273 {
274 	struct rip_inp_match_ctx *ctx = v;
275 
276 	if (inp->inp_ip_p && inp->inp_ip_p != ctx->proto)
277 		return (false);
278 #ifdef INET6
279 	/* XXX inp locking */
280 	if ((inp->inp_vflag & INP_IPV4) == 0)
281 		return (false);
282 #endif
283 	if (!in_nullhost(inp->inp_laddr) &&
284 	    !in_hosteq(inp->inp_laddr, ctx->ip->ip_dst))
285 		return (false);
286 	if (!in_nullhost(inp->inp_faddr) &&
287 	    !in_hosteq(inp->inp_faddr, ctx->ip->ip_src))
288 		return (false);
289 	return (true);
290 }
291 
292 /*
293  * Setup generic address and protocol structures for raw_input routine, then
294  * pass them along with mbuf chain.
 *
 * Delivery walks V_ripcbinfo in two passes.  The first pass visits the hash
 * bucket computed from (proto, source, destination) and uses
 * rip_inp_match1(), i.e. fully specified sockets.  The second pass visits
 * bucket 0 and uses rip_inp_match2(), where unset pcb fields act as
 * wildcards.  Every accepted socket receives its own copy of the packet via
 * rip_append().
 *
 * mp     in: the received packet; *mp is set to NULL on entry to the body.
 * offp   not used by this function.
 * proto  protocol number to match sockets against.
 *
 * The packet is always disposed of here (handed to icmp_error() or freed)
 * and the return value is always IPPROTO_DONE.
295  */
296 int
297 rip_input(struct mbuf **mp, int *offp, int proto)
298 {
299 	struct rip_inp_match_ctx ctx = {
300 		.ip = mtod(*mp, struct ip *),
301 		.proto = proto,
302 	};
303 	struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
304 	    INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx);
305 	struct ifnet *ifp;
306 	struct mbuf *m = *mp;
307 	struct inpcb *inp;
308 	struct sockaddr_in ripsrc;
309 	int appended;
310 
311 	*mp = NULL;
312 	appended = 0;
313 
	/* Source address reported to every receiving socket. */
314 	bzero(&ripsrc, sizeof(ripsrc));
315 	ripsrc.sin_len = sizeof(ripsrc);
316 	ripsrc.sin_family = AF_INET;
317 	ripsrc.sin_addr = ctx.ip->ip_src;
318 
319 	ifp = m->m_pkthdr.rcvif;
320 
	/*
	 * First pass: fully specified sockets.  The hash is a plain sum, so
	 * passing source/destination in this order selects the same bucket
	 * rip_inshash() chose from the pcb's (laddr, faddr).
	 */
321 	inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr,
322 	    ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
323 	while ((inp = inp_next(&inpi)) != NULL) {
324 		INP_RLOCK_ASSERT(inp);
325 		if (jailed_without_vnet(inp->inp_cred) &&
326 		    prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) {
327 			/*
328 			 * XXX: If faddr was bound to multicast group,
329 			 * jailed raw socket will drop datagram.
330 			 */
331 			continue;
332 		}
333 		appended += rip_append(inp, ctx.ip, m, &ripsrc);
334 	}
335 
	/*
	 * Second pass: bucket 0, where rip_inshash() puts every pcb that is
	 * not fully specified; the same iterator is reused with the wildcard
	 * predicate.
	 */
336 	inpi.hash = 0;
337 	inpi.match = rip_inp_match2;
338 	MPASS(inpi.inp == NULL);
339 	while ((inp = inp_next(&inpi)) != NULL) {
340 		INP_RLOCK_ASSERT(inp);
341 		if (jailed_without_vnet(inp->inp_cred) &&
342 		    !IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr)) &&
343 		    prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0)
344 			/*
345 			 * Allow raw socket in jail to receive multicast;
346 			 * assume process had PRIV_NETINET_RAW at attach,
347 			 * and fall through into normal filter path if so.
348 			 */
349 			continue;
350 		/*
351 		 * If this raw socket has multicast state, and we
352 		 * have received a multicast, check if this socket
353 		 * should receive it, as multicast filtering is now
354 		 * the responsibility of the transport layer.
355 		 */
356 		if (inp->inp_moptions != NULL &&
357 		    IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr))) {
358 			/*
359 			 * If the incoming datagram is for IGMP, allow it
360 			 * through unconditionally to the raw socket.
361 			 *
362 			 * In the case of IGMPv2, we may not have explicitly
363 			 * joined the group, and may have set IFF_ALLMULTI
364 			 * on the interface. imo_multi_filter() may discard
365 			 * control traffic we actually need to see.
366 			 *
367 			 * Userland multicast routing daemons should continue
368 			 * to filter the control traffic appropriately.
369 			 */
370 			int blocked;
371 
372 			blocked = MCAST_PASS;
373 			if (proto != IPPROTO_IGMP) {
374 				struct sockaddr_in group;
375 
376 				bzero(&group, sizeof(struct sockaddr_in));
377 				group.sin_len = sizeof(struct sockaddr_in);
378 				group.sin_family = AF_INET;
379 				group.sin_addr = ctx.ip->ip_dst;
380 
381 				blocked = imo_multi_filter(inp->inp_moptions,
382 				    ifp,
383 				    (struct sockaddr *)&group,
384 				    (struct sockaddr *)&ripsrc);
385 			}
386 
387 			if (blocked != MCAST_PASS) {
388 				IPSTAT_INC(ips_notmember);
389 				continue;
390 			}
391 		}
392 		appended += rip_append(inp, ctx.ip, m, &ripsrc);
393 	}
	/*
	 * Nobody took the packet and rip_input is the registered handler for
	 * this protocol: count it as "no protocol" instead of "delivered" and
	 * answer with an ICMP protocol-unreachable.  Otherwise the original
	 * mbuf is simply freed, since receivers only ever got copies.
	 */
394 	if (appended == 0 && ip_protox[ctx.ip->ip_p] == rip_input) {
395 		IPSTAT_INC(ips_noproto);
396 		IPSTAT_DEC(ips_delivered);
397 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
398 	} else
399 		m_freem(m);
400 	return (IPPROTO_DONE);
401 }
402 
403 /*
404  * Generate IP header and pass packet to ip_output.  Tack on options user may
405  * have setup with control call.
 *
 * so        sending socket; its pcb must exist.
 * pruflags  not used by this function.
 * m         payload (or, with INP_HDRINCL, a complete IP packet).  It is
 *           freed on the error paths below or handed to ip_output().
 * nam       destination; must be NULL on a connected socket and a
 *           sockaddr_in on an unconnected one.
 * control   control data; freed immediately and otherwise ignored.
 * td        not used by this function.
 *
 * Returns 0 or an errno value.  The pcb read lock is taken once the header
 * is available and is released on every exit after that point.
406  */
407 static int
408 rip_send(struct socket *so, int pruflags, struct mbuf *m, struct sockaddr *nam,
409     struct mbuf *control, struct thread *td)
410 {
411 	struct epoch_tracker et;
412 	struct ip *ip;
413 	struct inpcb *inp;
414 	in_addr_t *dst;
415 	int error, flags, cnt, hlen;
416 	u_char opttype, optlen, *cp;
417 
418 	inp = sotoinpcb(so);
419 	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
420 
	/* Raw IP sockets take no control messages on send; discard them. */
421 	if (control != NULL) {
422 		m_freem(control);
423 		control = NULL;
424 	}
425 
	/*
	 * Pick the destination: the connected peer, or the address supplied
	 * with this call.  Supplying an address on a connected socket is an
	 * error, as is omitting one on an unconnected socket.
	 */
426 	if (so->so_state & SS_ISCONNECTED) {
427 		if (nam) {
428 			error = EISCONN;
429 			m_freem(m);
430 			return (error);
431 		}
432 		dst = &inp->inp_faddr.s_addr;
433 	} else {
434 		if (nam == NULL)
435 			error = ENOTCONN;
436 		else if (nam->sa_family != AF_INET)
437 			error = EAFNOSUPPORT;
438 		else if (nam->sa_len != sizeof(struct sockaddr_in))
439 			error = EINVAL;
440 		else
441 			error = 0;
442 		if (error != 0) {
443 			m_freem(m);
444 			return (error);
445 		}
446 		dst = &((struct sockaddr_in *)nam)->sin_addr.s_addr;
447 	}
448 
449 	flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
450 	    IP_ALLOWBROADCAST;
451 
452 	/*
453 	 * If the user handed us a complete IP packet, use it.  Otherwise,
454 	 * allocate an mbuf for a header and fill it in.
455 	 */
456 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
457 		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
458 			m_freem(m);
459 			return(EMSGSIZE);
460 		}
461 		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
462 		if (m == NULL)
463 			return(ENOBUFS);
464 
		/* Build the header from the socket's pcb settings. */
465 		INP_RLOCK(inp);
466 		ip = mtod(m, struct ip *);
467 		ip->ip_tos = inp->inp_ip_tos;
468 		if (inp->inp_flags & INP_DONTFRAG)
469 			ip->ip_off = htons(IP_DF);
470 		else
471 			ip->ip_off = htons(0);
472 		ip->ip_p = inp->inp_ip_p;
473 		ip->ip_len = htons(m->m_pkthdr.len);
474 		ip->ip_src = inp->inp_laddr;
475 		ip->ip_dst.s_addr = *dst;
476 #ifdef ROUTE_MPATH
477 		if (CALC_FLOWID_OUTBOUND) {
478 			uint32_t hash_type, hash_val;
479 
480 			hash_val = fib4_calc_software_hash(ip->ip_src,
481 			    ip->ip_dst, 0, 0, ip->ip_p, &hash_type);
482 			m->m_pkthdr.flowid = hash_val;
483 			M_HASHTYPE_SET(m, hash_type);
484 			flags |= IP_NODEFAULTFLOWID;
485 		}
486 #endif
487 		if (jailed(inp->inp_cred)) {
488 			/*
489 			 * prison_local_ip4() would be good enough but would
490 			 * let a source of INADDR_ANY pass, which we do not
491 			 * want to see from jails.
492 			 */
493 			if (ip->ip_src.s_addr == INADDR_ANY) {
494 				NET_EPOCH_ENTER(et);
495 				error = in_pcbladdr(inp, &ip->ip_dst,
496 				    &ip->ip_src, inp->inp_cred);
497 				NET_EPOCH_EXIT(et);
498 			} else {
499 				error = prison_local_ip4(inp->inp_cred,
500 				    &ip->ip_src);
501 			}
502 			if (error != 0) {
503 				INP_RUNLOCK(inp);
504 				m_freem(m);
505 				return (error);
506 			}
507 		}
508 		ip->ip_ttl = inp->inp_ip_ttl;
509 	} else {
		/*
		 * The caller supplied the IP header.  Make sure the fixed
		 * header, and then the full header including options, is
		 * contiguous in the first mbuf before looking at it.
		 */
510 		if (m->m_pkthdr.len > IP_MAXPACKET) {
511 			m_freem(m);
512 			return (EMSGSIZE);
513 		}
514 		if (m->m_pkthdr.len < sizeof(*ip)) {
515 			m_freem(m);
516 			return (EINVAL);
517 		}
518 		m = m_pullup(m, sizeof(*ip));
519 		if (m == NULL)
520 			return (ENOMEM);
521 		ip = mtod(m, struct ip *);
522 		hlen = ip->ip_hl << 2;
523 		if (m->m_len < hlen) {
524 			m = m_pullup(m, hlen);
525 			if (m == NULL)
526 				return (EINVAL);
527 			ip = mtod(m, struct ip *);
528 		}
529 #ifdef ROUTE_MPATH
		/*
		 * NOTE(review): the addresses are passed as (dst, src) here
		 * but as (src, dst) in the non-HDRINCL branch above; confirm
		 * whether the difference is intended.
		 */
530 		if (CALC_FLOWID_OUTBOUND) {
531 			uint32_t hash_type, hash_val;
532 
533 			hash_val = fib4_calc_software_hash(ip->ip_dst,
534 			    ip->ip_src, 0, 0, ip->ip_p, &hash_type);
535 			m->m_pkthdr.flowid = hash_val;
536 			M_HASHTYPE_SET(m, hash_type);
537 			flags |= IP_NODEFAULTFLOWID;
538 		}
539 #endif
540 		INP_RLOCK(inp);
541 		/*
542 		 * Don't allow both user specified and setsockopt options,
543 		 * and don't allow packet length sizes that will crash.
544 		 */
545 		if ((hlen < sizeof (*ip))
546 		    || ((hlen > sizeof (*ip)) && inp->inp_options)
547 		    || (ntohs(ip->ip_len) != m->m_pkthdr.len)) {
548 			INP_RUNLOCK(inp);
549 			m_freem(m);
550 			return (EINVAL);
551 		}
		/* The source address must be acceptable for the sender's jail. */
552 		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
553 		if (error != 0) {
554 			INP_RUNLOCK(inp);
555 			m_freem(m);
556 			return (error);
557 		}
558 		/*
559 		 * Don't allow IP options which do not have the required
560 		 * structure as specified in section 3.1 of RFC 791 on
561 		 * pages 15-23.
562 		 */
563 		cp = (u_char *)(ip + 1);
564 		cnt = hlen - sizeof (struct ip);
		/*
		 * Walk the option area: EOL ends the walk, NOP is a single
		 * byte, and every other option must carry a length byte that
		 * covers at least type+length and fits in what remains.
		 */
565 		for (; cnt > 0; cnt -= optlen, cp += optlen) {
566 			opttype = cp[IPOPT_OPTVAL];
567 			if (opttype == IPOPT_EOL)
568 				break;
569 			if (opttype == IPOPT_NOP) {
570 				optlen = 1;
571 				continue;
572 			}
573 			if (cnt < IPOPT_OLEN + sizeof(u_char)) {
574 				INP_RUNLOCK(inp);
575 				m_freem(m);
576 				return (EINVAL);
577 			}
578 			optlen = cp[IPOPT_OLEN];
579 			if (optlen < IPOPT_OLEN + sizeof(u_char) ||
580 			    optlen > cnt) {
581 				INP_RUNLOCK(inp);
582 				m_freem(m);
583 				return (EINVAL);
584 			}
585 		}
586 		/*
587 		 * This doesn't allow application to specify ID of zero,
588 		 * but we got this limitation from the beginning of history.
589 		 */
590 		if (ip->ip_id == 0)
591 			ip_fillid(ip);
592 
593 		/*
594 		 * XXX prevent ip_output from overwriting header fields.
595 		 */
596 		flags |= IP_RAWOUTPUT;
597 		IPSTAT_INC(ips_rawout);
598 	}
599 
	/* Both branches arrive here holding the pcb read lock. */
600 	if (inp->inp_flags & INP_ONESBCAST)
601 		flags |= IP_SENDONES;
602 
603 #ifdef MAC
604 	mac_inpcb_create_mbuf(inp, m);
605 #endif
606 
607 	NET_EPOCH_ENTER(et);
608 	error = ip_output(m, inp->inp_options, NULL, flags,
609 	    inp->inp_moptions, inp);
610 	NET_EPOCH_EXIT(et);
611 	INP_RUNLOCK(inp);
612 	return (error);
613 }
614 
615 /*
616  * Raw IP socket option processing.
617  *
618  * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
619  * only be created by a privileged process, and as such, socket option
620  * operations to manage system properties on any raw socket were allowed to
621  * take place without explicit additional access control checks.  However,
622  * raw sockets can now also be created in jail(), and therefore explicit
623  * checks are now required.  Likewise, raw sockets can be used by a process
624  * after it gives up privilege, so some caution is required.  For options
625  * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
626  * performed in ip_ctloutput() and therefore no check occurs here.
627  * Unilaterally checking priv_check() here breaks normal IP socket option
628  * operations on raw sockets.
629  *
630  * When adding new socket options here, make sure to add access control
631  * checks here as necessary.
632  *
633  * XXX-BZ inp locking?
 *
 * so    socket the option is applied to.
 * sopt  option request (level, name, direction and value buffer).
 *
 * Returns 0 or an errno value.  Levels other than IPPROTO_IP yield EINVAL,
 * except SOL_SOCKET/SO_SETFIB, which copies the socket's FIB number into
 * the pcb.  Option names not handled here are passed to ip_ctloutput().
634  */
635 int
636 rip_ctloutput(struct socket *so, struct sockopt *sopt)
637 {
638 	struct	inpcb *inp = sotoinpcb(so);
639 	int	error, optval;
640 
641 	if (sopt->sopt_level != IPPROTO_IP) {
642 		if ((sopt->sopt_level == SOL_SOCKET) &&
643 		    (sopt->sopt_name == SO_SETFIB)) {
644 			inp->inp_inc.inc_fibnum = so->so_fibnum;
645 			return (0);
646 		}
647 		return (EINVAL);
648 	}
649 
650 	error = 0;
651 	switch (sopt->sopt_dir) {
652 	case SOPT_GET:
653 		switch (sopt->sopt_name) {
654 		case IP_HDRINCL:
			/* Reports the raw flag bit, not a normalized 0/1. */
655 			optval = inp->inp_flags & INP_HDRINCL;
656 			error = sooptcopyout(sopt, &optval, sizeof optval);
657 			break;
658 
659 		case IP_FW3:	/* generic ipfw v.3 functions */
660 		case IP_FW_ADD:	/* ADD actually returns the body... */
661 		case IP_FW_GET:
662 		case IP_FW_TABLE_GETSIZE:
663 		case IP_FW_TABLE_LIST:
664 		case IP_FW_NAT_GET_CONFIG:
665 		case IP_FW_NAT_GET_LOG:
			/* Handled by the ipfw control hook when it is set. */
666 			if (V_ip_fw_ctl_ptr != NULL)
667 				error = V_ip_fw_ctl_ptr(sopt);
668 			else
669 				error = ENOPROTOOPT;
670 			break;
671 
672 		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
673 		case IP_DUMMYNET_GET:
			/* Handled by the dummynet control hook when it is set. */
674 			if (ip_dn_ctl_ptr != NULL)
675 				error = ip_dn_ctl_ptr(sopt);
676 			else
677 				error = ENOPROTOOPT;
678 			break ;
679 
680 		case MRT_INIT:
681 		case MRT_DONE:
682 		case MRT_ADD_VIF:
683 		case MRT_DEL_VIF:
684 		case MRT_ADD_MFC:
685 		case MRT_DEL_MFC:
686 		case MRT_VERSION:
687 		case MRT_ASSERT:
688 		case MRT_API_SUPPORT:
689 		case MRT_API_CONFIG:
690 		case MRT_ADD_BW_UPCALL:
691 		case MRT_DEL_BW_UPCALL:
			/*
			 * Multicast routing options need PRIV_NETINET_MROUTE
			 * and are only accepted on an IGMP raw socket.
			 */
692 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
693 			if (error != 0)
694 				return (error);
695 			if (inp->inp_ip_p != IPPROTO_IGMP)
696 				return (EOPNOTSUPP);
697 			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
698 				EOPNOTSUPP;
699 			break;
700 
701 		default:
702 			error = ip_ctloutput(so, sopt);
703 			break;
704 		}
705 		break;
706 
707 	case SOPT_SET:
708 		switch (sopt->sopt_name) {
709 		case IP_HDRINCL:
			/* Any non-zero value turns header inclusion on. */
710 			error = sooptcopyin(sopt, &optval, sizeof optval,
711 					    sizeof optval);
712 			if (error)
713 				break;
714 			if (optval)
715 				inp->inp_flags |= INP_HDRINCL;
716 			else
717 				inp->inp_flags &= ~INP_HDRINCL;
718 			break;
719 
720 		case IP_FW3:	/* generic ipfw v.3 functions */
721 		case IP_FW_ADD:
722 		case IP_FW_DEL:
723 		case IP_FW_FLUSH:
724 		case IP_FW_ZERO:
725 		case IP_FW_RESETLOG:
726 		case IP_FW_TABLE_ADD:
727 		case IP_FW_TABLE_DEL:
728 		case IP_FW_TABLE_FLUSH:
729 		case IP_FW_NAT_CFG:
730 		case IP_FW_NAT_DEL:
731 			if (V_ip_fw_ctl_ptr != NULL)
732 				error = V_ip_fw_ctl_ptr(sopt);
733 			else
734 				error = ENOPROTOOPT;
735 			break;
736 
737 		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
738 		case IP_DUMMYNET_CONFIGURE:
739 		case IP_DUMMYNET_DEL:
740 		case IP_DUMMYNET_FLUSH:
741 			if (ip_dn_ctl_ptr != NULL)
742 				error = ip_dn_ctl_ptr(sopt);
743 			else
744 				error = ENOPROTOOPT ;
745 			break ;
746 
		/*
		 * The RSVP options below all require PRIV_NETINET_MROUTE.
		 * All but IP_RSVP_OFF also require an RSVP raw socket.
		 */
747 		case IP_RSVP_ON:
748 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
749 			if (error != 0)
750 				return (error);
751 			if (inp->inp_ip_p != IPPROTO_RSVP)
752 				return (EOPNOTSUPP);
753 			error = ip_rsvp_init(so);
754 			break;
755 
756 		case IP_RSVP_OFF:
757 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
758 			if (error != 0)
759 				return (error);
760 			error = ip_rsvp_done();
761 			break;
762 
763 		case IP_RSVP_VIF_ON:
764 		case IP_RSVP_VIF_OFF:
765 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
766 			if (error != 0)
767 				return (error);
768 			if (inp->inp_ip_p != IPPROTO_RSVP)
769 				return (EOPNOTSUPP);
770 			error = ip_rsvp_vif ?
771 				ip_rsvp_vif(so, sopt) : EINVAL;
772 			break;
773 
774 		case MRT_INIT:
775 		case MRT_DONE:
776 		case MRT_ADD_VIF:
777 		case MRT_DEL_VIF:
778 		case MRT_ADD_MFC:
779 		case MRT_DEL_MFC:
780 		case MRT_VERSION:
781 		case MRT_ASSERT:
782 		case MRT_API_SUPPORT:
783 		case MRT_API_CONFIG:
784 		case MRT_ADD_BW_UPCALL:
785 		case MRT_DEL_BW_UPCALL:
			/* Same privilege and protocol rules as on the GET side. */
786 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
787 			if (error != 0)
788 				return (error);
789 			if (inp->inp_ip_p != IPPROTO_IGMP)
790 				return (EOPNOTSUPP);
791 			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
792 					EOPNOTSUPP;
793 			break;
794 
795 		default:
796 			error = ip_ctloutput(so, sopt);
797 			break;
798 		}
799 		break;
800 	}
801 
802 	return (error);
803 }
804 
/*
 * Control-input handler for raw IP.  The only action taken here is to pass
 * the ICMP message to IPSEC_CTLINPUT() when IPsec support is compiled in and
 * enabled for IPv4; otherwise the function does nothing.
 */
805 void
806 rip_ctlinput(struct icmp *icmp)
807 {
808 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
809 	if (IPSEC_ENABLED(ipv4))
810 		IPSEC_CTLINPUT(ipv4, icmp);
811 #endif
812 }
813 
814 static int
815 rip_attach(struct socket *so, int proto, struct thread *td)
816 {
817 	struct inpcb *inp;
818 	int error;
819 
820 	inp = sotoinpcb(so);
821 	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
822 
823 	error = priv_check(td, PRIV_NETINET_RAW);
824 	if (error)
825 		return (error);
826 	if (proto >= IPPROTO_MAX || proto < 0)
827 		return EPROTONOSUPPORT;
828 	error = soreserve(so, rip_sendspace, rip_recvspace);
829 	if (error)
830 		return (error);
831 	error = in_pcballoc(so, &V_ripcbinfo);
832 	if (error)
833 		return (error);
834 	inp = (struct inpcb *)so->so_pcb;
835 	inp->inp_ip_p = proto;
836 	inp->inp_ip_ttl = V_ip_defttl;
837 	INP_HASH_WLOCK(&V_ripcbinfo);
838 	rip_inshash(inp);
839 	INP_HASH_WUNLOCK(&V_ripcbinfo);
840 	INP_WUNLOCK(inp);
841 	return (0);
842 }
843 
844 static void
845 rip_detach(struct socket *so)
846 {
847 	struct inpcb *inp;
848 
849 	inp = sotoinpcb(so);
850 	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
851 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
852 	    ("rip_detach: not closed"));
853 
854 	/* Disable mrouter first */
855 	if (so == V_ip_mrouter && ip_mrouter_done)
856 		ip_mrouter_done();
857 
858 	INP_WLOCK(inp);
859 	INP_HASH_WLOCK(&V_ripcbinfo);
860 	rip_delhash(inp);
861 	INP_HASH_WUNLOCK(&V_ripcbinfo);
862 
863 	if (ip_rsvp_force_done)
864 		ip_rsvp_force_done(so);
865 	if (so == V_ip_rsvpd)
866 		ip_rsvp_done();
867 	in_pcbdetach(inp);
868 	in_pcbfree(inp);
869 }
870 
871 static void
872 rip_dodisconnect(struct socket *so, struct inpcb *inp)
873 {
874 	struct inpcbinfo *pcbinfo;
875 
876 	pcbinfo = inp->inp_pcbinfo;
877 	INP_WLOCK(inp);
878 	INP_HASH_WLOCK(pcbinfo);
879 	rip_delhash(inp);
880 	inp->inp_faddr.s_addr = INADDR_ANY;
881 	rip_inshash(inp);
882 	INP_HASH_WUNLOCK(pcbinfo);
883 	SOCK_LOCK(so);
884 	so->so_state &= ~SS_ISCONNECTED;
885 	SOCK_UNLOCK(so);
886 	INP_WUNLOCK(inp);
887 }
888 
889 static void
890 rip_abort(struct socket *so)
891 {
892 	struct inpcb *inp;
893 
894 	inp = sotoinpcb(so);
895 	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
896 
897 	rip_dodisconnect(so, inp);
898 }
899 
900 static void
901 rip_close(struct socket *so)
902 {
903 	struct inpcb *inp;
904 
905 	inp = sotoinpcb(so);
906 	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
907 
908 	rip_dodisconnect(so, inp);
909 }
910 
911 static int
912 rip_disconnect(struct socket *so)
913 {
914 	struct inpcb *inp;
915 
916 	if ((so->so_state & SS_ISCONNECTED) == 0)
917 		return (ENOTCONN);
918 
919 	inp = sotoinpcb(so);
920 	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
921 
922 	rip_dodisconnect(so, inp);
923 	return (0);
924 }
925 
926 static int
927 rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
928 {
929 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
930 	struct inpcb *inp;
931 	int error;
932 
933 	if (nam->sa_family != AF_INET)
934 		return (EAFNOSUPPORT);
935 	if (nam->sa_len != sizeof(*addr))
936 		return (EINVAL);
937 
938 	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
939 	if (error != 0)
940 		return (error);
941 
942 	inp = sotoinpcb(so);
943 	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
944 
945 	if (CK_STAILQ_EMPTY(&V_ifnet) ||
946 	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
947 	    (addr->sin_addr.s_addr &&
948 	     (inp->inp_flags & INP_BINDANY) == 0 &&
949 	     ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
950 		return (EADDRNOTAVAIL);
951 
952 	INP_WLOCK(inp);
953 	INP_HASH_WLOCK(&V_ripcbinfo);
954 	rip_delhash(inp);
955 	inp->inp_laddr = addr->sin_addr;
956 	rip_inshash(inp);
957 	INP_HASH_WUNLOCK(&V_ripcbinfo);
958 	INP_WUNLOCK(inp);
959 	return (0);
960 }
961 
962 static int
963 rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
964 {
965 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
966 	struct inpcb *inp;
967 
968 	if (nam->sa_len != sizeof(*addr))
969 		return (EINVAL);
970 	if (CK_STAILQ_EMPTY(&V_ifnet))
971 		return (EADDRNOTAVAIL);
972 	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
973 		return (EAFNOSUPPORT);
974 
975 	inp = sotoinpcb(so);
976 	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
977 
978 	INP_WLOCK(inp);
979 	INP_HASH_WLOCK(&V_ripcbinfo);
980 	rip_delhash(inp);
981 	inp->inp_faddr = addr->sin_addr;
982 	rip_inshash(inp);
983 	INP_HASH_WUNLOCK(&V_ripcbinfo);
984 	soisconnected(so);
985 	INP_WUNLOCK(inp);
986 	return (0);
987 }
988 
989 static int
990 rip_shutdown(struct socket *so)
991 {
992 	struct inpcb *inp;
993 
994 	inp = sotoinpcb(so);
995 	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
996 
997 	INP_WLOCK(inp);
998 	socantsendmore(so);
999 	INP_WUNLOCK(inp);
1000 	return (0);
1001 }
1002 #endif /* INET */
1003 
1004 static int
1005 rip_pcblist(SYSCTL_HANDLER_ARGS)
1006 {
1007 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_ripcbinfo,
1008 	    INPLOOKUP_RLOCKPCB);
1009 	struct xinpgen xig;
1010 	struct inpcb *inp;
1011 	int error;
1012 
1013 	if (req->newptr != 0)
1014 		return (EPERM);
1015 
1016 	if (req->oldptr == 0) {
1017 		int n;
1018 
1019 		n = V_ripcbinfo.ipi_count;
1020 		n += imax(n / 8, 10);
1021 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
1022 		return (0);
1023 	}
1024 
1025 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
1026 		return (error);
1027 
1028 	bzero(&xig, sizeof(xig));
1029 	xig.xig_len = sizeof xig;
1030 	xig.xig_count = V_ripcbinfo.ipi_count;
1031 	xig.xig_gen = V_ripcbinfo.ipi_gencnt;
1032 	xig.xig_sogen = so_gencnt;
1033 	error = SYSCTL_OUT(req, &xig, sizeof xig);
1034 	if (error)
1035 		return (error);
1036 
1037 	while ((inp = inp_next(&inpi)) != NULL) {
1038 		if (inp->inp_gencnt <= xig.xig_gen &&
1039 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
1040 			struct xinpcb xi;
1041 
1042 			in_pcbtoxinpcb(inp, &xi);
1043 			error = SYSCTL_OUT(req, &xi, sizeof xi);
1044 			if (error) {
1045 				INP_RUNLOCK(inp);
1046 				break;
1047 			}
1048 		}
1049 	}
1050 
1051 	if (!error) {
1052 		/*
1053 		 * Give the user an updated idea of our state.  If the
1054 		 * generation differs from what we told her before, she knows
1055 		 * that something happened while we were processing this
1056 		 * request, and it might be necessary to retry.
1057 		 */
1058 		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
1059 		xig.xig_sogen = so_gencnt;
1060 		xig.xig_count = V_ripcbinfo.ipi_count;
1061 		error = SYSCTL_OUT(req, &xig, sizeof xig);
1062 	}
1063 
1064 	return (error);
1065 }
1066 
1067 SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
1068     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1069     rip_pcblist, "S,xinpcb",
1070     "List of active raw IP sockets");
1071 
1072 #ifdef INET
1073 struct protosw rip_protosw = {
1074 	.pr_type =		SOCK_RAW,
1075 	.pr_flags =		PR_ATOMIC|PR_ADDR,
1076 	.pr_ctloutput =		rip_ctloutput,
1077 	.pr_abort =		rip_abort,
1078 	.pr_attach =		rip_attach,
1079 	.pr_bind =		rip_bind,
1080 	.pr_connect =		rip_connect,
1081 	.pr_control =		in_control,
1082 	.pr_detach =		rip_detach,
1083 	.pr_disconnect =	rip_disconnect,
1084 	.pr_peeraddr =		in_getpeeraddr,
1085 	.pr_send =		rip_send,
1086 	.pr_shutdown =		rip_shutdown,
1087 	.pr_sockaddr =		in_getsockaddr,
1088 	.pr_sosetlabel =	in_pcbsosetlabel,
1089 	.pr_close =		rip_close
1090 };
1091 #endif /* INET */
1092