xref: /dragonfly/sys/netinet/in_pcb.c (revision 23e52a52)
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 1982, 1986, 1991, 1993, 1995
36  *	The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 4. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
63  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $
64  */
65 
66 #include "opt_ipsec.h"
67 #include "opt_inet6.h"
68 
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/malloc.h>
72 #include <sys/mbuf.h>
73 #include <sys/domain.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/proc.h>
78 #include <sys/priv.h>
79 #include <sys/jail.h>
80 #include <sys/kernel.h>
81 #include <sys/sysctl.h>
82 
83 #include <sys/thread2.h>
84 #include <sys/socketvar2.h>
85 #include <sys/msgport2.h>
86 
87 #include <machine/limits.h>
88 
89 #include <net/if.h>
90 #include <net/if_types.h>
91 #include <net/route.h>
92 
93 #include <netinet/in.h>
94 #include <netinet/in_pcb.h>
95 #include <netinet/in_var.h>
96 #include <netinet/ip_var.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #include <netinet6/ip6_var.h>
100 #endif /* INET6 */
101 
102 #ifdef IPSEC
103 #include <netinet6/ipsec.h>
104 #include <netproto/key/key.h>
105 #include <netproto/ipsec/esp_var.h>
106 #endif
107 
108 #ifdef FAST_IPSEC
109 #if defined(IPSEC) || defined(IPSEC_ESP)
110 #error "Bad idea: don't compile with both IPSEC and FAST_IPSEC!"
111 #endif
112 
113 #include <netproto/ipsec/ipsec.h>
114 #include <netproto/ipsec/key.h>
115 #define	IPSEC
116 #endif /* FAST_IPSEC */
117 
/*
 * NOTE(review): neither constant is referenced in the part of the file
 * reviewed here; presumably they bound the size of a local group
 * container -- confirm against their users.
 */
#define INP_LOCALGROUP_SIZMIN	8
#define INP_LOCALGROUP_SIZMAX	256

/* File-scope object without an initializer, hence zero-initialized. */
struct in_addr zeroin_addr;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 *
 * Each range is a (first, last) pair.  The port search in in_pcbbind()
 * and in_pcbconn_bind() counts downward when first > last (the default
 * for the low range) and upward otherwise.  The values are exported
 * read-write through the net.inet.ip.portrange sysctl node.
 */
int ipport_lowfirstauto = IPPORT_RESERVED - 1;	/* 1023 */
int ipport_lowlastauto = IPPORT_RESERVEDSTART;	/* 600 */

int ipport_firstauto = IPPORT_RESERVED;		/* 1024 */
int ipport_lastauto = IPPORT_USERRESERVED;	/* 5000 */

int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;	/* 49152 */
int ipport_hilastauto = IPPORT_HILASTAUTO;	/* 65535 */
135 
/*
 * Clamp the lvalue 'var' into the inclusive range [min, max].
 *
 * The body is wrapped in do { } while (0) so that "RANGECHK(...);" expands
 * to exactly one statement.  Without the wrapper the macro could not be
 * used as the body of an unbraced if that has its own else: the trailing
 * semicolon would end the statement early and the caller's else would be
 * left dangling.
 *
 * 'var' is evaluated more than once; do not pass an expression with side
 * effects.
 */
#define RANGECHK(var, min, max) \
	do { \
		if ((var) < (min)) { (var) = (min); } \
		else if ((var) > (max)) { (var) = (max); } \
	} while (0)
139 
/*
 * UDP encapsulation knobs.  In kernels built with IPSEC, in_baddynamic()
 * reports udpencap_port as unusable for dynamic UDP port allocation.
 */
int udpencap_enable = 1;	/* enabled by default */
int udpencap_port = 4500;	/* triggers decapsulation */
142 
143 static int
144 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
145 {
146 	int error;
147 
148 	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
149 	if (!error) {
150 		RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
151 		RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
152 
153 		RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
154 		RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
155 
156 		RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
157 		RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
158 	}
159 	return (error);
160 }
161 
162 #undef RANGECHK
163 
/*
 * net.inet.ip.portrange: one read-write integer per port-range bound.
 * All six entries share sysctl_net_ipport_check() as their handler, so
 * an access to any one of them re-clamps all six variables.
 */
SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");

SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
	   &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
	   &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
	   &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
	   &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
	   &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
	   &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
178 
179 /*
180  * in_pcb.c: manage the Protocol Control Blocks.
181  *
182  * NOTE: It is assumed that most of these functions will be called from
183  * a critical section.  XXX - There are, unfortunately, a few exceptions
184  * to this rule that should be fixed.
185  *
186  * NOTE: The caller should initialize the cpu field to the cpu running the
187  * protocol stack associated with this inpcbinfo.
188  */
189 
190 void
191 in_pcbinfo_init(struct inpcbinfo *pcbinfo)
192 {
193 	LIST_INIT(&pcbinfo->pcblisthead);
194 	pcbinfo->cpu = -1;
195 	pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB,
196 				    M_WAITOK | M_ZERO);
197 }
198 
199 struct baddynamicports baddynamicports;
200 
201 /*
202  * Check if the specified port is invalid for dynamic allocation.
203  */
204 int
205 in_baddynamic(u_int16_t port, u_int16_t proto)
206 {
207 	switch (proto) {
208 	case IPPROTO_TCP:
209 		return (DP_ISSET(baddynamicports.tcp, port));
210 	case IPPROTO_UDP:
211 #ifdef IPSEC
212 		/* Cannot preset this as it is a sysctl */
213 		if (port == udpencap_port)
214 			return (1);
215 #endif
216 		return (DP_ISSET(baddynamicports.udp, port));
217 	default:
218 		return (0);
219 	}
220 }
221 
222 
223 /*
224  * Allocate a PCB and associate it with the socket.
225  */
226 int
227 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
228 {
229 	struct inpcb *inp;
230 #ifdef IPSEC
231 	int error;
232 #endif
233 
234 	inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO);
235 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
236 	inp->inp_pcbinfo = inp->inp_cpcbinfo = pcbinfo;
237 	inp->inp_socket = so;
238 #ifdef IPSEC
239 	error = ipsec_init_policy(so, &inp->inp_sp);
240 	if (error != 0) {
241 		kfree(inp, M_PCB);
242 		return (error);
243 	}
244 #endif
245 #ifdef INET6
246 	if (INP_SOCKAF(so) == AF_INET6 && ip6_v6only)
247 		inp->inp_flags |= IN6P_IPV6_V6ONLY;
248 	if (ip6_auto_flowlabel)
249 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
250 #endif
251 	soreference(so);
252 	so->so_pcb = inp;
253 	LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
254 	pcbinfo->ipi_count++;
255 	return (0);
256 }
257 
258 /*
259  * Unlink a pcb with the intention of moving it to another cpu with a
260  * different pcbinfo.  While unlinked nothing should attempt to dereference
261  * inp_pcbinfo, NULL it out so we assert if it does.
262  */
263 void
264 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
265 {
266 	KKASSERT(inp->inp_pcbinfo == pcbinfo);
267 
268 	LIST_REMOVE(inp, inp_list);
269 	pcbinfo->ipi_count--;
270 	inp->inp_pcbinfo = NULL;
271 }
272 
273 /*
274  * Relink a pcb into a new pcbinfo.
275  */
276 void
277 in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
278 {
279 	KKASSERT(inp->inp_pcbinfo == NULL);
280 	inp->inp_pcbinfo = pcbinfo;
281 	LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
282 	pcbinfo->ipi_count++;
283 }
284 
/*
 * Bind a pcb to a local address and/or local port.
 *
 * inp - pcb to bind.  It must have neither a local port nor a
 *       non-wildcard local address yet, otherwise EINVAL is returned.
 * nam - requested local address as a struct sockaddr_in, or NULL to keep
 *       the wildcard address and pick an anonymous port.  The sockaddr
 *       may be modified (sin_port and sin_zero are cleared for the
 *       interface lookup, and prison_replace_wildcards() is applied).
 * td  - calling thread; dereferenced unconditionally, so it must not be
 *       NULL.  If td->td_proc is set its credentials are used for the
 *       privilege and jail checks.
 *
 * Returns 0 on success, otherwise EADDRNOTAVAIL, EINVAL, EACCES,
 * EADDRINUSE, EAGAIN, or the error from priv_check_cred() for a
 * low-port request.
 *
 * The port token of the pcbinfo (if any) is held from just before the
 * address checks until the function returns through "done".  Failures
 * that happen after inp_laddr/inp_lport were written reset those fields.
 *
 * NOTE(review): INP_ANONPORT is set before the anonymous port search and
 * is not cleared when that search fails.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
{
	struct socket *so = inp->inp_socket;
	unsigned short *lastport;
	struct sockaddr_in *sin;
	struct sockaddr_in jsin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct ucred *cred = NULL;
	u_short lport = 0;
	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
		return (EADDRNOTAVAIL);
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);	/* already bound */

	if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT)))
		wild = 1;    /* neither SO_REUSEADDR nor SO_REUSEPORT is set */
	if (td->td_proc)
		cred = td->td_proc->p_ucred;

	/*
	 * This has to be atomic.  If the porthash is shared across multiple
	 * protocol threads (aka tcp) then the token will be non-NULL.
	 */
	if (pcbinfo->porttoken)
		lwkt_gettoken(pcbinfo->porttoken);

	if (nam != NULL) {
		sin = (struct sockaddr_in *)nam;
		if (nam->sa_len != sizeof *sin) {
			error = EINVAL;
			goto done;
		}
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET) {
			error = EAFNOSUPPORT;
			goto done;
		}
#endif
		/*
		 * A zero return is treated as failure.  NOTE(review):
		 * presumably this rewrites a wildcard address for jailed
		 * callers -- confirm against prison_replace_wildcards().
		 */
		if (!prison_replace_wildcards(td, nam)) {
			error = EINVAL;
			goto done;
		}
		/* lport is kept in network byte order from here on. */
		lport = sin->sin_port;
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if (so->so_options & SO_REUSEADDR)
				reuseport = SO_REUSEADDR | SO_REUSEPORT;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			/*
			 * A specific unicast address must belong to a local
			 * interface.  The port and padding are zeroed so
			 * that only the address takes part in the lookup;
			 * the requested port was saved in lport above.
			 */
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof sin->sin_zero);
			if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL) {
				error = EADDRNOTAVAIL;
				goto done;
			}
		}
		if (lport != 0) {
			struct inpcb *t;

			/* GROSS */
			/*
			 * Ports below IPPORT_RESERVED need
			 * PRIV_NETINET_RESERVEDPORT when credentials are
			 * available.
			 */
			if (ntohs(lport) < IPPORT_RESERVED &&
			    cred &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) {
				error = EACCES;
				goto done;
			}
			/*
			 * Sockets not owned by uid 0 may not bind a
			 * non-multicast address over a pcb owned by a
			 * different uid, unless both sides are wildcard
			 * binds and the existing socket has SO_REUSEPORT.
			 */
			if (so->so_cred->cr_uid != 0 &&
			    !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
				t = in_pcblookup_local(pcbinfo,
						       sin->sin_addr,
						       lport,
						       INPLOOKUP_WILDCARD,
						       cred);
				if (t &&
				    (!in_nullhost(sin->sin_addr) ||
				     !in_nullhost(t->inp_laddr) ||
				     (t->inp_socket->so_options &
					 SO_REUSEPORT) == 0) &&
				    (so->so_cred->cr_uid !=
				     t->inp_socket->so_cred->cr_uid)) {
#ifdef INET6
					if (!in_nullhost(sin->sin_addr) ||
					    !in_nullhost(t->inp_laddr) ||
					    INP_SOCKAF(so) ==
					    INP_SOCKAF(t->inp_socket))
#endif
					{
						error = EADDRINUSE;
						goto done;
					}
				}
			}
			if (cred && !prison_replace_wildcards(td, nam)) {
				error = EADDRNOTAVAIL;
				goto done;
			}
			/*
			 * General conflict check: an existing pcb blocks the
			 * bind unless it shares one of the reuse options
			 * collected in 'reuseport'.
			 */
			t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport,
					       wild, cred);
			if (t && !(reuseport & t->inp_socket->so_options)) {
#ifdef INET6
				if (!in_nullhost(sin->sin_addr) ||
				    !in_nullhost(t->inp_laddr) ||
				    INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket))
#endif
				{
					error = EADDRINUSE;
					goto done;
				}
			}
		}
		inp->inp_laddr = sin->sin_addr;
	}
	/* No port requested (or no address given): pick an anonymous port. */
	if (lport == 0) {
		ushort first, last;
		int count;

		jsin.sin_family = AF_INET;
		jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
		if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			error = EINVAL;
			goto done;
		}
		inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;

		inp->inp_flags |= INP_ANONPORT;

		/*
		 * Select the range and the matching per-pcbinfo cursor
		 * that remembers the last port handed out from it.
		 */
		if (inp->inp_flags & INP_HIGHPORT) {
			first = ipport_hifirstauto;	/* sysctl */
			last  = ipport_hilastauto;
			lastport = &pcbinfo->lasthi;
		} else if (inp->inp_flags & INP_LOWPORT) {
			if (cred &&
			    (error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
				inp->inp_laddr.s_addr = INADDR_ANY;
				goto done;
			}
			first = ipport_lowfirstauto;	/* 1023 */
			last  = ipport_lowlastauto;	/* 600 */
			lastport = &pcbinfo->lastlow;
		} else {
			first = ipport_firstauto;	/* sysctl */
			last  = ipport_lastauto;
			lastport = &pcbinfo->lastport;
		}
		/*
		 * Simple check to ensure all ports are not used up causing
		 * a deadlock here.
		 *
		 * We split the two cases (up and down) so that the direction
		 * is not being tested on each round of the loop.
		 */
		if (first > last) {
			/*
			 * counting down
			 */
			count = first - last;

			do {
				if (count-- < 0) {	/* completely used? */
					inp->inp_laddr.s_addr = INADDR_ANY;
					error = EADDRNOTAVAIL;
					goto done;
				}
				/*
				 * Step the cursor (host byte order) and
				 * wrap to 'first' when it leaves the range.
				 */
				--*lastport;
				if (*lastport > first || *lastport < last)
					*lastport = first;
				lport = htons(*lastport);
			} while (in_pcblookup_local(pcbinfo, inp->inp_laddr,
						    lport, wild, cred));
		} else {
			/*
			 * counting up
			 */
			count = last - first;

			do {
				if (count-- < 0) {	/* completely used? */
					inp->inp_laddr.s_addr = INADDR_ANY;
					error = EADDRNOTAVAIL;
					goto done;
				}
				++*lastport;
				if (*lastport < first || *lastport > last)
					*lastport = first;
				lport = htons(*lastport);
			} while (in_pcblookup_local(pcbinfo, inp->inp_laddr,
						    lport, wild, cred));
		}
	}
	inp->inp_lport = lport;

	/* Run the jail address check once more on the final address. */
	jsin.sin_family = AF_INET;
	jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
	if (!prison_replace_wildcards(td, (struct sockaddr*)&jsin)) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		error = EINVAL;
		goto done;
	}
	inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;

	/* Enter the pcb into the port hash; undo the bind on failure. */
	if (in_pcbinsporthash(inp) != 0) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		error = EAGAIN;
		goto done;
	}
	error = 0;
done:
	if (pcbinfo->porttoken)
		lwkt_reltoken(pcbinfo->porttoken);
	return error;
}
512 
513 static struct inpcb *
514 in_pcblookup_addrport(struct inpcbinfo *pcbinfo, struct in_addr laddr,
515     u_short lport, struct in_addr faddr, u_short fport, struct ucred *cred)
516 {
517 	struct inpcb *inp;
518 	struct inpcbporthead *porthash;
519 	struct inpcbport *phd;
520 	struct inpcb *match = NULL;
521 
522 	/*
523 	 * If the porthashbase is shared across several cpus we need
524 	 * to lock.
525 	 */
526 	if (pcbinfo->porttoken)
527 		lwkt_gettoken(pcbinfo->porttoken);
528 
529 	/*
530 	 * Best fit PCB lookup.
531 	 *
532 	 * First see if this local port is in use by looking on the
533 	 * port hash list.
534 	 */
535 	porthash = &pcbinfo->porthashbase[
536 			INP_PCBPORTHASH(lport, pcbinfo->porthashmask)];
537 	LIST_FOREACH(phd, porthash, phd_hash) {
538 		if (phd->phd_port == lport)
539 			break;
540 	}
541 	if (phd != NULL) {
542 		LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
543 #ifdef INET6
544 			if ((inp->inp_vflag & INP_IPV4) == 0)
545 				continue;
546 #endif
547 			if (inp->inp_laddr.s_addr != INADDR_ANY &&
548 			    inp->inp_laddr.s_addr != laddr.s_addr)
549 				continue;
550 
551 			if (inp->inp_faddr.s_addr != INADDR_ANY &&
552 			    inp->inp_faddr.s_addr != faddr.s_addr)
553 				continue;
554 
555 			if (inp->inp_fport != 0 && inp->inp_fport != fport)
556 				continue;
557 
558 			if (cred == NULL ||
559 			    cred->cr_prison ==
560 			    inp->inp_socket->so_cred->cr_prison) {
561 				match = inp;
562 				break;
563 			}
564 		}
565 	}
566 	if (pcbinfo->porttoken)
567 		lwkt_reltoken(pcbinfo->porttoken);
568 	return (match);
569 }
570 
/*
 * Pick an anonymous local port for a pcb that already has a local
 * address and is going to be connected to the peer described by 'nam'.
 *
 * Candidate ports are tested with in_pcblookup_addrport(), which takes
 * the foreign address and port from 'nam' into account in addition to
 * the local address.
 *
 * inp - pcb to bind; inp_laddr must be set (asserted) and inp_lport must
 *       be 0, otherwise EINVAL is returned.
 * nam - peer address as a struct sockaddr_in; only read.
 * td  - calling thread; td->td_proc must be non-NULL (asserted) and
 *       supplies the credentials.
 *
 * Returns 0 on success, otherwise EADDRNOTAVAIL, EINVAL, EAGAIN, or the
 * error from priv_check_cred() for a low-port request.  Every failure
 * after the port token has been taken resets inp_laddr to INADDR_ANY.
 */
int
in_pcbconn_bind(struct inpcb *inp, const struct sockaddr *nam,
    struct thread *td)
{
	struct proc *p = td->td_proc;
	unsigned short *lastport;
	const struct sockaddr_in *sin = (const struct sockaddr_in *)nam;
	struct sockaddr_in jsin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct ucred *cred = NULL;
	u_short lport = 0;
	ushort first, last;
	int count, error, dup = 0;

	if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
		return (EADDRNOTAVAIL);

	KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY);
	if (inp->inp_lport != 0)
		return (EINVAL);	/* already bound */

	KKASSERT(p);
	cred = p->p_ucred;

	/*
	 * This has to be atomic.  If the porthash is shared across multiple
	 * protocol threads (aka tcp) then the token will be non-NULL.
	 */
	if (pcbinfo->porttoken)
		lwkt_gettoken(pcbinfo->porttoken);

	/*
	 * A zero return from prison_replace_wildcards() is treated as
	 * failure; on success the (possibly rewritten) address is copied
	 * back into the pcb.
	 */
	jsin.sin_family = AF_INET;
	jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
	if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		error = EINVAL;
		goto done;
	}
	inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;

	inp->inp_flags |= INP_ANONPORT;

	/*
	 * Select the range and the matching per-pcbinfo cursor that
	 * remembers the last port handed out from it.
	 */
	if (inp->inp_flags & INP_HIGHPORT) {
		first = ipport_hifirstauto;	/* sysctl */
		last  = ipport_hilastauto;
		lastport = &pcbinfo->lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		if (cred &&
		    (error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			goto done;
		}
		first = ipport_lowfirstauto;	/* 1023 */
		last  = ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->lastlow;
	} else {
		first = ipport_firstauto;	/* sysctl */
		last  = ipport_lastauto;
		lastport = &pcbinfo->lastport;
	}

again:
	/*
	 * Simple check to ensure all ports are not used up causing
	 * a deadlock here.
	 *
	 * We split the two cases (up and down) so that the direction
	 * is not being tested on each round of the loop.
	 */
	if (first > last) {
		/*
		 * counting down
		 */
		count = first - last;

		do {
			if (count-- < 0) {	/* completely used? */
				inp->inp_laddr.s_addr = INADDR_ANY;
				error = EADDRNOTAVAIL;
				goto done;
			}
			/*
			 * Step the cursor (host byte order) and wrap to
			 * 'first' when it leaves the range.
			 */
			--*lastport;
			if (*lastport > first || *lastport < last)
				*lastport = first;
			lport = htons(*lastport);
		} while (in_pcblookup_addrport(pcbinfo, inp->inp_laddr, lport,
				sin->sin_addr, sin->sin_port, cred));
	} else {
		/*
		 * counting up
		 */
		count = last - first;

		do {
			if (count-- < 0) {	/* completely used? */
				inp->inp_laddr.s_addr = INADDR_ANY;
				error = EADDRNOTAVAIL;
				goto done;
			}
			++*lastport;
			if (*lastport < first || *lastport > last)
				*lastport = first;
			lport = htons(*lastport);
		} while (in_pcblookup_addrport(pcbinfo, inp->inp_laddr, lport,
				sin->sin_addr, sin->sin_port, cred));
	}

	/* This could happen on loopback interface */
	/*
	 * The chosen local endpoint equals the peer endpoint.  Search once
	 * more (the cursor has already moved on); a second collision
	 * gives up.
	 */
	if (sin->sin_port == lport &&
	    sin->sin_addr.s_addr == inp->inp_laddr.s_addr) {
		if (dup) {
			/*
			 * Duplicate again; give up
			 */
			inp->inp_laddr.s_addr = INADDR_ANY;
			error = EADDRNOTAVAIL;
			goto done;
		}
		dup = 1;
		goto again;
	}
	inp->inp_lport = lport;

	/* Run the jail address check once more on the final address. */
	jsin.sin_family = AF_INET;
	jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
	if (!prison_replace_wildcards(td, (struct sockaddr*)&jsin)) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		error = EINVAL;
		goto done;
	}
	inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;

	/* Enter the pcb into the port hash; undo the bind on failure. */
	if (in_pcbinsporthash(inp) != 0) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		error = EAGAIN;
		goto done;
	}
	error = 0;
done:
	if (pcbinfo->porttoken)
		lwkt_reltoken(pcbinfo->porttoken);
	return error;
}
716 
/*
 *   Transform old in_pcbconnect() into an inner subroutine for new
 *   in_pcbconnect(): Do some validity-checking on the remote
 *   address (the sockaddr 'nam') and then determine local host address
 *   (i.e., which interface) to use to access that remote host.
 *
 *   This preserves definition of in_pcbconnect(), while supporting a
 *   slightly different version for T/TCP.  (This is more than
 *   a bit of a kludge, but cleaning up the internal interfaces would
 *   have forced minor changes in every protocol).
 *
 *   inp        - pcb being connected; its cached route (inp_route) may be
 *                released and re-acquired here.
 *   nam        - destination as a struct sockaddr_in.  Modified in place:
 *                INADDR_ANY is replaced by the primary local address and
 *                INADDR_BROADCAST by the primary interface's broadcast
 *                address (if that interface supports broadcast).
 *   plocal_sin - out: chosen source address.  Written only when 'find'
 *                is non-zero and the function succeeds.
 *   td         - calling thread, may be NULL; its process credentials,
 *                if any, decide whether jail restrictions apply.
 *   find       - non-zero to select a source address; zero to perform
 *                only the validation and destination rewriting.
 *
 *   Returns 0 on success, EINVAL for a wrong sockaddr length,
 *   EAFNOSUPPORT for a non-AF_INET family, and EADDRNOTAVAIL for a zero
 *   destination port or when no usable source address exists.
 */
int
in_pcbladdr_find(struct inpcb *inp, struct sockaddr *nam,
    struct sockaddr_in **plocal_sin, struct thread *td, int find)
{
	struct in_ifaddr *ia;
	struct ucred *cred = NULL;
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct sockaddr *jsin;
	int jailed = 0, alloc_route = 0;

	if (nam->sa_len != sizeof *sin)
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	if (td && td->td_proc && td->td_proc->p_ucred)
		cred = td->td_proc->p_ucred;
	if (cred && cred->cr_prison)
		jailed = 1;
	if (!TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) {
		ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr = IA_SIN(ia)->sin_addr;
		else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST &&
		    (ia->ia_ifp->if_flags & IFF_BROADCAST))
			sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
	}
	if (find) {
		struct route *ro;

		ia = NULL;
		/*
		 * If route is known or can be allocated now,
		 * our src addr is taken from the i/f, else punt.
		 * Note that we should check the address family of the cached
		 * destination, in case of sharing the cache with IPv6.
		 */
		ro = &inp->inp_route;
		/*
		 * Drop the cached route if it is down, is for another
		 * family or destination, or if the socket asked not to
		 * route at all.
		 */
		if (ro->ro_rt &&
		    (!(ro->ro_rt->rt_flags & RTF_UP) ||
		     ro->ro_dst.sa_family != AF_INET ||
		     satosin(&ro->ro_dst)->sin_addr.s_addr !=
				      sin->sin_addr.s_addr ||
		     inp->inp_socket->so_options & SO_DONTROUTE)) {
			RTFREE(ro->ro_rt);
			ro->ro_rt = NULL;
		}
		if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
		    (ro->ro_rt == NULL ||
		    ro->ro_rt->rt_ifp == NULL)) {
			/* No route yet, so try to acquire one */
			bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
			ro->ro_dst.sa_family = AF_INET;
			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
				sin->sin_addr;
			rtalloc(ro);
			/* Remembered so the fail path can undo it. */
			alloc_route = 1;
		}
		/*
		 * If we found a route, use the address
		 * corresponding to the outgoing interface
		 * unless it is the loopback (in case a route
		 * to our address on another net goes to loopback).
		 */
		if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
			if (jailed) {
				if (jailed_ip(cred->cr_prison,
				    ro->ro_rt->rt_ifa->ifa_addr)) {
					ia = ifatoia(ro->ro_rt->rt_ifa);
				}
			} else {
				ia = ifatoia(ro->ro_rt->rt_ifa);
			}
		}
		/*
		 * No address from the route.  Fall back, in order, to the
		 * interface whose destination address matches, the
		 * interface on the same network, and finally the primary
		 * address.  For jailed callers each candidate must pass
		 * jailed_ip() or it is discarded.
		 */
		if (ia == NULL) {
			u_short fport = sin->sin_port;

			/* Port is zeroed for the lookups, restored below. */
			sin->sin_port = 0;
			ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
			if (ia && jailed && !jailed_ip(cred->cr_prison,
			    sintosa(&ia->ia_addr)))
				ia = NULL;
			if (ia == NULL)
				ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
			if (ia && jailed && !jailed_ip(cred->cr_prison,
			    sintosa(&ia->ia_addr)))
				ia = NULL;
			sin->sin_port = fport;
			if (ia == NULL &&
			    !TAILQ_EMPTY(&in_ifaddrheads[mycpuid]))
				ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
			if (ia && jailed && !jailed_ip(cred->cr_prison,
			    sintosa(&ia->ia_addr)))
				ia = NULL;

			/*
			 * Jailed callers get one more chance further down
			 * (prison addresses); everyone else fails here.
			 */
			if (!jailed && ia == NULL)
				goto fail;
		}
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, use the
		 * address of that interface as our source address.
		 */
		if (!jailed && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				struct in_ifaddr_container *iac;

				ifp = imo->imo_multicast_ifp;
				ia = NULL;
				TAILQ_FOREACH(iac,
				&in_ifaddrheads[mycpuid], ia_link) {
					if (iac->ia->ia_ifp == ifp) {
						ia = iac->ia;
						break;
					}
				}
				if (ia == NULL)
					goto fail;
			}
		}
		/*
		 * Don't do pcblookup call here; return interface in plocal_sin
		 * and exit to caller, that will do the lookup.
		 */
		if (ia == NULL && jailed) {
			/*
			 * No interface address passed the jail check: use
			 * an address supplied by the prison itself,
			 * trying prison_get_nonlocal() before
			 * prison_get_local().
			 */
			if ((jsin = prison_get_nonlocal(cred->cr_prison, AF_INET, NULL)) != NULL ||
			    (jsin = prison_get_local(cred->cr_prison, AF_INET, NULL)) != NULL) {
				*plocal_sin = satosin(jsin);
			} else {
				/* IPv6 only Jail */
				goto fail;
			}
		} else {
			*plocal_sin = &ia->ia_addr;
		}
	}
	return (0);
fail:
	/* Release and clear the cached route only if it was set up here. */
	if (alloc_route) {
		struct route *ro = &inp->inp_route;

		if (ro->ro_rt != NULL)
			RTFREE(ro->ro_rt);
		bzero(ro, sizeof(*ro));
	}
	return (EADDRNOTAVAIL);
}
889 
890 int
891 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
892     struct sockaddr_in **plocal_sin, struct thread *td)
893 {
894 	return in_pcbladdr_find(inp, nam, plocal_sin, td,
895 	    (inp->inp_laddr.s_addr == INADDR_ANY));
896 }
897 
898 /*
899  * Outer subroutine:
900  * Connect from a socket to a specified address.
901  * Both address and port must be specified in argument sin.
902  * If don't have a local address for this socket yet,
903  * then pick one.
904  */
905 int
906 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
907 {
908 	struct sockaddr_in *if_sin;
909 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
910 	int error;
911 
912 	/* Call inner routine to assign local interface address. */
913 	if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0)
914 		return (error);
915 
916 	if (in_pcblookup_hash(inp->inp_cpcbinfo, sin->sin_addr, sin->sin_port,
917 			      inp->inp_laddr.s_addr ?
918 				inp->inp_laddr : if_sin->sin_addr,
919 			      inp->inp_lport, FALSE, NULL) != NULL) {
920 		return (EADDRINUSE);
921 	}
922 	if (inp->inp_laddr.s_addr == INADDR_ANY) {
923 		if (inp->inp_lport == 0) {
924 			error = in_pcbbind(inp, NULL, td);
925 			if (error)
926 				return (error);
927 		}
928 		inp->inp_laddr = if_sin->sin_addr;
929 	}
930 	inp->inp_faddr = sin->sin_addr;
931 	inp->inp_fport = sin->sin_port;
932 	in_pcbinsconnhash(inp);
933 	return (0);
934 }
935 
936 void
937 in_pcbdisconnect(struct inpcb *inp)
938 {
939 
940 	inp->inp_faddr.s_addr = INADDR_ANY;
941 	inp->inp_fport = 0;
942 	in_pcbremconnhash(inp);
943 	if (inp->inp_socket->so_state & SS_NOFDREF)
944 		in_pcbdetach(inp);
945 }
946 
947 void
948 in_pcbdetach(struct inpcb *inp)
949 {
950 	struct socket *so = inp->inp_socket;
951 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
952 
953 #ifdef IPSEC
954 	ipsec4_delete_pcbpolicy(inp);
955 #endif /*IPSEC*/
956 	inp->inp_gencnt = ++ipi->ipi_gencnt;
957 	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
958 	in_pcbremlists(inp);
959 	so->so_pcb = NULL;
960 	sofree(so);			/* remove pcb ref */
961 	if (inp->inp_options)
962 		m_free(inp->inp_options);
963 	if (inp->inp_route.ro_rt)
964 		rtfree(inp->inp_route.ro_rt);
965 	ip_freemoptions(inp->inp_moptions);
966 	inp->inp_vflag = 0;
967 	kfree(inp, M_PCB);
968 }
969 
970 /*
971  * The calling convention of in_setsockaddr() and in_setpeeraddr() was
972  * modified to match the pru_sockaddr() and pru_peeraddr() entry points
973  * in struct pr_usrreqs, so that protocols can just reference then directly
974  * without the need for a wrapper function.  The socket must have a valid
975  * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
976  * except through a kernel programming error, so it is acceptable to panic
977  * (or in this case trap) if the PCB is invalid.  (Actually, we don't trap
978  * because there actually /is/ a programming error somewhere... XXX)
979  */
980 int
981 in_setsockaddr(struct socket *so, struct sockaddr **nam)
982 {
983 	struct inpcb *inp;
984 	struct sockaddr_in *sin;
985 
986 	/*
987 	 * Do the malloc first in case it blocks.
988 	 */
989 	sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
990 	sin->sin_family = AF_INET;
991 	sin->sin_len = sizeof *sin;
992 
993 	crit_enter();
994 	inp = so->so_pcb;
995 	if (!inp) {
996 		crit_exit();
997 		kfree(sin, M_SONAME);
998 		return (ECONNRESET);
999 	}
1000 	sin->sin_port = inp->inp_lport;
1001 	sin->sin_addr = inp->inp_laddr;
1002 	crit_exit();
1003 
1004 	*nam = (struct sockaddr *)sin;
1005 	return (0);
1006 }
1007 
1008 void
1009 in_setsockaddr_dispatch(netmsg_t msg)
1010 {
1011 	int error;
1012 
1013 	error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1014 	lwkt_replymsg(&msg->lmsg, error);
1015 }
1016 
1017 int
1018 in_setpeeraddr(struct socket *so, struct sockaddr **nam)
1019 {
1020 	struct inpcb *inp;
1021 	struct sockaddr_in *sin;
1022 
1023 	/*
1024 	 * Do the malloc first in case it blocks.
1025 	 */
1026 	sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
1027 	sin->sin_family = AF_INET;
1028 	sin->sin_len = sizeof *sin;
1029 
1030 	crit_enter();
1031 	inp = so->so_pcb;
1032 	if (!inp) {
1033 		crit_exit();
1034 		kfree(sin, M_SONAME);
1035 		return (ECONNRESET);
1036 	}
1037 	sin->sin_port = inp->inp_fport;
1038 	sin->sin_addr = inp->inp_faddr;
1039 	crit_exit();
1040 
1041 	*nam = (struct sockaddr *)sin;
1042 	return (0);
1043 }
1044 
1045 void
1046 in_setpeeraddr_dispatch(netmsg_t msg)
1047 {
1048 	int error;
1049 
1050 	error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1051 	lwkt_replymsg(&msg->lmsg, error);
1052 }
1053 
1054 void
1055 in_pcbnotifyall(struct inpcbhead *head, struct in_addr faddr, int err,
1056 		void (*notify)(struct inpcb *, int))
1057 {
1058 	struct inpcb *inp, *ninp;
1059 
1060 	/*
1061 	 * note: if INP_PLACEMARKER is set we must ignore the rest of
1062 	 * the structure and skip it.
1063 	 */
1064 	crit_enter();
1065 	LIST_FOREACH_MUTABLE(inp, head, inp_list, ninp) {
1066 		if (inp->inp_flags & INP_PLACEMARKER)
1067 			continue;
1068 #ifdef INET6
1069 		if (!(inp->inp_vflag & INP_IPV4))
1070 			continue;
1071 #endif
1072 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
1073 		    inp->inp_socket == NULL)
1074 			continue;
1075 		(*notify)(inp, err);		/* can remove inp from list! */
1076 	}
1077 	crit_exit();
1078 }
1079 
1080 void
1081 in_pcbpurgeif0(struct inpcb *head, struct ifnet *ifp)
1082 {
1083 	struct inpcb *inp;
1084 	struct ip_moptions *imo;
1085 	int i, gap;
1086 
1087 	for (inp = head; inp != NULL; inp = LIST_NEXT(inp, inp_list)) {
1088 		if (inp->inp_flags & INP_PLACEMARKER)
1089 			continue;
1090 		imo = inp->inp_moptions;
1091 		if ((inp->inp_vflag & INP_IPV4) && imo != NULL) {
1092 			/*
1093 			 * Unselect the outgoing interface if it is being
1094 			 * detached.
1095 			 */
1096 			if (imo->imo_multicast_ifp == ifp)
1097 				imo->imo_multicast_ifp = NULL;
1098 
1099 			/*
1100 			 * Drop multicast group membership if we joined
1101 			 * through the interface being detached.
1102 			 */
1103 			for (i = 0, gap = 0; i < imo->imo_num_memberships;
1104 			    i++) {
1105 				if (imo->imo_membership[i]->inm_ifp == ifp) {
1106 					in_delmulti(imo->imo_membership[i]);
1107 					gap++;
1108 				} else if (gap != 0)
1109 					imo->imo_membership[i - gap] =
1110 					    imo->imo_membership[i];
1111 			}
1112 			imo->imo_num_memberships -= gap;
1113 		}
1114 	}
1115 }
1116 
1117 /*
1118  * Check for alternatives when higher level complains
1119  * about service problems.  For now, invalidate cached
1120  * routing information.  If the route was created dynamically
1121  * (by a redirect), time to try a default gateway again.
1122  */
1123 void
1124 in_losing(struct inpcb *inp)
1125 {
1126 	struct rtentry *rt;
1127 	struct rt_addrinfo rtinfo;
1128 
1129 	if ((rt = inp->inp_route.ro_rt)) {
1130 		bzero(&rtinfo, sizeof(struct rt_addrinfo));
1131 		rtinfo.rti_info[RTAX_DST] = rt_key(rt);
1132 		rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1133 		rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt);
1134 		rtinfo.rti_flags = rt->rt_flags;
1135 		rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0);
1136 		if (rt->rt_flags & RTF_DYNAMIC)
1137 			rtrequest1_global(RTM_DELETE, &rtinfo, NULL, NULL);
1138 		inp->inp_route.ro_rt = NULL;
1139 		rtfree(rt);
1140 		/*
1141 		 * A new route can be allocated
1142 		 * the next time output is attempted.
1143 		 */
1144 	}
1145 }
1146 
1147 /*
1148  * After a routing change, flush old routing
1149  * and allocate a (hopefully) better one.
1150  */
1151 void
1152 in_rtchange(struct inpcb *inp, int err)
1153 {
1154 	if (inp->inp_route.ro_rt) {
1155 		rtfree(inp->inp_route.ro_rt);
1156 		inp->inp_route.ro_rt = NULL;
1157 		/*
1158 		 * A new route can be allocated the next time
1159 		 * output is attempted.
1160 		 */
1161 	}
1162 }
1163 
1164 /*
1165  * Lookup a PCB based on the local address and port.
1166  */
1167 struct inpcb *
1168 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1169 		   u_int lport_arg, int wild_okay, struct ucred *cred)
1170 {
1171 	struct inpcb *inp;
1172 	int matchwild = 3, wildcard;
1173 	u_short lport = lport_arg;
1174 	struct inpcbporthead *porthash;
1175 	struct inpcbport *phd;
1176 	struct inpcb *match = NULL;
1177 
1178 	/*
1179 	 * If the porthashbase is shared across several cpus we need
1180 	 * to lock.
1181 	 */
1182 	if (pcbinfo->porttoken)
1183 		lwkt_gettoken(pcbinfo->porttoken);
1184 
1185 	/*
1186 	 * Best fit PCB lookup.
1187 	 *
1188 	 * First see if this local port is in use by looking on the
1189 	 * port hash list.
1190 	 */
1191 	porthash = &pcbinfo->porthashbase[
1192 			INP_PCBPORTHASH(lport, pcbinfo->porthashmask)];
1193 	LIST_FOREACH(phd, porthash, phd_hash) {
1194 		if (phd->phd_port == lport)
1195 			break;
1196 	}
1197 	if (phd != NULL) {
1198 		/*
1199 		 * Port is in use by one or more PCBs. Look for best
1200 		 * fit.
1201 		 */
1202 		LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1203 			wildcard = 0;
1204 #ifdef INET6
1205 			if ((inp->inp_vflag & INP_IPV4) == 0)
1206 				continue;
1207 #endif
1208 			if (inp->inp_faddr.s_addr != INADDR_ANY)
1209 				wildcard++;
1210 			if (inp->inp_laddr.s_addr != INADDR_ANY) {
1211 				if (laddr.s_addr == INADDR_ANY)
1212 					wildcard++;
1213 				else if (inp->inp_laddr.s_addr != laddr.s_addr)
1214 					continue;
1215 			} else {
1216 				if (laddr.s_addr != INADDR_ANY)
1217 					wildcard++;
1218 			}
1219 			if (wildcard && !wild_okay)
1220 				continue;
1221 			if (wildcard < matchwild &&
1222 			    (cred == NULL ||
1223 			     cred->cr_prison ==
1224 					inp->inp_socket->so_cred->cr_prison)) {
1225 				match = inp;
1226 				matchwild = wildcard;
1227 				if (matchwild == 0) {
1228 					break;
1229 				}
1230 			}
1231 		}
1232 	}
1233 	if (pcbinfo->porttoken)
1234 		lwkt_reltoken(pcbinfo->porttoken);
1235 	return (match);
1236 }
1237 
1238 static struct inpcb *
1239 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo,
1240     struct in_addr laddr, uint16_t lport, uint32_t pkt_hash)
1241 {
1242 	struct inpcb *local_wild = NULL;
1243 	const struct inp_localgrphead *hdr;
1244 	const struct inp_localgroup *grp;
1245 
1246 	hdr = &pcbinfo->localgrphashbase[
1247 	    INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)];
1248 	pkt_hash >>= ncpus2_shift;
1249 
1250 	/*
1251 	 * Order of socket selection:
1252 	 * 1. non-wild.
1253 	 * 2. wild.
1254 	 *
1255 	 * NOTE:
1256 	 * - Local group does not contain jailed sockets
1257 	 * - Local group does not contain IPv4 mapped INET6 wild sockets
1258 	 */
1259 	LIST_FOREACH(grp, hdr, il_list) {
1260 #ifdef INET6
1261 		if (!(grp->il_vflag & INP_IPV4))
1262 			continue;
1263 #endif
1264 		if (grp->il_lport == lport) {
1265 			int idx;
1266 
1267 			idx = pkt_hash / grp->il_factor;
1268 			KASSERT(idx < grp->il_inpcnt && idx >= 0,
1269 			    ("invalid hash %04x, cnt %d or fact %d",
1270 			     pkt_hash, grp->il_inpcnt, grp->il_factor));
1271 
1272 			if (grp->il_laddr.s_addr == laddr.s_addr)
1273 				return grp->il_inp[idx];
1274 			else if (grp->il_laddr.s_addr == INADDR_ANY)
1275 				local_wild = grp->il_inp[idx];
1276 		}
1277 	}
1278 	if (local_wild != NULL)
1279 		return local_wild;
1280 	return NULL;
1281 }
1282 
1283 /*
1284  * Lookup PCB in hash list.
1285  */
1286 struct inpcb *
1287 in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1288     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1289     boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m)
1290 {
1291 	struct inpcbhead *head;
1292 	struct inpcb *inp, *jinp=NULL;
1293 	u_short fport = fport_arg, lport = lport_arg;
1294 
1295 	/*
1296 	 * First look for an exact match.
1297 	 */
1298 	head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport,
1299 	    laddr.s_addr, lport, pcbinfo->hashmask)];
1300 	LIST_FOREACH(inp, head, inp_hash) {
1301 #ifdef INET6
1302 		if (!(inp->inp_vflag & INP_IPV4))
1303 			continue;
1304 #endif
1305 		if (in_hosteq(inp->inp_faddr, faddr) &&
1306 		    in_hosteq(inp->inp_laddr, laddr) &&
1307 		    inp->inp_fport == fport && inp->inp_lport == lport) {
1308 			/* found */
1309 			if (inp->inp_socket == NULL ||
1310 			    inp->inp_socket->so_cred->cr_prison == NULL) {
1311 				return (inp);
1312 			} else {
1313 				if  (jinp == NULL)
1314 					jinp = inp;
1315 			}
1316 		}
1317 	}
1318 	if (jinp != NULL)
1319 		return (jinp);
1320 	if (wildcard) {
1321 		struct inpcb *local_wild = NULL;
1322 		struct inpcb *jinp_wild = NULL;
1323 #ifdef INET6
1324 		struct inpcb *local_wild_mapped = NULL;
1325 #endif
1326 		struct inpcontainer *ic;
1327 		struct inpcontainerhead *chead;
1328 		struct sockaddr_in jsin;
1329 		struct ucred *cred;
1330 
1331 		/*
1332 		 * Check local group first
1333 		 */
1334 		if (pcbinfo->localgrphashbase != NULL &&
1335 		    m != NULL && (m->m_flags & M_HASH) &&
1336 		    !(ifp && ifp->if_type == IFT_FAITH)) {
1337 			inp = inp_localgroup_lookup(pcbinfo,
1338 			    laddr, lport, m->m_pkthdr.hash);
1339 			if (inp != NULL)
1340 				return inp;
1341 		}
1342 
1343 		/*
1344 		 * Order of socket selection:
1345 		 * 1. non-jailed, non-wild.
1346 		 * 2. non-jailed, wild.
1347 		 * 3. jailed, non-wild.
1348 		 * 4. jailed, wild.
1349 		 */
1350 		jsin.sin_family = AF_INET;
1351 		chead = &pcbinfo->wildcardhashbase[
1352 		    INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)];
1353 		LIST_FOREACH(ic, chead, ic_list) {
1354 			inp = ic->ic_inp;
1355 			jsin.sin_addr.s_addr = laddr.s_addr;
1356 #ifdef INET6
1357 			if (!(inp->inp_vflag & INP_IPV4))
1358 				continue;
1359 #endif
1360 			if (inp->inp_socket != NULL)
1361 				cred = inp->inp_socket->so_cred;
1362 			else
1363 				cred = NULL;
1364 			if (cred != NULL && jailed(cred)) {
1365 				if (jinp != NULL)
1366 					continue;
1367 				else
1368 					if (!jailed_ip(cred->cr_prison,
1369 					    (struct sockaddr *)&jsin))
1370 						continue;
1371 			}
1372 			if (inp->inp_lport == lport) {
1373 				if (ifp && ifp->if_type == IFT_FAITH &&
1374 				    !(inp->inp_flags & INP_FAITH))
1375 					continue;
1376 				if (inp->inp_laddr.s_addr == laddr.s_addr) {
1377 					if (cred != NULL && jailed(cred))
1378 						jinp = inp;
1379 					else
1380 						return (inp);
1381 				}
1382 				if (inp->inp_laddr.s_addr == INADDR_ANY) {
1383 #ifdef INET6
1384 					if (INP_CHECK_SOCKAF(inp->inp_socket,
1385 							     AF_INET6))
1386 						local_wild_mapped = inp;
1387 					else
1388 #endif
1389 						if (cred != NULL &&
1390 						    jailed(cred))
1391 							jinp_wild = inp;
1392 						else
1393 							local_wild = inp;
1394 				}
1395 			}
1396 		}
1397 		if (local_wild != NULL)
1398 			return (local_wild);
1399 #ifdef INET6
1400 		if (local_wild_mapped != NULL)
1401 			return (local_wild_mapped);
1402 #endif
1403 		if (jinp != NULL)
1404 			return (jinp);
1405 		return (jinp_wild);
1406 	}
1407 
1408 	/*
1409 	 * Not found.
1410 	 */
1411 	return (NULL);
1412 }
1413 
1414 struct inpcb *
1415 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1416     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1417     boolean_t wildcard, struct ifnet *ifp)
1418 {
1419 	return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg,
1420 	    laddr, lport_arg, wildcard, ifp, NULL);
1421 }
1422 
1423 /*
1424  * Insert PCB into connection hash table.
1425  */
1426 void
1427 in_pcbinsconnhash(struct inpcb *inp)
1428 {
1429 	struct inpcbinfo *pcbinfo = inp->inp_cpcbinfo;
1430 	struct inpcbhead *bucket;
1431 	u_int32_t hashkey_faddr, hashkey_laddr;
1432 
1433 #ifdef INET6
1434 	if (inp->inp_vflag & INP_IPV6) {
1435 		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */;
1436 		hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */;
1437 	} else {
1438 #endif
1439 		hashkey_faddr = inp->inp_faddr.s_addr;
1440 		hashkey_laddr = inp->inp_laddr.s_addr;
1441 #ifdef INET6
1442 	}
1443 #endif
1444 
1445 	KASSERT(!(inp->inp_flags & INP_WILDCARD),
1446 		("already on wildcardhash"));
1447 	KASSERT(!(inp->inp_flags & INP_CONNECTED),
1448 		("already on connhash"));
1449 	inp->inp_flags |= INP_CONNECTED;
1450 
1451 	/*
1452 	 * Insert into the connection hash table.
1453 	 */
1454 	bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr,
1455 	    inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)];
1456 	LIST_INSERT_HEAD(bucket, inp, inp_hash);
1457 }
1458 
1459 /*
1460  * Remove PCB from connection hash table.
1461  */
1462 void
1463 in_pcbremconnhash(struct inpcb *inp)
1464 {
1465 	KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected"));
1466 	LIST_REMOVE(inp, inp_hash);
1467 	inp->inp_flags &= ~INP_CONNECTED;
1468 }
1469 
1470 /*
1471  * Insert PCB into port hash table.
1472  */
1473 int
1474 in_pcbinsporthash(struct inpcb *inp)
1475 {
1476 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1477 	struct inpcbporthead *pcbporthash;
1478 	struct inpcbport *phd;
1479 
1480 	/*
1481 	 * If the porthashbase is shared across several cpus we need
1482 	 * to lock.
1483 	 */
1484 	if (pcbinfo->porttoken)
1485 		lwkt_gettoken(pcbinfo->porttoken);
1486 
1487 	/*
1488 	 * Insert into the port hash table.
1489 	 */
1490 	pcbporthash = &pcbinfo->porthashbase[
1491 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->porthashmask)];
1492 
1493 	/* Go through port list and look for a head for this lport. */
1494 	LIST_FOREACH(phd, pcbporthash, phd_hash) {
1495 		if (phd->phd_port == inp->inp_lport)
1496 			break;
1497 	}
1498 
1499 	/* If none exists, malloc one and tack it on. */
1500 	if (phd == NULL) {
1501 		KKASSERT(pcbinfo->portsave != NULL);
1502 		phd = pcbinfo->portsave;
1503 		pcbinfo->portsave = NULL;
1504 		phd->phd_port = inp->inp_lport;
1505 		LIST_INIT(&phd->phd_pcblist);
1506 		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1507 	}
1508 
1509 	inp->inp_phd = phd;
1510 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1511 
1512 	if (pcbinfo->porttoken)
1513 		lwkt_reltoken(pcbinfo->porttoken);
1514 	if (pcbinfo->portsave == NULL) {
1515 		pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave),
1516 					    M_PCB, M_INTWAIT | M_ZERO);
1517 	}
1518 	return (0);
1519 }
1520 
1521 static struct inp_localgroup *
1522 inp_localgroup_alloc(struct inp_localgrphead *hdr, u_char vflag,
1523     uint16_t port, const union in_dependaddr *addr, int size)
1524 {
1525 	struct inp_localgroup *grp;
1526 
1527 	grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]),
1528 	    M_TEMP, M_INTWAIT | M_ZERO);
1529 	grp->il_vflag = vflag;
1530 	grp->il_lport = port;
1531 	grp->il_dependladdr = *addr;
1532 	grp->il_inpsiz = size;
1533 
1534 	LIST_INSERT_HEAD(hdr, grp, il_list);
1535 
1536 	return grp;
1537 }
1538 
1539 static void
1540 inp_localgroup_free(struct inp_localgroup *grp)
1541 {
1542 	LIST_REMOVE(grp, il_list);
1543 	kfree(grp, M_TEMP);
1544 }
1545 
1546 static struct inp_localgroup *
1547 inp_localgroup_resize(struct inp_localgrphead *hdr,
1548     struct inp_localgroup *old_grp, int size)
1549 {
1550 	struct inp_localgroup *grp;
1551 	int i;
1552 
1553 	grp = inp_localgroup_alloc(hdr, old_grp->il_vflag,
1554 	    old_grp->il_lport, &old_grp->il_dependladdr, size);
1555 
1556 	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
1557 	    ("invalid new local group size %d and old local group count %d",
1558 	     grp->il_inpsiz, old_grp->il_inpcnt));
1559 	for (i = 0; i < old_grp->il_inpcnt; ++i)
1560 		grp->il_inp[i] = old_grp->il_inp[i];
1561 	grp->il_inpcnt = old_grp->il_inpcnt;
1562 	grp->il_factor = old_grp->il_factor;
1563 
1564 	inp_localgroup_free(old_grp);
1565 
1566 	return grp;
1567 }
1568 
1569 static void
1570 inp_localgroup_factor(struct inp_localgroup *grp)
1571 {
1572 	grp->il_factor =
1573 	    ((uint32_t)(0xffff >> ncpus2_shift) / grp->il_inpcnt) + 1;
1574 	KASSERT(grp->il_factor != 0, ("invalid local group factor, "
1575 	    "ncpus2_shift %d, inpcnt %d", ncpus2_shift, grp->il_inpcnt));
1576 }
1577 
1578 static void
1579 in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1580 {
1581 	struct inp_localgrphead *hdr;
1582 	struct inp_localgroup *grp;
1583 	struct ucred *cred;
1584 
1585 	if (pcbinfo->localgrphashbase == NULL)
1586 		return;
1587 
1588 	/*
1589 	 * XXX don't allow jailed socket to join local group
1590 	 */
1591 	if (inp->inp_socket != NULL)
1592 		cred = inp->inp_socket->so_cred;
1593 	else
1594 		cred = NULL;
1595 	if (cred != NULL && jailed(cred))
1596 		return;
1597 
1598 #ifdef INET6
1599 	/*
1600 	 * XXX don't allow IPv4 mapped INET6 wild socket
1601 	 */
1602 	if ((inp->inp_vflag & INP_IPV4) &&
1603 	    inp->inp_laddr.s_addr == INADDR_ANY &&
1604 	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6))
1605 		return;
1606 #endif
1607 
1608 	hdr = &pcbinfo->localgrphashbase[
1609 	    INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1610 
1611 	LIST_FOREACH(grp, hdr, il_list) {
1612 		if (grp->il_vflag == inp->inp_vflag &&
1613 		    grp->il_lport == inp->inp_lport &&
1614 		    memcmp(&grp->il_dependladdr,
1615 		        &inp->inp_inc.inc_ie.ie_dependladdr,
1616 		        sizeof(grp->il_dependladdr)) == 0) {
1617 			break;
1618 		}
1619 	}
1620 	if (grp == NULL) {
1621 		/* Create new local group */
1622 		grp = inp_localgroup_alloc(hdr, inp->inp_vflag,
1623 		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
1624 		    INP_LOCALGROUP_SIZMIN);
1625 	} else if (grp->il_inpcnt == grp->il_inpsiz) {
1626 		if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) {
1627 			static int limit_logged = 0;
1628 
1629 			if (!limit_logged) {
1630 				limit_logged = 1;
1631 				kprintf("local group port %d, "
1632 				    "limit reached\n", ntohs(grp->il_lport));
1633 			}
1634 			return;
1635 		}
1636 
1637 		/* Expand this local group */
1638 		grp = inp_localgroup_resize(hdr, grp, grp->il_inpsiz * 2);
1639 	}
1640 
1641 	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
1642 	    ("invalid local group size %d and count %d",
1643 	     grp->il_inpsiz, grp->il_inpcnt));
1644 	grp->il_inp[grp->il_inpcnt] = inp;
1645 	grp->il_inpcnt++;
1646 	inp_localgroup_factor(grp);
1647 }
1648 
1649 void
1650 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1651 {
1652 	struct inpcontainer *ic;
1653 	struct inpcontainerhead *bucket;
1654 
1655 	in_pcbinslocalgrphash_oncpu(inp, pcbinfo);
1656 
1657 	bucket = &pcbinfo->wildcardhashbase[
1658 	    INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
1659 
1660 	ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT);
1661 	ic->ic_inp = inp;
1662 	LIST_INSERT_HEAD(bucket, ic, ic_list);
1663 }
1664 
1665 /*
1666  * Insert PCB into wildcard hash table.
1667  */
1668 void
1669 in_pcbinswildcardhash(struct inpcb *inp)
1670 {
1671 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1672 
1673 	KASSERT(!(inp->inp_flags & INP_CONNECTED),
1674 		("already on connhash"));
1675 	KASSERT(!(inp->inp_flags & INP_WILDCARD),
1676 		("already on wildcardhash"));
1677 	inp->inp_flags |= INP_WILDCARD;
1678 
1679 	in_pcbinswildcardhash_oncpu(inp, pcbinfo);
1680 }
1681 
1682 static void
1683 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1684 {
1685 	struct inp_localgrphead *hdr;
1686 	struct inp_localgroup *grp;
1687 
1688 	if (pcbinfo->localgrphashbase == NULL)
1689 		return;
1690 
1691 	hdr = &pcbinfo->localgrphashbase[
1692 	    INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1693 
1694 	LIST_FOREACH(grp, hdr, il_list) {
1695 		int i;
1696 
1697 		for (i = 0; i < grp->il_inpcnt; ++i) {
1698 			if (grp->il_inp[i] != inp)
1699 				continue;
1700 
1701 			if (grp->il_inpcnt == 1) {
1702 				/* Free this local group */
1703 				inp_localgroup_free(grp);
1704 			} else {
1705 				/* Pull up inpcbs */
1706 				for (; i + 1 < grp->il_inpcnt; ++i)
1707 					grp->il_inp[i] = grp->il_inp[i + 1];
1708 				grp->il_inpcnt--;
1709 				inp_localgroup_factor(grp);
1710 
1711 				if (grp->il_inpsiz > INP_LOCALGROUP_SIZMIN &&
1712 				    grp->il_inpcnt <= (grp->il_inpsiz / 4)) {
1713 					/* Shrink this local group */
1714 					grp = inp_localgroup_resize(hdr, grp,
1715 					    grp->il_inpsiz / 2);
1716 				}
1717 			}
1718 			return;
1719 		}
1720 	}
1721 }
1722 
1723 void
1724 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1725 {
1726 	struct inpcontainer *ic;
1727 	struct inpcontainerhead *head;
1728 
1729 	in_pcbremlocalgrphash_oncpu(inp, pcbinfo);
1730 
1731 	/* find bucket */
1732 	head = &pcbinfo->wildcardhashbase[
1733 	    INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
1734 
1735 	LIST_FOREACH(ic, head, ic_list) {
1736 		if (ic->ic_inp == inp)
1737 			goto found;
1738 	}
1739 	return;			/* not found! */
1740 
1741 found:
1742 	LIST_REMOVE(ic, ic_list);	/* remove container from bucket chain */
1743 	kfree(ic, M_TEMP);		/* deallocate container */
1744 }
1745 
1746 /*
1747  * Remove PCB from wildcard hash table.
1748  */
1749 void
1750 in_pcbremwildcardhash(struct inpcb *inp)
1751 {
1752 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1753 
1754 	KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard"));
1755 	in_pcbremwildcardhash_oncpu(inp, pcbinfo);
1756 	inp->inp_flags &= ~INP_WILDCARD;
1757 }
1758 
1759 /*
1760  * Remove PCB from various lists.
1761  */
1762 void
1763 in_pcbremlists(struct inpcb *inp)
1764 {
1765 	struct inpcbinfo *pcbinfo;
1766 
1767 	if (inp->inp_lport) {
1768 		struct inpcbport *phd;
1769 
1770 		pcbinfo = inp->inp_pcbinfo;
1771 		if (pcbinfo->porttoken)
1772 			lwkt_gettoken(pcbinfo->porttoken);
1773 
1774 		phd = inp->inp_phd;
1775 		LIST_REMOVE(inp, inp_portlist);
1776 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1777 			LIST_REMOVE(phd, phd_hash);
1778 			kfree(phd, M_PCB);
1779 		}
1780 		if (pcbinfo->porttoken)
1781 			lwkt_reltoken(pcbinfo->porttoken);
1782 	}
1783 	if (inp->inp_flags & INP_WILDCARD) {
1784 		in_pcbremwildcardhash(inp);
1785 	} else if (inp->inp_flags & INP_CONNECTED) {
1786 		in_pcbremconnhash(inp);
1787 	}
1788 	LIST_REMOVE(inp, inp_list);
1789 	inp->inp_pcbinfo->ipi_count--;
1790 }
1791 
1792 int
1793 prison_xinpcb(struct thread *td, struct inpcb *inp)
1794 {
1795 	struct ucred *cr;
1796 
1797 	if (td->td_proc == NULL)
1798 		return (0);
1799 	cr = td->td_proc->p_ucred;
1800 	if (cr->cr_prison == NULL)
1801 		return (0);
1802 	if (inp->inp_socket && inp->inp_socket->so_cred &&
1803 	    inp->inp_socket->so_cred->cr_prison &&
1804 	    cr->cr_prison == inp->inp_socket->so_cred->cr_prison)
1805 		return (0);
1806 	return (1);
1807 }
1808 
1809 int
1810 in_pcblist_global(SYSCTL_HANDLER_ARGS)
1811 {
1812 	struct inpcbinfo *pcbinfo = arg1;
1813 	struct inpcb *inp, *marker;
1814 	struct xinpcb xi;
1815 	int error, i, n;
1816 
1817 	/*
1818 	 * The process of preparing the TCB list is too time-consuming and
1819 	 * resource-intensive to repeat twice on every request.
1820 	 */
1821 	if (req->oldptr == NULL) {
1822 		n = pcbinfo->ipi_count;
1823 		req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb);
1824 		return 0;
1825 	}
1826 
1827 	if (req->newptr != NULL)
1828 		return EPERM;
1829 
1830 	/*
1831 	 * OK, now we're committed to doing something.  Re-fetch ipi_count
1832 	 * after obtaining the generation count.
1833 	 */
1834 	n = pcbinfo->ipi_count;
1835 
1836 	marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO);
1837 	marker->inp_flags |= INP_PLACEMARKER;
1838 	LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
1839 
1840 	i = 0;
1841 	error = 0;
1842 
1843 	while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) {
1844 		LIST_REMOVE(marker, inp_list);
1845 		LIST_INSERT_AFTER(inp, marker, inp_list);
1846 
1847 		if (inp->inp_flags & INP_PLACEMARKER)
1848 			continue;
1849 		if (prison_xinpcb(req->td, inp))
1850 			continue;
1851 		bzero(&xi, sizeof xi);
1852 		xi.xi_len = sizeof xi;
1853 		bcopy(inp, &xi.xi_inp, sizeof *inp);
1854 		if (inp->inp_socket)
1855 			sotoxsocket(inp->inp_socket, &xi.xi_socket);
1856 		if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0)
1857 			break;
1858 		++i;
1859 	}
1860 	LIST_REMOVE(marker, inp_list);
1861 	if (error == 0 && i < n) {
1862 		bzero(&xi, sizeof xi);
1863 		xi.xi_len = sizeof xi;
1864 		while (i < n) {
1865 			error = SYSCTL_OUT(req, &xi, sizeof xi);
1866 			++i;
1867 		}
1868 	}
1869 	kfree(marker, M_TEMP);
1870 	return(error);
1871 }
1872 
1873 int
1874 in_pcblist_global_nomarker(SYSCTL_HANDLER_ARGS, struct xinpcb **xi0, int *nxi0)
1875 {
1876 	struct inpcbinfo *pcbinfo = arg1;
1877 	struct inpcb *inp;
1878 	struct xinpcb *xi;
1879 	int nxi;
1880 
1881 	*nxi0 = 0;
1882 	*xi0 = NULL;
1883 
1884 	/*
1885 	 * The process of preparing the PCB list is too time-consuming and
1886 	 * resource-intensive to repeat twice on every request.
1887 	 */
1888 	if (req->oldptr == NULL) {
1889 		int n = pcbinfo->ipi_count;
1890 
1891 		req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb);
1892 		return 0;
1893 	}
1894 
1895 	if (req->newptr != NULL)
1896 		return EPERM;
1897 
1898 	if (pcbinfo->ipi_count == 0)
1899 		return 0;
1900 
1901 	nxi = 0;
1902 	xi = kmalloc(pcbinfo->ipi_count * sizeof(*xi), M_TEMP,
1903 		     M_WAITOK | M_ZERO | M_NULLOK);
1904 	if (xi == NULL)
1905 		return ENOMEM;
1906 
1907 	LIST_FOREACH(inp, &pcbinfo->pcblisthead, inp_list) {
1908 		struct xinpcb *xi_ptr = &xi[nxi];
1909 
1910 		if (prison_xinpcb(req->td, inp))
1911 			continue;
1912 
1913 		xi_ptr->xi_len = sizeof(*xi_ptr);
1914 		bcopy(inp, &xi_ptr->xi_inp, sizeof(*inp));
1915 		if (inp->inp_socket)
1916 			sotoxsocket(inp->inp_socket, &xi_ptr->xi_socket);
1917 		++nxi;
1918 	}
1919 
1920 	if (nxi == 0) {
1921 		kfree(xi, M_TEMP);
1922 		return 0;
1923 	}
1924 
1925 	*nxi0 = nxi;
1926 	*xi0 = xi;
1927 
1928 	return 0;
1929 }
1930 
1931 void
1932 in_savefaddr(struct socket *so, const struct sockaddr *faddr)
1933 {
1934 	struct sockaddr_in *sin;
1935 
1936 	KASSERT(faddr->sa_family == AF_INET,
1937 	    ("not AF_INET faddr %d", faddr->sa_family));
1938 
1939 	sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
1940 	sin->sin_family = AF_INET;
1941 	sin->sin_len = sizeof(*sin);
1942 	sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port;
1943 	sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr;
1944 
1945 	so->so_faddr = (struct sockaddr *)sin;
1946 }
1947