xref: /dragonfly/sys/kern/uipc_usrreq.c (revision ce7a3582)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/kern/uipc_usrreq.c,v 1.54.2.10 2003/03/04 17:28:09 nectar Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/domain.h>
41 #include <sys/fcntl.h>
42 #include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
43 #include <sys/proc.h>
44 #include <sys/file.h>
45 #include <sys/filedesc.h>
46 #include <sys/mbuf.h>
47 #include <sys/nlookup.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/resourcevar.h>
52 #include <sys/stat.h>
53 #include <sys/mount.h>
54 #include <sys/sysctl.h>
55 #include <sys/un.h>
56 #include <sys/unpcb.h>
57 #include <sys/vnode.h>
58 
59 #include <sys/file2.h>
60 #include <sys/spinlock2.h>
61 #include <sys/socketvar2.h>
62 #include <sys/msgport2.h>
63 
64 typedef struct unp_defdiscard {
65 	struct unp_defdiscard *next;
66 	struct file *fp;
67 } *unp_defdiscard_t;
68 
69 static	MALLOC_DEFINE(M_UNPCB, "unpcb", "unpcb struct");
70 static	unp_gen_t unp_gencnt;
71 static	u_int unp_count;
72 
73 static	struct unp_head unp_shead, unp_dhead;
74 
75 static struct lwkt_token unp_token = LWKT_TOKEN_INITIALIZER(unp_token);
76 static int unp_defdiscard_nest;
77 static unp_defdiscard_t unp_defdiscard_base;
78 
79 /*
80  * Unix communications domain.
81  *
82  * TODO:
83  *	RDM
84  *	rethink name space problems
85  *	need a proper out-of-band
86  *	lock pushdown
87  */
88 static struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
89 static ino_t	unp_ino = 1;		/* prototype for fake inode numbers */
90 static struct spinlock unp_ino_spin = SPINLOCK_INITIALIZER(&unp_ino_spin);
91 
92 static int     unp_attach (struct socket *, struct pru_attach_info *);
93 static void    unp_detach (struct unpcb *);
94 static int     unp_bind (struct unpcb *,struct sockaddr *, struct thread *);
95 static int     unp_connect (struct socket *,struct sockaddr *,
96 				struct thread *);
97 static void    unp_disconnect (struct unpcb *);
98 static void    unp_shutdown (struct unpcb *);
99 static void    unp_drop (struct unpcb *, int);
100 static void    unp_gc (void);
101 static int     unp_gc_clearmarks(struct file *, void *);
102 static int     unp_gc_checkmarks(struct file *, void *);
103 static int     unp_gc_checkrefs(struct file *, void *);
104 static int     unp_revoke_gc_check(struct file *, void *);
105 static void    unp_scan (struct mbuf *, void (*)(struct file *, void *),
106 				void *data);
107 static void    unp_mark (struct file *, void *data);
108 static void    unp_discard (struct file *, void *);
109 static int     unp_internalize (struct mbuf *, struct thread *);
110 static int     unp_listen (struct unpcb *, struct thread *);
111 static void    unp_fp_externalize(struct lwp *lp, struct file *fp, int fd);
112 
113 /*
114  * NOTE:
115  * Since unp_token is automatically released upon execution of
116  * blocking code, we need to reference unp_conn before any possibly
117  * blocking code to prevent it from being ripped out behind our back.
118  */
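
/*
 * A sketch (illustrative only) of the pattern described above, as it
 * appears in the handlers below:
 *
 *	unp2 = unp->unp_conn;
 *	unp_reference(unp2);	(pin the peer before blocking)
 *	sowwakeup(so2);		(may block and release unp_token)
 *	unp_free(unp2);		(drop the pin afterwards)
 */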
119 
120 /* NOTE: unp_token MUST be held */
121 static __inline void
122 unp_reference(struct unpcb *unp)
123 {
124 	atomic_add_int(&unp->unp_refcnt, 1);
125 }
126 
127 /* NOTE: unp_token MUST be held */
128 static __inline void
129 unp_free(struct unpcb *unp)
130 {
131 	KKASSERT(unp->unp_refcnt > 0);
132 	if (atomic_fetchadd_int(&unp->unp_refcnt, -1) == 1)
133 		unp_detach(unp);
134 }
135 
136 /*
137  * NOTE: (so) is referenced from soabort*(), and netmsg_pru_abort()
138  *	 will sofree() it when we return.
139  */
140 static void
141 uipc_abort(netmsg_t msg)
142 {
143 	struct unpcb *unp;
144 	int error;
145 
146 	lwkt_gettoken(&unp_token);
147 	unp = msg->base.nm_so->so_pcb;
148 	if (unp) {
149 		unp_drop(unp, ECONNABORTED);
150 		unp_free(unp);
151 		error = 0;
152 	} else {
153 		error = EINVAL;
154 	}
155 	lwkt_reltoken(&unp_token);
156 
157 	lwkt_replymsg(&msg->lmsg, error);
158 }
159 
160 static void
161 uipc_accept(netmsg_t msg)
162 {
163 	struct unpcb *unp;
164 	int error;
165 
166 	lwkt_gettoken(&unp_token);
167 	unp = msg->base.nm_so->so_pcb;
168 	if (unp == NULL) {
169 		error = EINVAL;
170 	} else {
171 		struct unpcb *unp2 = unp->unp_conn;
172 
173 		/*
174 		 * Pass back name of connected socket,
175 		 * if it was bound and we are still connected
176 		 * (our peer may have closed already!).
177 		 */
178 		if (unp2 && unp2->unp_addr) {
179 			unp_reference(unp2);
180 			*msg->accept.nm_nam = dup_sockaddr(
181 				(struct sockaddr *)unp2->unp_addr);
182 			unp_free(unp2);
183 		} else {
184 			*msg->accept.nm_nam = dup_sockaddr(
185 				(struct sockaddr *)&sun_noname);
186 		}
187 		error = 0;
188 	}
189 	lwkt_reltoken(&unp_token);
190 	lwkt_replymsg(&msg->lmsg, error);
191 }
192 
193 static void
194 uipc_attach(netmsg_t msg)
195 {
196 	struct unpcb *unp;
197 	int error;
198 
199 	lwkt_gettoken(&unp_token);
200 	unp = msg->base.nm_so->so_pcb;
201 	if (unp)
202 		error = EISCONN;
203 	else
204 		error = unp_attach(msg->base.nm_so, msg->attach.nm_ai);
205 	lwkt_reltoken(&unp_token);
206 	lwkt_replymsg(&msg->lmsg, error);
207 }
208 
209 static void
210 uipc_bind(netmsg_t msg)
211 {
212 	struct unpcb *unp;
213 	int error;
214 
215 	lwkt_gettoken(&unp_token);
216 	unp = msg->base.nm_so->so_pcb;
217 	if (unp)
218 		error = unp_bind(unp, msg->bind.nm_nam, msg->bind.nm_td);
219 	else
220 		error = EINVAL;
221 	lwkt_reltoken(&unp_token);
222 	lwkt_replymsg(&msg->lmsg, error);
223 }
224 
225 static void
226 uipc_connect(netmsg_t msg)
227 {
228 	struct unpcb *unp;
229 	int error;
230 
231 	lwkt_gettoken(&unp_token);
232 	unp = msg->base.nm_so->so_pcb;
233 	if (unp) {
234 		error = unp_connect(msg->base.nm_so,
235 				    msg->connect.nm_nam,
236 				    msg->connect.nm_td);
237 	} else {
238 		error = EINVAL;
239 	}
240 	lwkt_reltoken(&unp_token);
241 	lwkt_replymsg(&msg->lmsg, error);
242 }
243 
244 static void
245 uipc_connect2(netmsg_t msg)
246 {
247 	struct unpcb *unp;
248 	int error;
249 
250 	lwkt_gettoken(&unp_token);
251 	unp = msg->connect2.nm_so1->so_pcb;
252 	if (unp) {
253 		error = unp_connect2(msg->connect2.nm_so1,
254 				     msg->connect2.nm_so2);
255 	} else {
256 		error = EINVAL;
257 	}
258 	lwkt_reltoken(&unp_token);
259 	lwkt_replymsg(&msg->lmsg, error);
260 }
261 
262 /* control is EOPNOTSUPP */
263 
264 static void
265 uipc_detach(netmsg_t msg)
266 {
267 	struct unpcb *unp;
268 	int error;
269 
270 	lwkt_gettoken(&unp_token);
271 	unp = msg->base.nm_so->so_pcb;
272 	if (unp) {
273 		unp_free(unp);
274 		error = 0;
275 	} else {
276 		error = EINVAL;
277 	}
278 	lwkt_reltoken(&unp_token);
279 	lwkt_replymsg(&msg->lmsg, error);
280 }
281 
282 static void
283 uipc_disconnect(netmsg_t msg)
284 {
285 	struct unpcb *unp;
286 	int error;
287 
288 	lwkt_gettoken(&unp_token);
289 	unp = msg->base.nm_so->so_pcb;
290 	if (unp) {
291 		unp_disconnect(unp);
292 		error = 0;
293 	} else {
294 		error = EINVAL;
295 	}
296 	lwkt_reltoken(&unp_token);
297 	lwkt_replymsg(&msg->lmsg, error);
298 }
299 
300 static void
301 uipc_listen(netmsg_t msg)
302 {
303 	struct unpcb *unp;
304 	int error;
305 
306 	lwkt_gettoken(&unp_token);
307 	unp = msg->base.nm_so->so_pcb;
308 	if (unp == NULL || unp->unp_vnode == NULL)
309 		error = EINVAL;
310 	else
311 		error = unp_listen(unp, msg->listen.nm_td);
312 	lwkt_reltoken(&unp_token);
313 	lwkt_replymsg(&msg->lmsg, error);
314 }
315 
316 static void
317 uipc_peeraddr(netmsg_t msg)
318 {
319 	struct unpcb *unp;
320 	int error;
321 
322 	lwkt_gettoken(&unp_token);
323 	unp = msg->base.nm_so->so_pcb;
324 	if (unp == NULL) {
325 		error = EINVAL;
326 	} else if (unp->unp_conn && unp->unp_conn->unp_addr) {
327 		struct unpcb *unp2 = unp->unp_conn;
328 
329 		unp_reference(unp2);
330 		*msg->peeraddr.nm_nam = dup_sockaddr(
331 				(struct sockaddr *)unp2->unp_addr);
332 		unp_free(unp2);
333 		error = 0;
334 	} else {
335 		/*
336 		 * XXX: It seems that this test always fails even when
337 		 * the connection is established, so this else clause was
338 		 * added as a workaround to return a PF_LOCAL sockaddr.
339 		 */
340 		*msg->peeraddr.nm_nam = dup_sockaddr(
341 				(struct sockaddr *)&sun_noname);
342 		error = 0;
343 	}
344 	lwkt_reltoken(&unp_token);
345 	lwkt_replymsg(&msg->lmsg, error);
346 }
347 
348 static void
349 uipc_rcvd(netmsg_t msg)
350 {
351 	struct unpcb *unp, *unp2;
352 	struct socket *so;
353 	struct socket *so2;
354 	int error;
355 
356 	lwkt_gettoken(&unp_token);
357 	so = msg->base.nm_so;
358 	unp = so->so_pcb;
359 	if (unp == NULL) {
360 		error = EINVAL;
361 		goto done;
362 	}
363 
364 	switch (so->so_type) {
365 	case SOCK_DGRAM:
366 		panic("uipc_rcvd DGRAM?");
367 		/*NOTREACHED*/
368 	case SOCK_STREAM:
369 	case SOCK_SEQPACKET:
370 		if (unp->unp_conn == NULL)
371 			break;
372 		unp2 = unp->unp_conn;
373 
374 		/*
375 		 * Because we transfer mbufs directly to the peer socket
376 		 * we have to use SSB_STOP on the sender to prevent it
377 		 * from building up an unbounded number of mbufs.
378 		 */
379 		so2 = unp2->unp_socket;
380 		if (so->so_rcv.ssb_cc < so2->so_snd.ssb_hiwat &&
381 		    so->so_rcv.ssb_mbcnt < so2->so_snd.ssb_mbmax
382 		) {
383 			atomic_clear_int(&so2->so_snd.ssb_flags, SSB_STOP);
384 
385 			unp_reference(unp2);
386 			sowwakeup(so2);
387 			unp_free(unp2);
388 		}
389 		break;
390 	default:
391 		panic("uipc_rcvd unknown socktype");
392 		/*NOTREACHED*/
393 	}
394 	error = 0;
395 done:
396 	lwkt_reltoken(&unp_token);
397 	lwkt_replymsg(&msg->lmsg, error);
398 }
399 
400 /* pru_rcvoob is EOPNOTSUPP */
401 
402 static void
403 uipc_send(netmsg_t msg)
404 {
405 	struct unpcb *unp, *unp2;
406 	struct socket *so;
407 	struct socket *so2;
408 	struct mbuf *control;
409 	struct mbuf *m;
410 	int error = 0;
411 
412 	lwkt_gettoken(&unp_token);
413 	so = msg->base.nm_so;
414 	control = msg->send.nm_control;
415 	m = msg->send.nm_m;
416 	unp = so->so_pcb;
417 
418 	if (unp == NULL) {
419 		error = EINVAL;
420 		goto release;
421 	}
422 	if (msg->send.nm_flags & PRUS_OOB) {
423 		error = EOPNOTSUPP;
424 		goto release;
425 	}
426 
427 	if (control && (error = unp_internalize(control, msg->send.nm_td)))
428 		goto release;
429 
430 	switch (so->so_type) {
431 	case SOCK_DGRAM:
432 	{
433 		struct sockaddr *from;
434 
435 		if (msg->send.nm_addr) {
436 			if (unp->unp_conn) {
437 				error = EISCONN;
438 				break;
439 			}
440 			error = unp_connect(so,
441 					    msg->send.nm_addr,
442 					    msg->send.nm_td);
443 			if (error)
444 				break;
445 		} else {
446 			if (unp->unp_conn == NULL) {
447 				error = ENOTCONN;
448 				break;
449 			}
450 		}
451 		unp2 = unp->unp_conn;
452 		so2 = unp2->unp_socket;
453 		if (unp->unp_addr)
454 			from = (struct sockaddr *)unp->unp_addr;
455 		else
456 			from = &sun_noname;
457 
458 		unp_reference(unp2);
459 
460 		lwkt_gettoken(&so2->so_rcv.ssb_token);
461 		if (ssb_appendaddr(&so2->so_rcv, from, m, control)) {
462 			sorwakeup(so2);
463 			m = NULL;
464 			control = NULL;
465 		} else {
466 			error = ENOBUFS;
467 		}
468 		if (msg->send.nm_addr)
469 			unp_disconnect(unp);
470 		lwkt_reltoken(&so2->so_rcv.ssb_token);
471 
472 		unp_free(unp2);
473 		break;
474 	}
475 
476 	case SOCK_STREAM:
477 	case SOCK_SEQPACKET:
478 		/*
479 		 * Connect if not connected yet.  Note: a better
480 		 * implementation would complain if the address is
481 		 * not equal to the peer's address.
482 		 */
483 		if (!(so->so_state & SS_ISCONNECTED)) {
484 			if (msg->send.nm_addr) {
485 				error = unp_connect(so,
486 						    msg->send.nm_addr,
487 						    msg->send.nm_td);
488 				if (error)
489 					break;	/* XXX */
490 			} else {
491 				error = ENOTCONN;
492 				break;
493 			}
494 		}
495 
496 		if (so->so_state & SS_CANTSENDMORE) {
497 			error = EPIPE;
498 			break;
499 		}
500 		if (unp->unp_conn == NULL)
501 			panic("uipc_send connected but no connection?");
502 		unp2 = unp->unp_conn;
503 		so2 = unp2->unp_socket;
504 
505 		unp_reference(unp2);
506 
507 		/*
508 		 * Send to paired receive port, and then reduce
509 		 * send buffer hiwater marks to maintain backpressure.
510 		 * Wake up readers.
511 		 */
512 		lwkt_gettoken(&so2->so_rcv.ssb_token);
513 		if (control) {
514 			if (ssb_appendcontrol(&so2->so_rcv, m, control)) {
515 				control = NULL;
516 				m = NULL;
517 			}
518 		} else if (so->so_type == SOCK_SEQPACKET) {
519 			sbappendrecord(&so2->so_rcv.sb, m);
520 			m = NULL;
521 		} else {
522 			sbappend(&so2->so_rcv.sb, m);
523 			m = NULL;
524 		}
525 
526 		/*
527 		 * Because we transfer mbufs directly to the peer socket
528 		 * we have to use SSB_STOP on the sender to prevent it
529 		 * from building up an unbounded number of mbufs.
530 		 */
531 		if (so2->so_rcv.ssb_cc >= so->so_snd.ssb_hiwat ||
532 		    so2->so_rcv.ssb_mbcnt >= so->so_snd.ssb_mbmax
533 		) {
534 			atomic_set_int(&so->so_snd.ssb_flags, SSB_STOP);
535 		}
536 		lwkt_reltoken(&so2->so_rcv.ssb_token);
537 		sorwakeup(so2);
538 
539 		unp_free(unp2);
540 		break;
541 
542 	default:
543 		panic("uipc_send unknown socktype");
544 	}
545 
546 	/*
547 	 * SEND_EOF is equivalent to a SEND followed by a SHUTDOWN.
548 	 */
549 	if (msg->send.nm_flags & PRUS_EOF) {
550 		socantsendmore(so);
551 		unp_shutdown(unp);
552 	}
553 
554 	if (control && error != 0)
555 		unp_dispose(control);
556 
557 release:
558 	lwkt_reltoken(&unp_token);
559 
560 	if (control)
561 		m_freem(control);
562 	if (m)
563 		m_freem(m);
564 	lwkt_replymsg(&msg->lmsg, error);
565 }
566 
567 /*
568  * MPSAFE
569  */
570 static void
571 uipc_sense(netmsg_t msg)
572 {
573 	struct unpcb *unp;
574 	struct socket *so;
575 	struct stat *sb;
576 	int error;
577 
578 	lwkt_gettoken(&unp_token);
579 	so = msg->base.nm_so;
580 	sb = msg->sense.nm_stat;
581 	unp = so->so_pcb;
582 	if (unp == NULL) {
583 		error = EINVAL;
584 		goto done;
585 	}
586 	sb->st_blksize = so->so_snd.ssb_hiwat;
587 	sb->st_dev = NOUDEV;
588 	if (unp->unp_ino == 0) {	/* make up a non-zero inode number */
589 		spin_lock(&unp_ino_spin);
590 		unp->unp_ino = unp_ino++;
591 		spin_unlock(&unp_ino_spin);
592 	}
593 	sb->st_ino = unp->unp_ino;
594 	error = 0;
595 done:
596 	lwkt_reltoken(&unp_token);
597 	lwkt_replymsg(&msg->lmsg, error);
598 }
599 
600 static void
601 uipc_shutdown(netmsg_t msg)
602 {
603 	struct socket *so;
604 	struct unpcb *unp;
605 	int error;
606 
607 	lwkt_gettoken(&unp_token);
608 	so = msg->base.nm_so;
609 	unp = so->so_pcb;
610 	if (unp) {
611 		socantsendmore(so);
612 		unp_shutdown(unp);
613 		error = 0;
614 	} else {
615 		error = EINVAL;
616 	}
617 	lwkt_reltoken(&unp_token);
618 	lwkt_replymsg(&msg->lmsg, error);
619 }
620 
621 static void
622 uipc_sockaddr(netmsg_t msg)
623 {
624 	struct unpcb *unp;
625 	int error;
626 
627 	lwkt_gettoken(&unp_token);
628 	unp = msg->base.nm_so->so_pcb;
629 	if (unp) {
630 		if (unp->unp_addr) {
631 			*msg->sockaddr.nm_nam =
632 				dup_sockaddr((struct sockaddr *)unp->unp_addr);
633 		}
634 		error = 0;
635 	} else {
636 		error = EINVAL;
637 	}
638 	lwkt_reltoken(&unp_token);
639 	lwkt_replymsg(&msg->lmsg, error);
640 }
641 
642 struct pr_usrreqs uipc_usrreqs = {
643 	.pru_abort = uipc_abort,
644 	.pru_accept = uipc_accept,
645 	.pru_attach = uipc_attach,
646 	.pru_bind = uipc_bind,
647 	.pru_connect = uipc_connect,
648 	.pru_connect2 = uipc_connect2,
649 	.pru_control = pr_generic_notsupp,
650 	.pru_detach = uipc_detach,
651 	.pru_disconnect = uipc_disconnect,
652 	.pru_listen = uipc_listen,
653 	.pru_peeraddr = uipc_peeraddr,
654 	.pru_rcvd = uipc_rcvd,
655 	.pru_rcvoob = pr_generic_notsupp,
656 	.pru_send = uipc_send,
657 	.pru_sense = uipc_sense,
658 	.pru_shutdown = uipc_shutdown,
659 	.pru_sockaddr = uipc_sockaddr,
660 	.pru_sosend = sosend,
661 	.pru_soreceive = soreceive
662 };
663 
664 void
665 uipc_ctloutput(netmsg_t msg)
666 {
667 	struct socket *so;
668 	struct sockopt *sopt;
669 	struct unpcb *unp;
670 	int error = 0;
671 
672 	lwkt_gettoken(&unp_token);
673 	so = msg->base.nm_so;
674 	sopt = msg->ctloutput.nm_sopt;
675 	unp = so->so_pcb;
676 
677 	switch (sopt->sopt_dir) {
678 	case SOPT_GET:
679 		switch (sopt->sopt_name) {
680 		case LOCAL_PEERCRED:
681 			if (unp->unp_flags & UNP_HAVEPC)
682 				soopt_from_kbuf(sopt, &unp->unp_peercred,
683 						sizeof(unp->unp_peercred));
684 			else {
685 				if (so->so_type == SOCK_STREAM)
686 					error = ENOTCONN;
687 				else if (so->so_type == SOCK_SEQPACKET)
688 					error = ENOTCONN;
689 				else
690 					error = EINVAL;
691 			}
692 			break;
693 		default:
694 			error = EOPNOTSUPP;
695 			break;
696 		}
697 		break;
698 	case SOPT_SET:
699 	default:
700 		error = EOPNOTSUPP;
701 		break;
702 	}
703 	lwkt_reltoken(&unp_token);
704 	lwkt_replymsg(&msg->lmsg, error);
705 }
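
/*
 * For reference, a minimal userland sketch of consuming LOCAL_PEERCRED
 * (illustrative; assumes the usual BSD struct xucred from <sys/ucred.h>,
 * with level 0 being the historical spelling of SOL_LOCAL):
 *
 *	struct xucred xuc;
 *	socklen_t len = sizeof(xuc);
 *
 *	if (getsockopt(s, 0, LOCAL_PEERCRED, &xuc, &len) == 0)
 *		printf("peer euid %d\n", (int)xuc.cr_uid);
 */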
706 
707 /*
708  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
709  * for stream sockets, although the total for sender and receiver is
710  * actually only PIPSIZ.
711  *
712  * Datagram sockets really use the sendspace as the maximum datagram size,
713  * and don't really want to reserve the sendspace.  Their recvspace should
714  * be large enough for at least one max-size datagram plus address.
715  *
716  * We want the local send/recv space to be significantly larger than
717  * lo0's MTU of 16384.
718  */
719 #ifndef PIPSIZ
720 #define	PIPSIZ	57344
721 #endif
722 static u_long	unpst_sendspace = PIPSIZ;
723 static u_long	unpst_recvspace = PIPSIZ;
724 static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
725 static u_long	unpdg_recvspace = 4*1024;
726 
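/*
 * Usage note (an observation based on the generic sosend() path, not a
 * guarantee made by this file): unpdg_sendspace becomes a datagram
 * socket's send hiwat, so a single datagram larger than this limit
 * (tunable via net.local.dgram.maxdgram) should fail with EMSGSIZE.
 */
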
727 static int	unp_rights;			/* file descriptors in flight */
728 static struct spinlock unp_spin = SPINLOCK_INITIALIZER(&unp_spin);
729 
730 SYSCTL_DECL(_net_local_seqpacket);
731 SYSCTL_DECL(_net_local_stream);
732 SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
733     &unpst_sendspace, 0, "Size of stream socket send buffer");
734 SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
735     &unpst_recvspace, 0, "Size of stream socket receive buffer");
736 
737 SYSCTL_DECL(_net_local_dgram);
738 SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
739     &unpdg_sendspace, 0, "Max datagram socket size");
740 SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
741     &unpdg_recvspace, 0, "Size of datagram socket receive buffer");
742 
743 SYSCTL_DECL(_net_local);
744 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
745    "File descriptors in flight");
746 
747 static int
748 unp_attach(struct socket *so, struct pru_attach_info *ai)
749 {
750 	struct unpcb *unp;
751 	int error;
752 
753 	lwkt_gettoken(&unp_token);
754 
755 	if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) {
756 		switch (so->so_type) {
757 
758 		case SOCK_STREAM:
759 		case SOCK_SEQPACKET:
760 			error = soreserve(so, unpst_sendspace, unpst_recvspace,
761 					  ai->sb_rlimit);
762 			break;
763 
764 		case SOCK_DGRAM:
765 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace,
766 					  ai->sb_rlimit);
767 			break;
768 
769 		default:
770 			panic("unp_attach");
771 		}
772 		if (error)
773 			goto failed;
774 	}
775 	unp = kmalloc(sizeof(*unp), M_UNPCB, M_WAITOK | M_ZERO | M_NULLOK);
776 	if (unp == NULL) {
777 		error = ENOBUFS;
778 		goto failed;
779 	}
780 	unp->unp_refcnt = 1;
781 	unp->unp_gencnt = ++unp_gencnt;
782 	unp_count++;
783 	LIST_INIT(&unp->unp_refs);
784 	unp->unp_socket = so;
785 	unp->unp_rvnode = ai->fd_rdir;		/* jail cruft XXX JH */
786 	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
787 			 : &unp_shead, unp, unp_link);
788 	so->so_pcb = (caddr_t)unp;
789 	soreference(so);
790 	error = 0;
791 failed:
792 	lwkt_reltoken(&unp_token);
793 	return error;
794 }
795 
796 static void
797 unp_detach(struct unpcb *unp)
798 {
799 	struct socket *so;
800 
801 	lwkt_gettoken(&unp_token);
802 
803 	LIST_REMOVE(unp, unp_link);
804 	unp->unp_gencnt = ++unp_gencnt;
805 	--unp_count;
806 	if (unp->unp_vnode) {
807 		unp->unp_vnode->v_socket = NULL;
808 		vrele(unp->unp_vnode);
809 		unp->unp_vnode = NULL;
810 	}
811 	if (unp->unp_conn)
812 		unp_disconnect(unp);
813 	while (!LIST_EMPTY(&unp->unp_refs))
814 		unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET);
815 	soisdisconnected(unp->unp_socket);
816 	so = unp->unp_socket;
817 	soreference(so);	/* for delayed sorflush */
818 	so->so_pcb = NULL;
819 	unp->unp_socket = NULL;
820 	sofree(so);		/* remove pcb ref */
821 
822 	if (unp_rights) {
823 		/*
824 		 * Normally the receive buffer is flushed later, in
825 		 * sofree(), but if our receive buffer holds references
826 		 * to descriptors that are now garbage we would dispose
827 		 * of those references only after the garbage collector
828 		 * got to them, resulting in a "panic: closef: count < 0".
829 		 */
830 		sorflush(so);
831 		unp_gc();
832 	}
833 	sofree(so);
834 	lwkt_reltoken(&unp_token);
835 
836 	if (unp->unp_addr)
837 		kfree(unp->unp_addr, M_SONAME);
838 	kfree(unp, M_UNPCB);
839 }
840 
841 static int
842 unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
843 {
844 	struct proc *p = td->td_proc;
845 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
846 	struct vnode *vp;
847 	struct vattr vattr;
848 	int error, namelen;
849 	struct nlookupdata nd;
850 	char buf[SOCK_MAXADDRLEN];
851 
852 	lwkt_gettoken(&unp_token);
853 	if (unp->unp_vnode != NULL) {
854 		error = EINVAL;
855 		goto failed;
856 	}
857 	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
858 	if (namelen <= 0) {
859 		error = EINVAL;
860 		goto failed;
861 	}
862 	strncpy(buf, soun->sun_path, namelen);
863 	buf[namelen] = 0;	/* null-terminate the string */
864 	error = nlookup_init(&nd, buf, UIO_SYSSPACE,
865 			     NLC_LOCKVP | NLC_CREATE | NLC_REFDVP);
866 	if (error == 0)
867 		error = nlookup(&nd);
868 	if (error == 0 && nd.nl_nch.ncp->nc_vp != NULL)
869 		error = EADDRINUSE;
870 	if (error)
871 		goto done;
872 
873 	VATTR_NULL(&vattr);
874 	vattr.va_type = VSOCK;
875 	vattr.va_mode = (ACCESSPERMS & ~p->p_fd->fd_cmask);
876 	error = VOP_NCREATE(&nd.nl_nch, nd.nl_dvp, &vp, nd.nl_cred, &vattr);
877 	if (error == 0) {
878 		vp->v_socket = unp->unp_socket;
879 		unp->unp_vnode = vp;
880 		unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam);
881 		vn_unlock(vp);
882 	}
883 done:
884 	nlookup_done(&nd);
885 failed:
886 	lwkt_reltoken(&unp_token);
887 	return (error);
888 }
889 
890 static int
891 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
892 {
893 	struct proc *p = td->td_proc;
894 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
895 	struct vnode *vp;
896 	struct socket *so2, *so3;
897 	struct unpcb *unp, *unp2, *unp3;
898 	int error, len;
899 	struct nlookupdata nd;
900 	char buf[SOCK_MAXADDRLEN];
901 
902 	lwkt_gettoken(&unp_token);
903 
904 	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
905 	if (len <= 0) {
906 		error = EINVAL;
907 		goto failed;
908 	}
909 	strncpy(buf, soun->sun_path, len);
910 	buf[len] = 0;
911 
912 	vp = NULL;
913 	error = nlookup_init(&nd, buf, UIO_SYSSPACE, NLC_FOLLOW);
914 	if (error == 0)
915 		error = nlookup(&nd);
916 	if (error == 0)
917 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
918 	nlookup_done(&nd);
919 	if (error)
920 		goto failed;
921 
922 	if (vp->v_type != VSOCK) {
923 		error = ENOTSOCK;
924 		goto bad;
925 	}
926 	error = VOP_EACCESS(vp, VWRITE, p->p_ucred);
927 	if (error)
928 		goto bad;
929 	so2 = vp->v_socket;
930 	if (so2 == NULL) {
931 		error = ECONNREFUSED;
932 		goto bad;
933 	}
934 	if (so->so_type != so2->so_type) {
935 		error = EPROTOTYPE;
936 		goto bad;
937 	}
938 	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
939 		if (!(so2->so_options & SO_ACCEPTCONN) ||
940 		    (so3 = sonewconn(so2, 0)) == NULL) {
941 			error = ECONNREFUSED;
942 			goto bad;
943 		}
944 		unp = so->so_pcb;
945 		unp2 = so2->so_pcb;
946 		unp3 = so3->so_pcb;
947 		if (unp2->unp_addr)
948 			unp3->unp_addr = (struct sockaddr_un *)
949 				dup_sockaddr((struct sockaddr *)unp2->unp_addr);
950 
951 		/*
952 		 * unp_peercred management:
953 		 *
954 		 * The connecter's (client's) credentials are copied
955 		 * from its process structure at the time of connect()
956 		 * (which is now).
957 		 */
958 		cru2x(p->p_ucred, &unp3->unp_peercred);
959 		unp3->unp_flags |= UNP_HAVEPC;
960 		/*
961 		 * The receiver's (server's) credentials are copied
962 		 * from the unp_peercred member of socket on which the
963 		 * former called listen(); unp_listen() cached that
964 		 * process's credentials at that time so we can use
965 		 * them now.
966 		 */
967 		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
968 		    ("unp_connect: listener without cached peercred"));
969 		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
970 		    sizeof(unp->unp_peercred));
971 		unp->unp_flags |= UNP_HAVEPC;
972 
973 		so2 = so3;
974 	}
975 	error = unp_connect2(so, so2);
976 bad:
977 	vput(vp);
978 failed:
979 	lwkt_reltoken(&unp_token);
980 	return (error);
981 }
982 
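/*
 * Connect two unix domain sockets to each other directly.  Besides the
 * unp_connect() path above, in the traditional BSD arrangement this is
 * also what socketpair(2) ultimately reaches via pru_connect2.
 */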
983 int
984 unp_connect2(struct socket *so, struct socket *so2)
985 {
986 	struct unpcb *unp;
987 	struct unpcb *unp2;
988 
989 	lwkt_gettoken(&unp_token);
990 	unp = so->so_pcb;
991 	if (so2->so_type != so->so_type) {
992 		lwkt_reltoken(&unp_token);
993 		return (EPROTOTYPE);
994 	}
995 	unp2 = so2->so_pcb;
996 	unp->unp_conn = unp2;
997 
998 	switch (so->so_type) {
999 	case SOCK_DGRAM:
1000 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
1001 		soisconnected(so);
1002 		break;
1003 
1004 	case SOCK_STREAM:
1005 	case SOCK_SEQPACKET:
1006 		unp2->unp_conn = unp;
1007 		soisconnected(so);
1008 		soisconnected(so2);
1009 		break;
1010 
1011 	default:
1012 		panic("unp_connect2");
1013 	}
1014 	lwkt_reltoken(&unp_token);
1015 	return (0);
1016 }
1017 
1018 static void
1019 unp_disconnect(struct unpcb *unp)
1020 {
1021 	struct unpcb *unp2;
1022 
1023 	lwkt_gettoken(&unp_token);
1024 
1025 	unp2 = unp->unp_conn;
1026 	if (unp2 == NULL) {
1027 		lwkt_reltoken(&unp_token);
1028 		return;
1029 	}
1030 
1031 	unp->unp_conn = NULL;
1032 
1033 	switch (unp->unp_socket->so_type) {
1034 	case SOCK_DGRAM:
1035 		LIST_REMOVE(unp, unp_reflink);
1036 		soclrstate(unp->unp_socket, SS_ISCONNECTED);
1037 		break;
1038 
1039 	case SOCK_STREAM:
1040 	case SOCK_SEQPACKET:
1041 		unp_reference(unp2);
1042 		unp2->unp_conn = NULL;
1043 
1044 		soisdisconnected(unp->unp_socket);
1045 		soisdisconnected(unp2->unp_socket);
1046 
1047 		unp_free(unp2);
1048 		break;
1049 	}
1050 	lwkt_reltoken(&unp_token);
1051 }
1052 
1053 #ifdef notdef
1054 void
1055 unp_abort(struct unpcb *unp)
1056 {
1057 	lwkt_gettoken(&unp_token);
1058 	unp_free(unp);
1059 	lwkt_reltoken(&unp_token);
1060 }
1061 #endif
1062 
1063 static int
1064 prison_unpcb(struct thread *td, struct unpcb *unp)
1065 {
1066 	struct proc *p;
1067 
1068 	if (td == NULL)
1069 		return (0);
1070 	if ((p = td->td_proc) == NULL)
1071 		return (0);
1072 	if (!p->p_ucred->cr_prison)
1073 		return (0);
1074 	if (p->p_fd->fd_rdir == unp->unp_rvnode)
1075 		return (0);
1076 	return (1);
1077 }
1078 
1079 static int
1080 unp_pcblist(SYSCTL_HANDLER_ARGS)
1081 {
1082 	int error, i, n;
1083 	struct unpcb *unp, **unp_list;
1084 	unp_gen_t gencnt;
1085 	struct unp_head *head;
1086 
1087 	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
1088 
1089 	KKASSERT(curproc != NULL);
1090 
1091 	/*
1092 	 * Preparing the PCB list is too time-consuming and
1093 	 * resource-intensive to do twice on every request.
1094 	 */
1095 	if (req->oldptr == NULL) {
1096 		n = unp_count;
1097 		req->oldidx = (n + n/8) * sizeof(struct xunpcb);
1098 		return 0;
1099 	}
1100 
1101 	if (req->newptr != NULL)
1102 		return EPERM;
1103 
1104 	lwkt_gettoken(&unp_token);
1105 
1106 	/*
1107 	 * OK, now we're committed to doing something.
1108 	 */
1109 	gencnt = unp_gencnt;
1110 	n = unp_count;
1111 
1112 	unp_list = kmalloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
1113 
1114 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
1115 	     unp = LIST_NEXT(unp, unp_link)) {
1116 		if (unp->unp_gencnt <= gencnt && !prison_unpcb(req->td, unp))
1117 			unp_list[i++] = unp;
1118 	}
1119 	n = i;			/* in case we lost some during malloc */
1120 
1121 	error = 0;
1122 	for (i = 0; i < n; i++) {
1123 		unp = unp_list[i];
1124 		if (unp->unp_gencnt <= gencnt) {
1125 			struct xunpcb xu;
1126 			xu.xu_len = sizeof xu;
1127 			xu.xu_unpp = unp;
1128 			/*
1129 			 * XXX - need more locking here to protect against
1130 			 * connect/disconnect races for SMP.
1131 			 */
1132 			if (unp->unp_addr)
1133 				bcopy(unp->unp_addr, &xu.xu_addr,
1134 				      unp->unp_addr->sun_len);
1135 			if (unp->unp_conn && unp->unp_conn->unp_addr)
1136 				bcopy(unp->unp_conn->unp_addr,
1137 				      &xu.xu_caddr,
1138 				      unp->unp_conn->unp_addr->sun_len);
1139 			bcopy(unp, &xu.xu_unp, sizeof *unp);
1140 			sotoxsocket(unp->unp_socket, &xu.xu_socket);
1141 			error = SYSCTL_OUT(req, &xu, sizeof xu);
1142 		}
1143 	}
1144 	lwkt_reltoken(&unp_token);
1145 	kfree(unp_list, M_TEMP);
1146 
1147 	return error;
1148 }
1149 
1150 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
1151 	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1152 	    "List of active local datagram sockets");
1153 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
1154 	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1155 	    "List of active local stream sockets");
1156 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, CTLFLAG_RD,
1157 	    (caddr_t)(long)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
1158 	    "List of active local seqpacket stream sockets");
1159 
1160 static void
1161 unp_shutdown(struct unpcb *unp)
1162 {
1163 	struct socket *so;
1164 
1165 	if ((unp->unp_socket->so_type == SOCK_STREAM ||
1166 	     unp->unp_socket->so_type == SOCK_SEQPACKET) &&
1167 	    unp->unp_conn != NULL && (so = unp->unp_conn->unp_socket)) {
1168 		socantrcvmore(so);
1169 	}
1170 }
1171 
1172 static void
1173 unp_drop(struct unpcb *unp, int err)
1174 {
1175 	struct socket *so = unp->unp_socket;
1176 
1177 	so->so_error = err;
1178 	unp_disconnect(unp);
1179 }
1180 
1181 #ifdef notdef
1182 void
1183 unp_drain(void)
1184 {
1185 	lwkt_gettoken(&unp_token);
1186 	lwkt_reltoken(&unp_token);
1187 }
1188 #endif
1189 
1190 int
1191 unp_externalize(struct mbuf *rights)
1192 {
1193 	struct thread *td = curthread;
1194 	struct proc *p = td->td_proc;		/* XXX */
1195 	struct lwp *lp = td->td_lwp;
1196 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
1197 	int *fdp;
1198 	int i;
1199 	struct file **rp;
1200 	struct file *fp;
1201 	int newfds = (cm->cmsg_len - (CMSG_DATA(cm) - (u_char *)cm))
1202 		/ sizeof (struct file *);
1203 	int f;
1204 
1205 	lwkt_gettoken(&unp_token);
1206 
1207 	/*
1208 	 * If the new FDs will not fit, free them all.
1209 	 */
1210 	if (!fdavail(p, newfds)) {
1211 		rp = (struct file **)CMSG_DATA(cm);
1212 		for (i = 0; i < newfds; i++) {
1213 			fp = *rp;
1214 			/*
1215 			 * Zero the pointer before calling unp_discard,
1216 			 * since it may end up in unp_gc().
1217 			 */
1218 			*rp++ = NULL;
1219 			unp_discard(fp, NULL);
1220 		}
1221 		lwkt_reltoken(&unp_token);
1222 		return (EMSGSIZE);
1223 	}
1224 
1225 	/*
1226 	 * Now change each pointer to an fd in the global table to
1227 	 * an integer that is the index of the local fd table entry
1228 	 * we set up to point to the global one we are transferring.
1229 	 * If sizeof(struct file *) is greater than or equal to
1230 	 * sizeof(int), do it in forward order; an integer will then
1231 	 * always land at or before its corresponding struct file
1232 	 * pointer.
1233 	 * If sizeof(struct file *) is smaller than sizeof(int),
1234 	 * do it in reverse order.
1235 	 */
1236 	if (sizeof (struct file *) >= sizeof (int)) {
1237 		fdp = (int *)CMSG_DATA(cm);
1238 		rp = (struct file **)CMSG_DATA(cm);
1239 		for (i = 0; i < newfds; i++) {
1240 			if (fdalloc(p, 0, &f))
1241 				panic("unp_externalize");
1242 			fp = *rp++;
1243 			unp_fp_externalize(lp, fp, f);
1244 			*fdp++ = f;
1245 		}
1246 	} else {
1247 		fdp = (int *)CMSG_DATA(cm) + newfds - 1;
1248 		rp = (struct file **)CMSG_DATA(cm) + newfds - 1;
1249 		for (i = 0; i < newfds; i++) {
1250 			if (fdalloc(p, 0, &f))
1251 				panic("unp_externalize");
1252 			fp = *rp--;
1253 			unp_fp_externalize(lp, fp, f);
1254 			*fdp-- = f;
1255 		}
1256 	}
1257 
1258 	/*
1259 	 * Adjust length, in case sizeof(struct file *) and sizeof(int)
1260 	 * differs.
1261 	 */
1262 	cm->cmsg_len = CMSG_LEN(newfds * sizeof(int));
1263 	rights->m_len = cm->cmsg_len;
1264 
1265 	lwkt_reltoken(&unp_token);
1266 	return (0);
1267 }
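
/*
 * For reference, a minimal userland sketch of the receive side whose
 * control message this routine fills in (illustrative; standard CMSG
 * macros assumed):
 *
 *	union {
 *		struct cmsghdr cm;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} u;
 *	struct msghdr mh;		(msg_iov/iovec setup omitted)
 *	struct cmsghdr *cmp;
 *	int newfd;
 *
 *	mh.msg_control = u.buf;
 *	mh.msg_controllen = sizeof(u.buf);
 *	if (recvmsg(s, &mh, 0) >= 0 &&
 *	    (cmp = CMSG_FIRSTHDR(&mh)) != NULL &&
 *	    cmp->cmsg_level == SOL_SOCKET &&
 *	    cmp->cmsg_type == SCM_RIGHTS)
 *		bcopy(CMSG_DATA(cmp), &newfd, sizeof(newfd));
 */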
1268 
1269 static void
1270 unp_fp_externalize(struct lwp *lp, struct file *fp, int fd)
1271 {
1272 	struct file *fx;
1273 	int error;
1274 
1275 	lwkt_gettoken(&unp_token);
1276 
1277 	if (lp) {
1278 		KKASSERT(fd >= 0);
1279 		if (fp->f_flag & FREVOKED) {
1280 			kprintf("Warning: revoked fp exiting unix socket\n");
1281 			fx = NULL;
1282 			error = falloc(lp, &fx, NULL);
1283 			if (error == 0)
1284 				fsetfd(lp->lwp_proc->p_fd, fx, fd);
1285 			else
1286 				fsetfd(lp->lwp_proc->p_fd, NULL, fd);
1287 			fdrop(fx);
1288 		} else {
1289 			fsetfd(lp->lwp_proc->p_fd, fp, fd);
1290 		}
1291 	}
1292 	spin_lock(&unp_spin);
1293 	fp->f_msgcount--;
1294 	unp_rights--;
1295 	spin_unlock(&unp_spin);
1296 	fdrop(fp);
1297 
1298 	lwkt_reltoken(&unp_token);
1299 }
1300 
1301 
1302 void
1303 unp_init(void)
1304 {
1305 	LIST_INIT(&unp_dhead);
1306 	LIST_INIT(&unp_shead);
1307 	spin_init(&unp_spin);
1308 }
1309 
1310 static int
1311 unp_internalize(struct mbuf *control, struct thread *td)
1312 {
1313 	struct proc *p = td->td_proc;
1314 	struct filedesc *fdescp;
1315 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1316 	struct file **rp;
1317 	struct file *fp;
1318 	int i, fd, *fdp;
1319 	struct cmsgcred *cmcred;
1320 	int oldfds;
1321 	u_int newlen;
1322 	int error;
1323 
1324 	KKASSERT(p);
1325 	lwkt_gettoken(&unp_token);
1326 
1327 	fdescp = p->p_fd;
1328 	if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) ||
1329 	    cm->cmsg_level != SOL_SOCKET ||
1330 	    CMSG_ALIGN(cm->cmsg_len) != control->m_len) {
1331 		error = EINVAL;
1332 		goto done;
1333 	}
1334 
1335 	/*
1336 	 * Fill in credential information.
1337 	 */
1338 	if (cm->cmsg_type == SCM_CREDS) {
1339 		cmcred = (struct cmsgcred *)CMSG_DATA(cm);
1340 		cmcred->cmcred_pid = p->p_pid;
1341 		cmcred->cmcred_uid = p->p_ucred->cr_ruid;
1342 		cmcred->cmcred_gid = p->p_ucred->cr_rgid;
1343 		cmcred->cmcred_euid = p->p_ucred->cr_uid;
1344 		cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
1345 							CMGROUP_MAX);
1346 		for (i = 0; i < cmcred->cmcred_ngroups; i++)
1347 			cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
1348 		error = 0;
1349 		goto done;
1350 	}
1351 
1352 	/*
1353 	 * cmsg_len may be malformed; do not allow the size
1354 	 * calculation below to go negative.
1355 	 */
1356 	if (cm->cmsg_len < CMSG_LEN(0)) {
1357 		error = EINVAL;
1358 		goto done;
1359 	}
1360 
1361 	oldfds = (cm->cmsg_len - CMSG_LEN(0)) / sizeof (int);
1362 
1363 	/*
1364 	 * Check that all the FDs passed in refer to legal, open files.
1365 	 * If not, reject the entire operation.
1366 	 */
1367 	fdp = (int *)CMSG_DATA(cm);
1368 	for (i = 0; i < oldfds; i++) {
1369 		fd = *fdp++;
1370 		if ((unsigned)fd >= fdescp->fd_nfiles ||
1371 		    fdescp->fd_files[fd].fp == NULL) {
1372 			error = EBADF;
1373 			goto done;
1374 		}
1375 		if (fdescp->fd_files[fd].fp->f_type == DTYPE_KQUEUE) {
1376 			error = EOPNOTSUPP;
1377 			goto done;
1378 		}
1379 	}
1380 	/*
1381 	 * Now replace the integer FDs with pointers to the
1382 	 * associated global file table entries.  Allocate a bigger
1383 	 * buffer as necessary, but if a cluster is not large
1384 	 * enough, return E2BIG.
1385 	 */
1386 	newlen = CMSG_LEN(oldfds * sizeof(struct file *));
1387 	if (newlen > MCLBYTES) {
1388 		error = E2BIG;
1389 		goto done;
1390 	}
1391 	if (newlen - control->m_len > M_TRAILINGSPACE(control)) {
1392 		if (control->m_flags & M_EXT) {
1393 			error = E2BIG;
1394 			goto done;
1395 		}
1396 		MCLGET(control, MB_WAIT);
1397 		if (!(control->m_flags & M_EXT)) {
1398 			error = ENOBUFS;
1399 			goto done;
1400 		}
1401 
1402 		/* copy the data to the cluster */
1403 		memcpy(mtod(control, char *), cm, cm->cmsg_len);
1404 		cm = mtod(control, struct cmsghdr *);
1405 	}
1406 
1407 	/*
1408 	 * Adjust length, in case sizeof(struct file *) and sizeof(int)
1409 	 * differs.
1410 	 */
1411 	cm->cmsg_len = newlen;
1412 	control->m_len = CMSG_ALIGN(newlen);
1413 
1414 	/*
1415 	 * Transform the file descriptors into struct file pointers.
1416 	 * If sizeof(struct file *) is greater than or equal to
1417 	 * sizeof(int), do it in reverse order so that an int is not
1418 	 * overwritten before we are done with it.
1419 	 * If sizeof(struct file *) is smaller than sizeof(int),
1420 	 * do it in forward order.
1421 	 */
1422 	if (sizeof (struct file *) >= sizeof (int)) {
1423 		fdp = (int *)CMSG_DATA(cm) + oldfds - 1;
1424 		rp = (struct file **)CMSG_DATA(cm) + oldfds - 1;
1425 		for (i = 0; i < oldfds; i++) {
1426 			fp = fdescp->fd_files[*fdp--].fp;
1427 			*rp-- = fp;
1428 			fhold(fp);
1429 			spin_lock(&unp_spin);
1430 			fp->f_msgcount++;
1431 			unp_rights++;
1432 			spin_unlock(&unp_spin);
1433 		}
1434 	} else {
1435 		fdp = (int *)CMSG_DATA(cm);
1436 		rp = (struct file **)CMSG_DATA(cm);
1437 		for (i = 0; i < oldfds; i++) {
1438 			fp = fdescp->fd_files[*fdp++].fp;
1439 			*rp++ = fp;
1440 			fhold(fp);
1441 			spin_lock(&unp_spin);
1442 			fp->f_msgcount++;
1443 			unp_rights++;
1444 			spin_unlock(&unp_spin);
1445 		}
1446 	}
1447 	error = 0;
1448 done:
1449 	lwkt_reltoken(&unp_token);
1450 	return error;
1451 }
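
/*
 * For reference, a minimal userland sketch of the message this routine
 * consumes (illustrative; `s' and `fd_to_pass' are placeholders, and at
 * least one byte of ordinary data customarily accompanies the control
 * message):
 *
 *	union {
 *		struct cmsghdr cm;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} u;
 *	struct iovec iov;
 *	struct msghdr mh;
 *	char c = 'x';
 *
 *	bzero(&mh, sizeof(mh));
 *	iov.iov_base = &c;
 *	iov.iov_len = 1;
 *	mh.msg_iov = &iov;
 *	mh.msg_iovlen = 1;
 *	mh.msg_control = u.buf;
 *	mh.msg_controllen = sizeof(u.buf);
 *	u.cm.cmsg_len = CMSG_LEN(sizeof(int));
 *	u.cm.cmsg_level = SOL_SOCKET;
 *	u.cm.cmsg_type = SCM_RIGHTS;
 *	bcopy(&fd_to_pass, CMSG_DATA(&u.cm), sizeof(int));
 *	sendmsg(s, &mh, 0);
 */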
1452 
1453 /*
1454  * Garbage collect in-transit file descriptors that get lost due to
1455  * loops (i.e. when a socket is sent to another process over itself,
1456  * and more complex situations).
1457  *
1458  * NOT MPSAFE - TODO socket flush code and maybe closef.  Rest is MPSAFE.
1459  */
1460 
1461 struct unp_gc_info {
1462 	struct file **extra_ref;
1463 	struct file *locked_fp;
1464 	int defer;
1465 	int index;
1466 	int maxindex;
1467 };
1468 
1469 static void
1470 unp_gc(void)
1471 {
1472 	struct unp_gc_info info;
1473 	static boolean_t unp_gcing;
1474 	struct file **fpp;
1475 	int i;
1476 
1477 	/*
1478 	 * Only one gc can be in progress at any given moment.
1479 	 */
1480 	spin_lock(&unp_spin);
1481 	if (unp_gcing) {
1482 		spin_unlock(&unp_spin);
1483 		return;
1484 	}
1485 	unp_gcing = TRUE;
1486 	spin_unlock(&unp_spin);
1487 
1488 	lwkt_gettoken(&unp_token);
1489 
1490 	/*
1491 	 * Before going through all this, set all FDs to be NOT deferred
1492 	 * and NOT externally accessible (not marked).  During the scan
1493 	 * an fd can be marked externally accessible but we may or may not
1494 	 * be able to process it immediately (controlled by FDEFER).
1495 	 *
1496 	 * If we loop, sleep a bit.  The complexity of the topology can
1497 	 * cause multiple loops.  Failure to acquire the socket's so_rcv
1498 	 * token can also cause us to loop.
1499 	 */
1500 	allfiles_scan_exclusive(unp_gc_clearmarks, NULL);
1501 	do {
1502 		info.defer = 0;
1503 		allfiles_scan_exclusive(unp_gc_checkmarks, &info);
1504 		if (info.defer)
1505 			tsleep(&info, 0, "gcagain", 1);
1506 	} while (info.defer);
1507 
1508 	/*
1509 	 * We grab an extra reference to each of the file table entries
1510 	 * that are not otherwise accessible and then free the rights
1511 	 * that are stored in messages on them.
1512 	 *
1513 	 * The bug in the original code is a little tricky, so I'll describe
1514 	 * what's wrong with it here.
1515 	 *
1516 	 * It is incorrect to simply unp_discard each entry for f_msgcount
1517 	 * times -- consider the case of sockets A and B that contain
1518 	 * references to each other.  On a last close of some other socket,
1519 	 * we trigger a gc since the number of outstanding rights (unp_rights)
1520 	 * is non-zero.  If during the sweep phase the gc code unp_discards,
1521 	 * we end up doing a (full) closef on the descriptor.  A closef on A
1522 	 * results in the following chain.  Closef calls soo_close, which
1523 	 * calls soclose.  Soclose first calls (through the usrreq
1524 	 * switch) unp_detach, which re-invokes unp_gc.  Unp_gc simply
1525 	 * returns because the previous instance had set unp_gcing, and
1526 	 * we return all the way back to soclose, which marks the socket
1527 	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
1528 	 * to free up the rights that are queued in messages on the socket A,
1529 	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
1530 	 * switch unp_dispose, which unp_scans with unp_discard.  This second
1531 	 * instance of unp_discard just calls closef on B.
1532 	 *
1533 	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1534 	 * which results in another closef on A.  Unfortunately, A is already
1535 	 * being closed, and the descriptor has already been marked with
1536 	 * SS_NOFDREF, and soclose panics at this point.
1537 	 *
1538 	 * Here, we first take an extra reference to each inaccessible
1539 	 * descriptor.  Then, we call sorflush ourself, since we know
1540 	 * it is a Unix domain socket anyhow.  After we destroy all the
1541 	 * rights carried in messages, we do a last closef to get rid
1542 	 * of our extra reference.  This is the last close, and the
1543 	 * unp_detach etc will shut down the socket.
1544 	 *
1545 	 * 91/09/19, bsy@cs.cmu.edu
1546 	 */
1547 	info.extra_ref = kmalloc(256 * sizeof(struct file *), M_FILE, M_WAITOK);
1548 	info.maxindex = 256;
1549 
1550 	do {
1551 		/*
1552 		 * Look for matches
1553 		 */
1554 		info.index = 0;
1555 		allfiles_scan_exclusive(unp_gc_checkrefs, &info);
1556 
1557 		/*
1558 		 * For each FD on our hit list, do the following two things
1559 		 */
1560 		for (i = info.index, fpp = info.extra_ref; --i >= 0; ++fpp) {
1561 			struct file *tfp = *fpp;
1562 			if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL)
1563 				sorflush((struct socket *)(tfp->f_data));
1564 		}
1565 		for (i = info.index, fpp = info.extra_ref; --i >= 0; ++fpp)
1566 			closef(*fpp, NULL);
1567 	} while (info.index == info.maxindex);
1568 
1569 	lwkt_reltoken(&unp_token);
1570 
1571 	kfree((caddr_t)info.extra_ref, M_FILE);
1572 	unp_gcing = FALSE;
1573 }
1574 
1575 /*
1576  * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
1577  */
1578 static int
1579 unp_gc_checkrefs(struct file *fp, void *data)
1580 {
1581 	struct unp_gc_info *info = data;
1582 
1583 	if (fp->f_count == 0)
1584 		return(0);
1585 	if (info->index == info->maxindex)
1586 		return(-1);
1587 
1588 	/*
1589 	 * If all refs are from msgs and it's not marked accessible,
1590 	 * then it must be referenced from some unreachable cycle
1591 	 * of (shut-down) FDs, so include it in our list of FDs
1592 	 * to remove.
1593 	 */
1594 	if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
1595 		info->extra_ref[info->index++] = fp;
1596 		fhold(fp);
1597 	}
1598 	return(0);
1599 }
1600 
1601 /*
1602  * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
1603  */
1604 static int
1605 unp_gc_clearmarks(struct file *fp, void *data __unused)
1606 {
1607 	atomic_clear_int(&fp->f_flag, FMARK | FDEFER);
1608 	return(0);
1609 }
1610 
1611 /*
1612  * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
1613  */
1614 static int
1615 unp_gc_checkmarks(struct file *fp, void *data)
1616 {
1617 	struct unp_gc_info *info = data;
1618 	struct socket *so;
1619 
1620 	/*
1621 	 * If the file is not open, skip it.  Make sure it isn't marked
1622 	 * deferred or we could loop forever, in case we somehow race
1623 	 * something.
1624 	 */
1625 	if (fp->f_count == 0) {
1626 		if (fp->f_flag & FDEFER)
1627 			atomic_clear_int(&fp->f_flag, FDEFER);
1628 		return(0);
1629 	}
1630 	/*
1631 	 * If we already marked it as 'defer' in a
1632 	 * previous pass, then try to process it this time
1633 	 * and un-mark it.
1634 	 */
1635 	if (fp->f_flag & FDEFER) {
1636 		atomic_clear_int(&fp->f_flag, FDEFER);
1637 	} else {
1638 		/*
1639 		 * If it's not deferred, check whether it's
1640 		 * already marked; if so, skip it.
1641 		 */
1642 		if (fp->f_flag & FMARK)
1643 			return(0);
1644 		/*
1645 		 * If all references are from messages
1646 		 * in transit, then skip it; it's not
1647 		 * externally accessible.
1648 		 */
1649 		if (fp->f_count == fp->f_msgcount)
1650 			return(0);
1651 		/*
1652 		 * If it got this far then it must be
1653 		 * externally accessible.
1654 		 */
1655 		atomic_set_int(&fp->f_flag, FMARK);
1656 	}
1657 
1658 	/*
1659 	 * Either it was deferred, or it is externally
1660 	 * accessible and was not already marked as such.
1661 	 * Now check if it is possibly one of OUR sockets.
1662 	 */
1663 	if (fp->f_type != DTYPE_SOCKET ||
1664 	    (so = (struct socket *)fp->f_data) == NULL) {
1665 		return(0);
1666 	}
1667 	if (so->so_proto->pr_domain != &localdomain ||
1668 	    !(so->so_proto->pr_flags & PR_RIGHTS)) {
1669 		return(0);
1670 	}
1671 
1672 	/*
1673 	 * So, Ok, it's one of our sockets and it IS externally accessible
1674 	 * (or was deferred).  Now we look to see if we hold any file
1675 	 * descriptors in its message buffers.  Follow those links and mark
1676 	 * them as accessible too.
1677 	 *
1678 	 * We are holding multiple spinlocks here; if we cannot get the
1679 	 * token without blocking, defer until the next loop.
1680 	 */
1681 	info->locked_fp = fp;
1682 	if (lwkt_trytoken(&so->so_rcv.ssb_token)) {
1683 		unp_scan(so->so_rcv.ssb_mb, unp_mark, info);
1684 		lwkt_reltoken(&so->so_rcv.ssb_token);
1685 	} else {
1686 		atomic_set_int(&fp->f_flag, FDEFER);
1687 		++info->defer;
1688 	}
1689 	return (0);
1690 }
1691 
1692 /*
1693  * Scan all unix domain sockets and replace any revoked file pointers
1694  * found with the dummy file pointer fx.  We don't worry about races
1695  * against file pointers being read out as those are handled in the
1696  * externalize code.
1697  */
1698 
1699 #define REVOKE_GC_MAXFILES	32
1700 
1701 struct unp_revoke_gc_info {
1702 	struct file	*fx;
1703 	struct file	*fary[REVOKE_GC_MAXFILES];
1704 	int		fcount;
1705 };
1706 
1707 void
1708 unp_revoke_gc(struct file *fx)
1709 {
1710 	struct unp_revoke_gc_info info;
1711 	int i;
1712 
1713 	lwkt_gettoken(&unp_token);
1714 	info.fx = fx;
1715 	do {
1716 		info.fcount = 0;
1717 		allfiles_scan_exclusive(unp_revoke_gc_check, &info);
1718 		for (i = 0; i < info.fcount; ++i)
1719 			unp_fp_externalize(NULL, info.fary[i], -1);
1720 	} while (info.fcount == REVOKE_GC_MAXFILES);
1721 	lwkt_reltoken(&unp_token);
1722 }
1723 
1724 /*
1725  * Check for and replace revoked descriptors.
1726  *
1727  * WARNING:  This routine is not allowed to block.
1728  */
1729 static int
1730 unp_revoke_gc_check(struct file *fps, void *vinfo)
1731 {
1732 	struct unp_revoke_gc_info *info = vinfo;
1733 	struct file *fp;
1734 	struct socket *so;
1735 	struct mbuf *m0;
1736 	struct mbuf *m;
1737 	struct file **rp;
1738 	struct cmsghdr *cm;
1739 	int i;
1740 	int qfds;
1741 
1742 	/*
1743 	 * Is this a unix domain socket with rights-passing abilities?
1744 	 */
1745 	if (fps->f_type != DTYPE_SOCKET)
1746 		return (0);
1747 	if ((so = (struct socket *)fps->f_data) == NULL)
1748 		return(0);
1749 	if (so->so_proto->pr_domain != &localdomain)
1750 		return(0);
1751 	if ((so->so_proto->pr_flags & PR_RIGHTS) == 0)
1752 		return(0);
1753 
1754 	/*
1755 	 * Scan the mbufs for control messages and replace any revoked
1756 	 * descriptors we find.
1757 	 */
1758 	lwkt_gettoken(&so->so_rcv.ssb_token);
1759 	m0 = so->so_rcv.ssb_mb;
1760 	while (m0) {
1761 		for (m = m0; m; m = m->m_next) {
1762 			if (m->m_type != MT_CONTROL)
1763 				continue;
1764 			if (m->m_len < sizeof(*cm))
1765 				continue;
1766 			cm = mtod(m, struct cmsghdr *);
1767 			if (cm->cmsg_level != SOL_SOCKET ||
1768 			    cm->cmsg_type != SCM_RIGHTS) {
1769 				continue;
1770 			}
1771 			qfds = (cm->cmsg_len - CMSG_LEN(0)) / sizeof(void *);
1772 			rp = (struct file **)CMSG_DATA(cm);
1773 			for (i = 0; i < qfds; i++) {
1774 				fp = rp[i];
1775 				if (fp->f_flag & FREVOKED) {
1776 					kprintf("Warning: Removing revoked fp from unix domain socket queue\n");
1777 					fhold(info->fx);
1778 					info->fx->f_msgcount++;
1779 					unp_rights++;
1780 					rp[i] = info->fx;
1781 					info->fary[info->fcount++] = fp;
1782 				}
1783 				if (info->fcount == REVOKE_GC_MAXFILES)
1784 					break;
1785 			}
1786 			if (info->fcount == REVOKE_GC_MAXFILES)
1787 				break;
1788 		}
1789 		m0 = m0->m_nextpkt;
1790 		if (info->fcount == REVOKE_GC_MAXFILES)
1791 			break;
1792 	}
1793 	lwkt_reltoken(&so->so_rcv.ssb_token);
1794 
1795 	/*
1796 	 * Stop the scan if we filled up our array.
1797 	 */
1798 	if (info->fcount == REVOKE_GC_MAXFILES)
1799 		return(-1);
1800 	return(0);
1801 }
1802 
1803 /*
1804  * Dispose of the fps stored in an mbuf.
1805  *
1806  * The dds loop can cause additional fps to be entered onto the
1807  * list while it is running, flattening out the operation and avoiding
1808  * a deep kernel stack recursion.
1809  */
1810 void
1811 unp_dispose(struct mbuf *m)
1812 {
1813 	unp_defdiscard_t dds;
1814 
1815 	lwkt_gettoken(&unp_token);
1816 	++unp_defdiscard_nest;
1817 	if (m) {
1818 		unp_scan(m, unp_discard, NULL);
1819 	}
1820 	if (unp_defdiscard_nest == 1) {
1821 		while ((dds = unp_defdiscard_base) != NULL) {
1822 			unp_defdiscard_base = dds->next;
1823 			closef(dds->fp, NULL);
1824 			kfree(dds, M_UNPCB);
1825 		}
1826 	}
1827 	--unp_defdiscard_nest;
1828 	lwkt_reltoken(&unp_token);
1829 }
1830 
1831 static int
1832 unp_listen(struct unpcb *unp, struct thread *td)
1833 {
1834 	struct proc *p = td->td_proc;
1835 
1836 	KKASSERT(p);
1837 	lwkt_gettoken(&unp_token);
1838 	cru2x(p->p_ucred, &unp->unp_peercred);
1839 	unp->unp_flags |= UNP_HAVEPCCACHED;
1840 	lwkt_reltoken(&unp_token);
1841 	return (0);
1842 }
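
/*
 * Note: the credentials cached by unp_listen() are consumed in
 * unp_connect(), where a connecting client copies them into its own
 * unp_peercred; this is what LOCAL_PEERCRED reports on the client's
 * socket.
 */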
1843 
1844 static void
1845 unp_scan(struct mbuf *m0, void (*op)(struct file *, void *), void *data)
1846 {
1847 	struct mbuf *m;
1848 	struct file **rp;
1849 	struct cmsghdr *cm;
1850 	int i;
1851 	int qfds;
1852 
1853 	while (m0) {
1854 		for (m = m0; m; m = m->m_next) {
1855 			if (m->m_type == MT_CONTROL &&
1856 			    m->m_len >= sizeof(*cm)) {
1857 				cm = mtod(m, struct cmsghdr *);
1858 				if (cm->cmsg_level != SOL_SOCKET ||
1859 				    cm->cmsg_type != SCM_RIGHTS)
1860 					continue;
1861 				qfds = (cm->cmsg_len - CMSG_LEN(0)) /
1862 					sizeof(void *);
1863 				rp = (struct file **)CMSG_DATA(cm);
1864 				for (i = 0; i < qfds; i++)
1865 					(*op)(*rp++, data);
1866 				break;		/* XXX, but saves time */
1867 			}
1868 		}
1869 		m0 = m0->m_nextpkt;
1870 	}
1871 }
1872 
1873 /*
1874  * Mark visibility.  info->defer is recalculated on every pass.
1875  */
1876 static void
1877 unp_mark(struct file *fp, void *data)
1878 {
1879 	struct unp_gc_info *info = data;
1880 
1881 	if ((fp->f_flag & FMARK) == 0) {
1882 		++info->defer;
1883 		atomic_set_int(&fp->f_flag, FMARK | FDEFER);
1884 	} else if (fp->f_flag & FDEFER) {
1885 		++info->defer;
1886 	}
1887 }
1888 
1889 /*
1890  * Discard an fp previously held in a unix domain socket mbuf.  To
1891  * avoid blowing out the kernel stack due to contrived chain-reactions
1892  * we may have to defer the operation to a higher procedural level.
1893  *
1894  * Caller holds unp_token
1895  */
1896 static void
1897 unp_discard(struct file *fp, void *data __unused)
1898 {
1899 	unp_defdiscard_t dds;
1900 
1901 	spin_lock(&unp_spin);
1902 	fp->f_msgcount--;
1903 	unp_rights--;
1904 	spin_unlock(&unp_spin);
1905 
1906 	if (unp_defdiscard_nest) {
1907 		dds = kmalloc(sizeof(*dds), M_UNPCB, M_WAITOK|M_ZERO);
1908 		dds->fp = fp;
1909 		dds->next = unp_defdiscard_base;
1910 		unp_defdiscard_base = dds;
1911 	} else {
1912 		closef(fp, NULL);
1913 	}
1914 }
1915 
1916