xref: /dragonfly/sys/kern/uipc_usrreq.c (revision 6e278935)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/kern/uipc_usrreq.c,v 1.54.2.10 2003/03/04 17:28:09 nectar Exp $
35  * $DragonFly: src/sys/kern/uipc_usrreq.c,v 1.44 2008/09/06 05:44:58 dillon Exp $
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/domain.h>
42 #include <sys/fcntl.h>
43 #include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
44 #include <sys/proc.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/mbuf.h>
48 #include <sys/nlookup.h>
49 #include <sys/protosw.h>
50 #include <sys/socket.h>
51 #include <sys/socketvar.h>
52 #include <sys/resourcevar.h>
53 #include <sys/stat.h>
54 #include <sys/mount.h>
55 #include <sys/sysctl.h>
56 #include <sys/un.h>
57 #include <sys/unpcb.h>
58 #include <sys/vnode.h>
59 
60 #include <sys/file2.h>
61 #include <sys/spinlock2.h>
62 #include <sys/socketvar2.h>
63 #include <sys/msgport2.h>
64 
/*
 * Node of a singly-linked list of file pointers.  The list is anchored
 * at unp_defdiscard_base.  NOTE(review): presumably used to defer
 * discarding of in-flight files; the users are outside this view.
 */
typedef struct unp_defdiscard {
	struct unp_defdiscard *next;	/* next node, or NULL */
	struct file *fp;		/* file recorded by this node */
} *unp_defdiscard_t;
69 
static	MALLOC_DEFINE(M_UNPCB, "unpcb", "unpcb struct");
/* generation counter, bumped on every pcb attach and detach */
static	unp_gen_t unp_gencnt;
/* number of currently attached pcbs */
static	u_int unp_count;

/* pcb lists: unp_dhead holds SOCK_DGRAM pcbs, unp_shead all other types */
static	struct unp_head unp_shead, unp_dhead;

/* token taken by every handler and helper in this file */
static struct lwkt_token unp_token = LWKT_TOKEN_INITIALIZER(unp_token);
static int unp_defdiscard_nest;
static unp_defdiscard_t unp_defdiscard_base;
79 
80 /*
81  * Unix communications domain.
82  *
83  * TODO:
84  *	RDM
85  *	rethink name space problems
86  *	need a proper out-of-band
87  *	lock pushdown
88  */
/* placeholder address handed out when a socket or its peer is unbound */
static struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
static ino_t	unp_ino = 1;		/* prototype for fake inode numbers */
/* serializes allocation of fake inode numbers from unp_ino */
static struct spinlock unp_ino_spin = SPINLOCK_INITIALIZER(&unp_ino_spin);
92 
/*
 * Forward declarations of the file-local helpers.
 */
static int     unp_attach (struct socket *, struct pru_attach_info *);
static void    unp_detach (struct unpcb *);
static int     unp_bind (struct unpcb *,struct sockaddr *, struct thread *);
static int     unp_connect (struct socket *,struct sockaddr *,
				struct thread *);
static void    unp_disconnect (struct unpcb *);
static void    unp_shutdown (struct unpcb *);
static void    unp_drop (struct unpcb *, int);
static void    unp_gc (void);
static int     unp_gc_clearmarks(struct file *, void *);
static int     unp_gc_checkmarks(struct file *, void *);
static int     unp_gc_checkrefs(struct file *, void *);
static int     unp_revoke_gc_check(struct file *, void *);
static void    unp_scan (struct mbuf *, void (*)(struct file *, void *),
				void *data);
static void    unp_mark (struct file *, void *data);
static void    unp_discard (struct file *, void *);
static int     unp_internalize (struct mbuf *, struct thread *);
static int     unp_listen (struct unpcb *, struct thread *);
static void    unp_fp_externalize(struct lwp *lp, struct file *fp, int fd);
113 
114 /*
115  * NOTE:
 * Since unp_token will be automatically released upon execution of
 * blocking code, we need to reference unp_conn before any possible
 * blocking code to prevent it from being ripped out from behind our back.
119  */
120 
/*
 * Take an additional reference on a pcb.  Each reference is dropped
 * again with unp_free().
 *
 * NOTE: unp_token MUST be held
 */
static __inline void
unp_reference(struct unpcb *unp)
{
	atomic_add_int(&unp->unp_refcnt, 1);
}
127 
128 /* NOTE: unp_token MUST be held */
129 static __inline void
130 unp_free(struct unpcb *unp)
131 {
132 	KKASSERT(unp->unp_refcnt > 0);
133 	if (atomic_fetchadd_int(&unp->unp_refcnt, -1) == 1)
134 		unp_detach(unp);
135 }
136 
137 /*
138  * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort()
139  *	 will sofree() it when we return.
140  */
141 static void
142 uipc_abort(netmsg_t msg)
143 {
144 	struct unpcb *unp;
145 	int error;
146 
147 	lwkt_gettoken(&unp_token);
148 	unp = msg->base.nm_so->so_pcb;
149 	if (unp) {
150 		unp_drop(unp, ECONNABORTED);
151 		unp_free(unp);
152 		error = 0;
153 	} else {
154 		error = EINVAL;
155 	}
156 	lwkt_reltoken(&unp_token);
157 
158 	lwkt_replymsg(&msg->lmsg, error);
159 }
160 
161 static void
162 uipc_accept(netmsg_t msg)
163 {
164 	struct unpcb *unp;
165 	int error;
166 
167 	lwkt_gettoken(&unp_token);
168 	unp = msg->base.nm_so->so_pcb;
169 	if (unp == NULL) {
170 		error = EINVAL;
171 	} else {
172 		struct unpcb *unp2 = unp->unp_conn;
173 
174 		/*
175 		 * Pass back name of connected socket,
176 		 * if it was bound and we are still connected
177 		 * (our peer may have closed already!).
178 		 */
179 		if (unp2 && unp2->unp_addr) {
180 			unp_reference(unp2);
181 			*msg->accept.nm_nam = dup_sockaddr(
182 				(struct sockaddr *)unp2->unp_addr);
183 			unp_free(unp2);
184 		} else {
185 			*msg->accept.nm_nam = dup_sockaddr(
186 				(struct sockaddr *)&sun_noname);
187 		}
188 		error = 0;
189 	}
190 	lwkt_reltoken(&unp_token);
191 	lwkt_replymsg(&msg->lmsg, error);
192 }
193 
194 static void
195 uipc_attach(netmsg_t msg)
196 {
197 	struct unpcb *unp;
198 	int error;
199 
200 	lwkt_gettoken(&unp_token);
201 	unp = msg->base.nm_so->so_pcb;
202 	if (unp)
203 		error = EISCONN;
204 	else
205 		error = unp_attach(msg->base.nm_so, msg->attach.nm_ai);
206 	lwkt_reltoken(&unp_token);
207 	lwkt_replymsg(&msg->lmsg, error);
208 }
209 
210 static void
211 uipc_bind(netmsg_t msg)
212 {
213 	struct unpcb *unp;
214 	int error;
215 
216 	lwkt_gettoken(&unp_token);
217 	unp = msg->base.nm_so->so_pcb;
218 	if (unp)
219 		error = unp_bind(unp, msg->bind.nm_nam, msg->bind.nm_td);
220 	else
221 		error = EINVAL;
222 	lwkt_reltoken(&unp_token);
223 	lwkt_replymsg(&msg->lmsg, error);
224 }
225 
226 static void
227 uipc_connect(netmsg_t msg)
228 {
229 	struct unpcb *unp;
230 	int error;
231 
232 	lwkt_gettoken(&unp_token);
233 	unp = msg->base.nm_so->so_pcb;
234 	if (unp) {
235 		error = unp_connect(msg->base.nm_so,
236 				    msg->connect.nm_nam,
237 				    msg->connect.nm_td);
238 	} else {
239 		error = EINVAL;
240 	}
241 	lwkt_reltoken(&unp_token);
242 	lwkt_replymsg(&msg->lmsg, error);
243 }
244 
245 static void
246 uipc_connect2(netmsg_t msg)
247 {
248 	struct unpcb *unp;
249 	int error;
250 
251 	lwkt_gettoken(&unp_token);
252 	unp = msg->connect2.nm_so1->so_pcb;
253 	if (unp) {
254 		error = unp_connect2(msg->connect2.nm_so1,
255 				     msg->connect2.nm_so2);
256 	} else {
257 		error = EINVAL;
258 	}
259 	lwkt_reltoken(&unp_token);
260 	lwkt_replymsg(&msg->lmsg, error);
261 }
262 
263 /* control is EOPNOTSUPP */
264 
265 static void
266 uipc_detach(netmsg_t msg)
267 {
268 	struct unpcb *unp;
269 	int error;
270 
271 	lwkt_gettoken(&unp_token);
272 	unp = msg->base.nm_so->so_pcb;
273 	if (unp) {
274 		unp_free(unp);
275 		error = 0;
276 	} else {
277 		error = EINVAL;
278 	}
279 	lwkt_reltoken(&unp_token);
280 	lwkt_replymsg(&msg->lmsg, error);
281 }
282 
283 static void
284 uipc_disconnect(netmsg_t msg)
285 {
286 	struct unpcb *unp;
287 	int error;
288 
289 	lwkt_gettoken(&unp_token);
290 	unp = msg->base.nm_so->so_pcb;
291 	if (unp) {
292 		unp_disconnect(unp);
293 		error = 0;
294 	} else {
295 		error = EINVAL;
296 	}
297 	lwkt_reltoken(&unp_token);
298 	lwkt_replymsg(&msg->lmsg, error);
299 }
300 
301 static void
302 uipc_listen(netmsg_t msg)
303 {
304 	struct unpcb *unp;
305 	int error;
306 
307 	lwkt_gettoken(&unp_token);
308 	unp = msg->base.nm_so->so_pcb;
309 	if (unp == NULL || unp->unp_vnode == NULL)
310 		error = EINVAL;
311 	else
312 		error = unp_listen(unp, msg->listen.nm_td);
313 	lwkt_reltoken(&unp_token);
314 	lwkt_replymsg(&msg->lmsg, error);
315 }
316 
317 static void
318 uipc_peeraddr(netmsg_t msg)
319 {
320 	struct unpcb *unp;
321 	int error;
322 
323 	lwkt_gettoken(&unp_token);
324 	unp = msg->base.nm_so->so_pcb;
325 	if (unp == NULL) {
326 		error = EINVAL;
327 	} else if (unp->unp_conn && unp->unp_conn->unp_addr) {
328 		struct unpcb *unp2 = unp->unp_conn;
329 
330 		unp_reference(unp2);
331 		*msg->peeraddr.nm_nam = dup_sockaddr(
332 				(struct sockaddr *)unp2->unp_addr);
333 		unp_free(unp2);
334 		error = 0;
335 	} else {
336 		/*
337 		 * XXX: It seems that this test always fails even when
338 		 * connection is established.  So, this else clause is
339 		 * added as workaround to return PF_LOCAL sockaddr.
340 		 */
341 		*msg->peeraddr.nm_nam = dup_sockaddr(
342 				(struct sockaddr *)&sun_noname);
343 		error = 0;
344 	}
345 	lwkt_reltoken(&unp_token);
346 	lwkt_replymsg(&msg->lmsg, error);
347 }
348 
349 static void
350 uipc_rcvd(netmsg_t msg)
351 {
352 	struct unpcb *unp, *unp2;
353 	struct socket *so;
354 	struct socket *so2;
355 	int error;
356 
357 	lwkt_gettoken(&unp_token);
358 	so = msg->base.nm_so;
359 	unp = so->so_pcb;
360 	if (unp == NULL) {
361 		error = EINVAL;
362 		goto done;
363 	}
364 
365 	switch (so->so_type) {
366 	case SOCK_DGRAM:
367 		panic("uipc_rcvd DGRAM?");
368 		/*NOTREACHED*/
369 	case SOCK_STREAM:
370 	case SOCK_SEQPACKET:
371 		if (unp->unp_conn == NULL)
372 			break;
373 		unp2 = unp->unp_conn;
374 
375 		/*
376 		 * Because we are transfering mbufs directly to the
377 		 * peer socket we have to use SSB_STOP on the sender
378 		 * to prevent it from building up infinite mbufs.
379 		 */
380 		so2 = unp2->unp_socket;
381 		if (so->so_rcv.ssb_cc < so2->so_snd.ssb_hiwat &&
382 		    so->so_rcv.ssb_mbcnt < so2->so_snd.ssb_mbmax
383 		) {
384 			atomic_clear_int(&so2->so_snd.ssb_flags, SSB_STOP);
385 
386 			unp_reference(unp2);
387 			sowwakeup(so2);
388 			unp_free(unp2);
389 		}
390 		break;
391 	default:
392 		panic("uipc_rcvd unknown socktype");
393 		/*NOTREACHED*/
394 	}
395 	error = 0;
396 done:
397 	lwkt_reltoken(&unp_token);
398 	lwkt_replymsg(&msg->lmsg, error);
399 }
400 
401 /* pru_rcvoob is EOPNOTSUPP */
402 
403 static void
404 uipc_send(netmsg_t msg)
405 {
406 	struct unpcb *unp, *unp2;
407 	struct socket *so;
408 	struct socket *so2;
409 	struct mbuf *control;
410 	struct mbuf *m;
411 	int error = 0;
412 
413 	lwkt_gettoken(&unp_token);
414 	so = msg->base.nm_so;
415 	control = msg->send.nm_control;
416 	m = msg->send.nm_m;
417 	unp = so->so_pcb;
418 
419 	if (unp == NULL) {
420 		error = EINVAL;
421 		goto release;
422 	}
423 	if (msg->send.nm_flags & PRUS_OOB) {
424 		error = EOPNOTSUPP;
425 		goto release;
426 	}
427 
428 	if (control && (error = unp_internalize(control, msg->send.nm_td)))
429 		goto release;
430 
431 	switch (so->so_type) {
432 	case SOCK_DGRAM:
433 	{
434 		struct sockaddr *from;
435 
436 		if (msg->send.nm_addr) {
437 			if (unp->unp_conn) {
438 				error = EISCONN;
439 				break;
440 			}
441 			error = unp_connect(so,
442 					    msg->send.nm_addr,
443 					    msg->send.nm_td);
444 			if (error)
445 				break;
446 		} else {
447 			if (unp->unp_conn == NULL) {
448 				error = ENOTCONN;
449 				break;
450 			}
451 		}
452 		unp2 = unp->unp_conn;
453 		so2 = unp2->unp_socket;
454 		if (unp->unp_addr)
455 			from = (struct sockaddr *)unp->unp_addr;
456 		else
457 			from = &sun_noname;
458 
459 		unp_reference(unp2);
460 
461 		lwkt_gettoken(&so2->so_rcv.ssb_token);
462 		if (ssb_appendaddr(&so2->so_rcv, from, m, control)) {
463 			sorwakeup(so2);
464 			m = NULL;
465 			control = NULL;
466 		} else {
467 			error = ENOBUFS;
468 		}
469 		if (msg->send.nm_addr)
470 			unp_disconnect(unp);
471 		lwkt_reltoken(&so2->so_rcv.ssb_token);
472 
473 		unp_free(unp2);
474 		break;
475 	}
476 
477 	case SOCK_STREAM:
478 	case SOCK_SEQPACKET:
479 		/* Connect if not connected yet. */
480 		/*
481 		 * Note: A better implementation would complain
482 		 * if not equal to the peer's address.
483 		 */
484 		if (!(so->so_state & SS_ISCONNECTED)) {
485 			if (msg->send.nm_addr) {
486 				error = unp_connect(so,
487 						    msg->send.nm_addr,
488 						    msg->send.nm_td);
489 				if (error)
490 					break;	/* XXX */
491 			} else {
492 				error = ENOTCONN;
493 				break;
494 			}
495 		}
496 
497 		if (so->so_state & SS_CANTSENDMORE) {
498 			error = EPIPE;
499 			break;
500 		}
501 		if (unp->unp_conn == NULL)
502 			panic("uipc_send connected but no connection?");
503 		unp2 = unp->unp_conn;
504 		so2 = unp2->unp_socket;
505 
506 		unp_reference(unp2);
507 
508 		/*
509 		 * Send to paired receive port, and then reduce
510 		 * send buffer hiwater marks to maintain backpressure.
511 		 * Wake up readers.
512 		 */
513 		lwkt_gettoken(&so2->so_rcv.ssb_token);
514 		if (control) {
515 			if (ssb_appendcontrol(&so2->so_rcv, m, control)) {
516 				control = NULL;
517 				m = NULL;
518 			}
519 		} else if (so->so_type == SOCK_SEQPACKET) {
520 			sbappendrecord(&so2->so_rcv.sb, m);
521 			m = NULL;
522 		} else {
523 			sbappend(&so2->so_rcv.sb, m);
524 			m = NULL;
525 		}
526 
527 		/*
528 		 * Because we are transfering mbufs directly to the
529 		 * peer socket we have to use SSB_STOP on the sender
530 		 * to prevent it from building up infinite mbufs.
531 		 */
532 		if (so2->so_rcv.ssb_cc >= so->so_snd.ssb_hiwat ||
533 		    so2->so_rcv.ssb_mbcnt >= so->so_snd.ssb_mbmax
534 		) {
535 			atomic_set_int(&so->so_snd.ssb_flags, SSB_STOP);
536 		}
537 		lwkt_reltoken(&so2->so_rcv.ssb_token);
538 		sorwakeup(so2);
539 
540 		unp_free(unp2);
541 		break;
542 
543 	default:
544 		panic("uipc_send unknown socktype");
545 	}
546 
547 	/*
548 	 * SEND_EOF is equivalent to a SEND followed by a SHUTDOWN.
549 	 */
550 	if (msg->send.nm_flags & PRUS_EOF) {
551 		socantsendmore(so);
552 		unp_shutdown(unp);
553 	}
554 
555 	if (control && error != 0)
556 		unp_dispose(control);
557 
558 release:
559 	lwkt_reltoken(&unp_token);
560 
561 	if (control)
562 		m_freem(control);
563 	if (m)
564 		m_freem(m);
565 	lwkt_replymsg(&msg->lmsg, error);
566 }
567 
568 /*
569  * MPSAFE
570  */
571 static void
572 uipc_sense(netmsg_t msg)
573 {
574 	struct unpcb *unp;
575 	struct socket *so;
576 	struct stat *sb;
577 	int error;
578 
579 	lwkt_gettoken(&unp_token);
580 	so = msg->base.nm_so;
581 	sb = msg->sense.nm_stat;
582 	unp = so->so_pcb;
583 	if (unp == NULL) {
584 		error = EINVAL;
585 		goto done;
586 	}
587 	sb->st_blksize = so->so_snd.ssb_hiwat;
588 	sb->st_dev = NOUDEV;
589 	if (unp->unp_ino == 0) {	/* make up a non-zero inode number */
590 		spin_lock(&unp_ino_spin);
591 		unp->unp_ino = unp_ino++;
592 		spin_unlock(&unp_ino_spin);
593 	}
594 	sb->st_ino = unp->unp_ino;
595 	error = 0;
596 done:
597 	lwkt_reltoken(&unp_token);
598 	lwkt_replymsg(&msg->lmsg, error);
599 }
600 
601 static void
602 uipc_shutdown(netmsg_t msg)
603 {
604 	struct socket *so;
605 	struct unpcb *unp;
606 	int error;
607 
608 	lwkt_gettoken(&unp_token);
609 	so = msg->base.nm_so;
610 	unp = so->so_pcb;
611 	if (unp) {
612 		socantsendmore(so);
613 		unp_shutdown(unp);
614 		error = 0;
615 	} else {
616 		error = EINVAL;
617 	}
618 	lwkt_reltoken(&unp_token);
619 	lwkt_replymsg(&msg->lmsg, error);
620 }
621 
622 static void
623 uipc_sockaddr(netmsg_t msg)
624 {
625 	struct unpcb *unp;
626 	int error;
627 
628 	lwkt_gettoken(&unp_token);
629 	unp = msg->base.nm_so->so_pcb;
630 	if (unp) {
631 		if (unp->unp_addr) {
632 			*msg->sockaddr.nm_nam =
633 				dup_sockaddr((struct sockaddr *)unp->unp_addr);
634 		}
635 		error = 0;
636 	} else {
637 		error = EINVAL;
638 	}
639 	lwkt_reltoken(&unp_token);
640 	lwkt_replymsg(&msg->lmsg, error);
641 }
642 
/*
 * User-request dispatch table for the unix domain.  pru_control and
 * pru_rcvoob are not supported and point at pr_generic_notsupp; the
 * sosend/soreceive slots use the generic socket-layer routines.
 */
struct pr_usrreqs uipc_usrreqs = {
	.pru_abort = uipc_abort,
	.pru_accept = uipc_accept,
	.pru_attach = uipc_attach,
	.pru_bind = uipc_bind,
	.pru_connect = uipc_connect,
	.pru_connect2 = uipc_connect2,
	.pru_control = pr_generic_notsupp,
	.pru_detach = uipc_detach,
	.pru_disconnect = uipc_disconnect,
	.pru_listen = uipc_listen,
	.pru_peeraddr = uipc_peeraddr,
	.pru_rcvd = uipc_rcvd,
	.pru_rcvoob = pr_generic_notsupp,
	.pru_send = uipc_send,
	.pru_sense = uipc_sense,
	.pru_shutdown = uipc_shutdown,
	.pru_sockaddr = uipc_sockaddr,
	.pru_sosend = sosend,
	.pru_soreceive = soreceive
};
664 
665 void
666 uipc_ctloutput(netmsg_t msg)
667 {
668 	struct socket *so;
669 	struct sockopt *sopt;
670 	struct unpcb *unp;
671 	int error = 0;
672 
673 	lwkt_gettoken(&unp_token);
674 	so = msg->base.nm_so;
675 	sopt = msg->ctloutput.nm_sopt;
676 	unp = so->so_pcb;
677 
678 	switch (sopt->sopt_dir) {
679 	case SOPT_GET:
680 		switch (sopt->sopt_name) {
681 		case LOCAL_PEERCRED:
682 			if (unp->unp_flags & UNP_HAVEPC)
683 				soopt_from_kbuf(sopt, &unp->unp_peercred,
684 						sizeof(unp->unp_peercred));
685 			else {
686 				if (so->so_type == SOCK_STREAM)
687 					error = ENOTCONN;
688 				else if (so->so_type == SOCK_SEQPACKET)
689 					error = ENOTCONN;
690 				else
691 					error = EINVAL;
692 			}
693 			break;
694 		default:
695 			error = EOPNOTSUPP;
696 			break;
697 		}
698 		break;
699 	case SOPT_SET:
700 	default:
701 		error = EOPNOTSUPP;
702 		break;
703 	}
704 	lwkt_reltoken(&unp_token);
705 	lwkt_replymsg(&msg->lmsg, error);
706 }
707 
708 /*
709  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
710  * for stream sockets, although the total for sender and receiver is
711  * actually only PIPSIZ.
712  *
713  * Datagram sockets really use the sendspace as the maximum datagram size,
714  * and don't really want to reserve the sendspace.  Their recvspace should
715  * be large enough for at least one max-size datagram plus address.
716  *
 * We want the local send/recv space to be significantly larger than
 * lo0's mtu of 16384.
719  */
#ifndef PIPSIZ
#define	PIPSIZ	57344
#endif
/* default buffer reservations handed to soreserve() by unp_attach() */
static u_long	unpst_sendspace = PIPSIZ;
static u_long	unpst_recvspace = PIPSIZ;
static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
static u_long	unpdg_recvspace = 4*1024;

static int	unp_rights;			/* file descriptors in flight */
static struct spinlock unp_spin = SPINLOCK_INITIALIZER(&unp_spin);

/*
 * NOTE(review): the four *space variables above are u_long but are
 * exported through SYSCTL_INT below - confirm the handler's access
 * width is what is intended on platforms where long is wider than int.
 */
SYSCTL_DECL(_net_local_seqpacket);
SYSCTL_DECL(_net_local_stream);
SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
    &unpst_sendspace, 0, "Size of stream socket send buffer");
SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
    &unpst_recvspace, 0, "Size of stream socket receive buffer");

SYSCTL_DECL(_net_local_dgram);
SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
    &unpdg_sendspace, 0, "Max datagram socket size");
SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
    &unpdg_recvspace, 0, "Size of datagram socket receive buffer");

SYSCTL_DECL(_net_local);
SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
   "File descriptors in flight");
747 
748 static int
749 unp_attach(struct socket *so, struct pru_attach_info *ai)
750 {
751 	struct unpcb *unp;
752 	int error;
753 
754 	lwkt_gettoken(&unp_token);
755 
756 	if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) {
757 		switch (so->so_type) {
758 
759 		case SOCK_STREAM:
760 		case SOCK_SEQPACKET:
761 			error = soreserve(so, unpst_sendspace, unpst_recvspace,
762 					  ai->sb_rlimit);
763 			break;
764 
765 		case SOCK_DGRAM:
766 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace,
767 					  ai->sb_rlimit);
768 			break;
769 
770 		default:
771 			panic("unp_attach");
772 		}
773 		if (error)
774 			goto failed;
775 	}
776 	unp = kmalloc(sizeof(*unp), M_UNPCB, M_WAITOK | M_ZERO | M_NULLOK);
777 	if (unp == NULL) {
778 		error = ENOBUFS;
779 		goto failed;
780 	}
781 	unp->unp_refcnt = 1;
782 	unp->unp_gencnt = ++unp_gencnt;
783 	unp_count++;
784 	LIST_INIT(&unp->unp_refs);
785 	unp->unp_socket = so;
786 	unp->unp_rvnode = ai->fd_rdir;		/* jail cruft XXX JH */
787 	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
788 			 : &unp_shead, unp, unp_link);
789 	so->so_pcb = (caddr_t)unp;
790 	soreference(so);
791 	error = 0;
792 failed:
793 	lwkt_reltoken(&unp_token);
794 	return error;
795 }
796 
/*
 * Tear down a pcb and free it.  Called from unp_free() when the last
 * reference is dropped.
 *
 * The pcb is unlinked from its list, its vnode association (if any) is
 * dissolved, any connection is broken, every pcb still listed on
 * unp_refs is dropped with ECONNRESET, and the socket is detached from
 * the pcb.  If any rights are in flight the receive buffer is flushed
 * and the garbage collector is run before the socket is released.
 */
static void
unp_detach(struct unpcb *unp)
{
	struct socket *so;

	lwkt_gettoken(&unp_token);

	/* unlink from the pcb list and record the change in generation */
	LIST_REMOVE(unp, unp_link);
	unp->unp_gencnt = ++unp_gencnt;
	--unp_count;
	/* dissolve the vnode <-> socket association made by unp_bind() */
	if (unp->unp_vnode) {
		unp->unp_vnode->v_socket = NULL;
		vrele(unp->unp_vnode);
		unp->unp_vnode = NULL;
	}
	if (unp->unp_conn)
		unp_disconnect(unp);
	/*
	 * Datagram pcbs connected to us are linked on unp_refs by
	 * unp_connect2(); dropping one disconnects it, which removes it
	 * from the list, so the loop makes progress.
	 */
	while (!LIST_EMPTY(&unp->unp_refs))
		unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET);
	soisdisconnected(unp->unp_socket);
	so = unp->unp_socket;
	soreference(so);	/* for delayed sorflush */
	so->so_pcb = NULL;
	unp->unp_socket = NULL;
	sofree(so);		/* remove pcb ref */

	if (unp_rights) {
		/*
		 * Normally the receive buffer is flushed later,
		 * in sofree, but if our receive buffer holds references
		 * to descriptors that are now garbage, we will dispose
		 * of those descriptor references after the garbage collector
		 * gets them (resulting in a "panic: closef: count < 0").
		 */
		sorflush(so);
		unp_gc();
	}
	/* drop the temporary reference taken above */
	sofree(so);
	lwkt_reltoken(&unp_token);

	if (unp->unp_addr)
		kfree(unp->unp_addr, M_SONAME);
	kfree(unp, M_UNPCB);
}
841 
/*
 * Bind a pcb to a filesystem path by creating a VSOCK node there.
 *
 * Returns 0 on success; EINVAL if the pcb is already bound or the
 * address carries no path; EADDRINUSE if the path already resolves to
 * a vnode; otherwise the error from the lookup or the create.
 */
static int
unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct vattr vattr;
	int error, namelen;
	struct nlookupdata nd;
	char buf[SOCK_MAXADDRLEN];

	lwkt_gettoken(&unp_token);
	if (unp->unp_vnode != NULL) {
		error = EINVAL;
		goto failed;
	}
	/* path length is whatever sun_len says follows the header */
	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0) {
		error = EINVAL;
		goto failed;
	}
	/*
	 * NOTE(review): buf[namelen] is in bounds only while
	 * namelen < SOCK_MAXADDRLEN - presumably guaranteed by the
	 * range of sun_len; confirm.
	 */
	strncpy(buf, soun->sun_path, namelen);
	buf[namelen] = 0;	/* null-terminate the string */
	error = nlookup_init(&nd, buf, UIO_SYSSPACE,
			     NLC_LOCKVP | NLC_CREATE | NLC_REFDVP);
	if (error == 0)
		error = nlookup(&nd);
	/* the name must not exist yet */
	if (error == 0 && nd.nl_nch.ncp->nc_vp != NULL)
		error = EADDRINUSE;
	if (error)
		goto done;

	/* create the socket node, honoring the process file mode mask */
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = (ACCESSPERMS & ~p->p_fd->fd_cmask);
	error = VOP_NCREATE(&nd.nl_nch, nd.nl_dvp, &vp, nd.nl_cred, &vattr);
	if (error == 0) {
		/* tie vnode and socket together and remember our name */
		vp->v_socket = unp->unp_socket;
		unp->unp_vnode = vp;
		unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam);
		vn_unlock(vp);
	}
done:
	nlookup_done(&nd);
failed:
	lwkt_reltoken(&unp_token);
	return (error);
}
890 
/*
 * Connect a socket to the socket bound at the given filesystem path.
 *
 * For protocols flagged PR_CONNREQUIRED the target must be listening;
 * a new socket is spawned from it with sonewconn() and the connection
 * is made to that socket, with peer credentials recorded on both ends.
 * Otherwise the connection is made to the bound socket itself.
 *
 * Returns 0 on success; EINVAL if the address carries no path;
 * ENOTSOCK if the path is not a socket node; ECONNREFUSED if no socket
 * is attached to the node, it is not accepting connections, or a new
 * connection socket cannot be created; EPROTOTYPE on a socket type
 * mismatch; otherwise the error from the lookup, the access check or
 * unp_connect2().
 */
static int
unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	int error, len;
	struct nlookupdata nd;
	char buf[SOCK_MAXADDRLEN];

	lwkt_gettoken(&unp_token);

	/* path length is whatever sa_len says follows the header */
	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	if (len <= 0) {
		error = EINVAL;
		goto failed;
	}
	strncpy(buf, soun->sun_path, len);
	buf[len] = 0;

	/* resolve the path to an exclusively locked vnode */
	vp = NULL;
	error = nlookup_init(&nd, buf, UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	nlookup_done(&nd);
	if (error)
		goto failed;

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	/* the caller needs write permission on the socket node */
	error = VOP_EACCESS(vp, VWRITE, p->p_ucred);
	if (error)
		goto bad;
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if (!(so2->so_options & SO_ACCEPTCONN) ||
		    (so3 = sonewconn(so2, 0)) == NULL) {
			error = ECONNREFUSED;
			goto bad;
		}
		unp = so->so_pcb;
		unp2 = so2->so_pcb;
		unp3 = so3->so_pcb;
		/* the new connection socket inherits the listener's name */
		if (unp2->unp_addr)
			unp3->unp_addr = (struct sockaddr_un *)
				dup_sockaddr((struct sockaddr *)unp2->unp_addr);

		/*
		 * unp_peercred management:
		 *
		 * The connecter's (client's) credentials are copied
		 * from its process structure at the time of connect()
		 * (which is now).
		 */
		cru2x(p->p_ucred, &unp3->unp_peercred);
		unp3->unp_flags |= UNP_HAVEPC;
		/*
		 * The receiver's (server's) credentials are copied
		 * from the unp_peercred member of socket on which the
		 * former called listen(); unp_listen() cached that
		 * process's credentials at that time so we can use
		 * them now.
		 */
		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
		    ("unp_connect: listener without cached peercred"));
		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
		    sizeof(unp->unp_peercred));
		unp->unp_flags |= UNP_HAVEPC;

		/* connect to the freshly created socket, not the listener */
		so2 = so3;
	}
	error = unp_connect2(so, so2);
bad:
	vput(vp);
failed:
	lwkt_reltoken(&unp_token);
	return (error);
}
983 
984 int
985 unp_connect2(struct socket *so, struct socket *so2)
986 {
987 	struct unpcb *unp;
988 	struct unpcb *unp2;
989 
990 	lwkt_gettoken(&unp_token);
991 	unp = so->so_pcb;
992 	if (so2->so_type != so->so_type) {
993 		lwkt_reltoken(&unp_token);
994 		return (EPROTOTYPE);
995 	}
996 	unp2 = so2->so_pcb;
997 	unp->unp_conn = unp2;
998 
999 	switch (so->so_type) {
1000 	case SOCK_DGRAM:
1001 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
1002 		soisconnected(so);
1003 		break;
1004 
1005 	case SOCK_STREAM:
1006 	case SOCK_SEQPACKET:
1007 		unp2->unp_conn = unp;
1008 		soisconnected(so);
1009 		soisconnected(so2);
1010 		break;
1011 
1012 	default:
1013 		panic("unp_connect2");
1014 	}
1015 	lwkt_reltoken(&unp_token);
1016 	return (0);
1017 }
1018 
1019 static void
1020 unp_disconnect(struct unpcb *unp)
1021 {
1022 	struct unpcb *unp2;
1023 
1024 	lwkt_gettoken(&unp_token);
1025 
1026 	unp2 = unp->unp_conn;
1027 	if (unp2 == NULL) {
1028 		lwkt_reltoken(&unp_token);
1029 		return;
1030 	}
1031 
1032 	unp->unp_conn = NULL;
1033 
1034 	switch (unp->unp_socket->so_type) {
1035 	case SOCK_DGRAM:
1036 		LIST_REMOVE(unp, unp_reflink);
1037 		soclrstate(unp->unp_socket, SS_ISCONNECTED);
1038 		break;
1039 
1040 	case SOCK_STREAM:
1041 	case SOCK_SEQPACKET:
1042 		unp_reference(unp2);
1043 		unp2->unp_conn = NULL;
1044 
1045 		soisdisconnected(unp->unp_socket);
1046 		soisdisconnected(unp2->unp_socket);
1047 
1048 		unp_free(unp2);
1049 		break;
1050 	}
1051 	lwkt_reltoken(&unp_token);
1052 }
1053 
#ifdef notdef
/*
 * Compiled out unless "notdef" is defined.  Would release one
 * reference on the pcb while holding unp_token.
 */
void
unp_abort(struct unpcb *unp)
{
	lwkt_gettoken(&unp_token);
	unp_free(unp);
	lwkt_reltoken(&unp_token);
}
#endif
1063 
1064 static int
1065 prison_unpcb(struct thread *td, struct unpcb *unp)
1066 {
1067 	struct proc *p;
1068 
1069 	if (td == NULL)
1070 		return (0);
1071 	if ((p = td->td_proc) == NULL)
1072 		return (0);
1073 	if (!p->p_ucred->cr_prison)
1074 		return (0);
1075 	if (p->p_fd->fd_rdir == unp->unp_rvnode)
1076 		return (0);
1077 	return (1);
1078 }
1079 
1080 static int
1081 unp_pcblist(SYSCTL_HANDLER_ARGS)
1082 {
1083 	int error, i, n;
1084 	struct unpcb *unp, **unp_list;
1085 	unp_gen_t gencnt;
1086 	struct unp_head *head;
1087 
1088 	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
1089 
1090 	KKASSERT(curproc != NULL);
1091 
1092 	/*
1093 	 * The process of preparing the PCB list is too time-consuming and
1094 	 * resource-intensive to repeat twice on every request.
1095 	 */
1096 	if (req->oldptr == NULL) {
1097 		n = unp_count;
1098 		req->oldidx = (n + n/8) * sizeof(struct xunpcb);
1099 		return 0;
1100 	}
1101 
1102 	if (req->newptr != NULL)
1103 		return EPERM;
1104 
1105 	lwkt_gettoken(&unp_token);
1106 
1107 	/*
1108 	 * OK, now we're committed to doing something.
1109 	 */
1110 	gencnt = unp_gencnt;
1111 	n = unp_count;
1112 
1113 	unp_list = kmalloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
1114 
1115 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
1116 	     unp = LIST_NEXT(unp, unp_link)) {
1117 		if (unp->unp_gencnt <= gencnt && !prison_unpcb(req->td, unp))
1118 			unp_list[i++] = unp;
1119 	}
1120 	n = i;			/* in case we lost some during malloc */
1121 
1122 	error = 0;
1123 	for (i = 0; i < n; i++) {
1124 		unp = unp_list[i];
1125 		if (unp->unp_gencnt <= gencnt) {
1126 			struct xunpcb xu;
1127 			xu.xu_len = sizeof xu;
1128 			xu.xu_unpp = unp;
1129 			/*
1130 			 * XXX - need more locking here to protect against
1131 			 * connect/disconnect races for SMP.
1132 			 */
1133 			if (unp->unp_addr)
1134 				bcopy(unp->unp_addr, &xu.xu_addr,
1135 				      unp->unp_addr->sun_len);
1136 			if (unp->unp_conn && unp->unp_conn->unp_addr)
1137 				bcopy(unp->unp_conn->unp_addr,
1138 				      &xu.xu_caddr,
1139 				      unp->unp_conn->unp_addr->sun_len);
1140 			bcopy(unp, &xu.xu_unp, sizeof *unp);
1141 			sotoxsocket(unp->unp_socket, &xu.xu_socket);
1142 			error = SYSCTL_OUT(req, &xu, sizeof xu);
1143 		}
1144 	}
1145 	lwkt_reltoken(&unp_token);
1146 	kfree(unp_list, M_TEMP);
1147 
1148 	return error;
1149 }
1150 
/*
 * Read-only pcblist nodes, one per socket type.  The socket type is
 * passed to unp_pcblist() as arg1.
 */
SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local datagram sockets");
SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local stream sockets");
SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
	    "List of active local seqpacket stream sockets");
1160 
1161 static void
1162 unp_shutdown(struct unpcb *unp)
1163 {
1164 	struct socket *so;
1165 
1166 	if ((unp->unp_socket->so_type == SOCK_STREAM ||
1167 	     unp->unp_socket->so_type == SOCK_SEQPACKET) &&
1168 	    unp->unp_conn != NULL && (so = unp->unp_conn->unp_socket)) {
1169 		socantrcvmore(so);
1170 	}
1171 }
1172 
1173 static void
1174 unp_drop(struct unpcb *unp, int err)
1175 {
1176 	struct socket *so = unp->unp_socket;
1177 
1178 	so->so_error = err;
1179 	unp_disconnect(unp);
1180 }
1181 
#ifdef notdef
/*
 * Compiled out (guarded by "notdef").  As written, this only acquires
 * unp_token and immediately releases it again.
 */
void
unp_drain(void)
{
	lwkt_gettoken(&unp_token);
	lwkt_reltoken(&unp_token);
}
#endif
1190 
1191 int
1192 unp_externalize(struct mbuf *rights)
1193 {
1194 	struct thread *td = curthread;
1195 	struct proc *p = td->td_proc;		/* XXX */
1196 	struct lwp *lp = td->td_lwp;
1197 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
1198 	int *fdp;
1199 	int i;
1200 	struct file **rp;
1201 	struct file *fp;
1202 	int newfds = (cm->cmsg_len - (CMSG_DATA(cm) - (u_char *)cm))
1203 		/ sizeof (struct file *);
1204 	int f;
1205 
1206 	lwkt_gettoken(&unp_token);
1207 
1208 	/*
1209 	 * if the new FD's will not fit, then we free them all
1210 	 */
1211 	if (!fdavail(p, newfds)) {
1212 		rp = (struct file **)CMSG_DATA(cm);
1213 		for (i = 0; i < newfds; i++) {
1214 			fp = *rp;
1215 			/*
1216 			 * zero the pointer before calling unp_discard,
1217 			 * since it may end up in unp_gc()..
1218 			 */
1219 			*rp++ = 0;
1220 			unp_discard(fp, NULL);
1221 		}
1222 		lwkt_reltoken(&unp_token);
1223 		return (EMSGSIZE);
1224 	}
1225 
1226 	/*
1227 	 * now change each pointer to an fd in the global table to
1228 	 * an integer that is the index to the local fd table entry
1229 	 * that we set up to point to the global one we are transferring.
1230 	 * If sizeof (struct file *) is bigger than or equal to sizeof int,
1231 	 * then do it in forward order. In that case, an integer will
1232 	 * always come in the same place or before its corresponding
1233 	 * struct file pointer.
1234 	 * If sizeof (struct file *) is smaller than sizeof int, then
1235 	 * do it in reverse order.
1236 	 */
1237 	if (sizeof (struct file *) >= sizeof (int)) {
1238 		fdp = (int *)CMSG_DATA(cm);
1239 		rp = (struct file **)CMSG_DATA(cm);
1240 		for (i = 0; i < newfds; i++) {
1241 			if (fdalloc(p, 0, &f))
1242 				panic("unp_externalize");
1243 			fp = *rp++;
1244 			unp_fp_externalize(lp, fp, f);
1245 			*fdp++ = f;
1246 		}
1247 	} else {
1248 		fdp = (int *)CMSG_DATA(cm) + newfds - 1;
1249 		rp = (struct file **)CMSG_DATA(cm) + newfds - 1;
1250 		for (i = 0; i < newfds; i++) {
1251 			if (fdalloc(p, 0, &f))
1252 				panic("unp_externalize");
1253 			fp = *rp--;
1254 			unp_fp_externalize(lp, fp, f);
1255 			*fdp-- = f;
1256 		}
1257 	}
1258 
1259 	/*
1260 	 * Adjust length, in case sizeof(struct file *) and sizeof(int)
1261 	 * differs.
1262 	 */
1263 	cm->cmsg_len = CMSG_LEN(newfds * sizeof(int));
1264 	rights->m_len = cm->cmsg_len;
1265 
1266 	lwkt_reltoken(&unp_token);
1267 	return (0);
1268 }
1269 
1270 static void
1271 unp_fp_externalize(struct lwp *lp, struct file *fp, int fd)
1272 {
1273 	struct file *fx;
1274 	int error;
1275 
1276 	lwkt_gettoken(&unp_token);
1277 
1278 	if (lp) {
1279 		KKASSERT(fd >= 0);
1280 		if (fp->f_flag & FREVOKED) {
1281 			kprintf("Warning: revoked fp exiting unix socket\n");
1282 			fx = NULL;
1283 			error = falloc(lp, &fx, NULL);
1284 			if (error == 0)
1285 				fsetfd(lp->lwp_proc->p_fd, fx, fd);
1286 			else
1287 				fsetfd(lp->lwp_proc->p_fd, NULL, fd);
1288 			fdrop(fx);
1289 		} else {
1290 			fsetfd(lp->lwp_proc->p_fd, fp, fd);
1291 		}
1292 	}
1293 	spin_lock(&unp_spin);
1294 	fp->f_msgcount--;
1295 	unp_rights--;
1296 	spin_unlock(&unp_spin);
1297 	fdrop(fp);
1298 
1299 	lwkt_reltoken(&unp_token);
1300 }
1301 
1302 
1303 void
1304 unp_init(void)
1305 {
1306 	LIST_INIT(&unp_dhead);
1307 	LIST_INIT(&unp_shead);
1308 	spin_init(&unp_spin);
1309 }
1310 
1311 static int
1312 unp_internalize(struct mbuf *control, struct thread *td)
1313 {
1314 	struct proc *p = td->td_proc;
1315 	struct filedesc *fdescp;
1316 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1317 	struct file **rp;
1318 	struct file *fp;
1319 	int i, fd, *fdp;
1320 	struct cmsgcred *cmcred;
1321 	int oldfds;
1322 	u_int newlen;
1323 	int error;
1324 
1325 	KKASSERT(p);
1326 	lwkt_gettoken(&unp_token);
1327 
1328 	fdescp = p->p_fd;
1329 	if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) ||
1330 	    cm->cmsg_level != SOL_SOCKET ||
1331 	    CMSG_ALIGN(cm->cmsg_len) != control->m_len) {
1332 		error = EINVAL;
1333 		goto done;
1334 	}
1335 
1336 	/*
1337 	 * Fill in credential information.
1338 	 */
1339 	if (cm->cmsg_type == SCM_CREDS) {
1340 		cmcred = (struct cmsgcred *)CMSG_DATA(cm);
1341 		cmcred->cmcred_pid = p->p_pid;
1342 		cmcred->cmcred_uid = p->p_ucred->cr_ruid;
1343 		cmcred->cmcred_gid = p->p_ucred->cr_rgid;
1344 		cmcred->cmcred_euid = p->p_ucred->cr_uid;
1345 		cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
1346 							CMGROUP_MAX);
1347 		for (i = 0; i < cmcred->cmcred_ngroups; i++)
1348 			cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
1349 		error = 0;
1350 		goto done;
1351 	}
1352 
1353 	/*
1354 	 * cmsghdr may not be aligned, do not allow calculation(s) to
1355 	 * go negative.
1356 	 */
1357 	if (cm->cmsg_len < CMSG_LEN(0)) {
1358 		error = EINVAL;
1359 		goto done;
1360 	}
1361 
1362 	oldfds = (cm->cmsg_len - CMSG_LEN(0)) / sizeof (int);
1363 
1364 	/*
1365 	 * check that all the FDs passed in refer to legal OPEN files
1366 	 * If not, reject the entire operation.
1367 	 */
1368 	fdp = (int *)CMSG_DATA(cm);
1369 	for (i = 0; i < oldfds; i++) {
1370 		fd = *fdp++;
1371 		if ((unsigned)fd >= fdescp->fd_nfiles ||
1372 		    fdescp->fd_files[fd].fp == NULL) {
1373 			error = EBADF;
1374 			goto done;
1375 		}
1376 		if (fdescp->fd_files[fd].fp->f_type == DTYPE_KQUEUE) {
1377 			error = EOPNOTSUPP;
1378 			goto done;
1379 		}
1380 	}
1381 	/*
1382 	 * Now replace the integer FDs with pointers to
1383 	 * the associated global file table entry..
1384 	 * Allocate a bigger buffer as necessary. But if an cluster is not
1385 	 * enough, return E2BIG.
1386 	 */
1387 	newlen = CMSG_LEN(oldfds * sizeof(struct file *));
1388 	if (newlen > MCLBYTES) {
1389 		error = E2BIG;
1390 		goto done;
1391 	}
1392 	if (newlen - control->m_len > M_TRAILINGSPACE(control)) {
1393 		if (control->m_flags & M_EXT) {
1394 			error = E2BIG;
1395 			goto done;
1396 		}
1397 		MCLGET(control, MB_WAIT);
1398 		if (!(control->m_flags & M_EXT)) {
1399 			error = ENOBUFS;
1400 			goto done;
1401 		}
1402 
1403 		/* copy the data to the cluster */
1404 		memcpy(mtod(control, char *), cm, cm->cmsg_len);
1405 		cm = mtod(control, struct cmsghdr *);
1406 	}
1407 
1408 	/*
1409 	 * Adjust length, in case sizeof(struct file *) and sizeof(int)
1410 	 * differs.
1411 	 */
1412 	cm->cmsg_len = newlen;
1413 	control->m_len = CMSG_ALIGN(newlen);
1414 
1415 	/*
1416 	 * Transform the file descriptors into struct file pointers.
1417 	 * If sizeof (struct file *) is bigger than or equal to sizeof int,
1418 	 * then do it in reverse order so that the int won't get until
1419 	 * we're done.
1420 	 * If sizeof (struct file *) is smaller than sizeof int, then
1421 	 * do it in forward order.
1422 	 */
1423 	if (sizeof (struct file *) >= sizeof (int)) {
1424 		fdp = (int *)CMSG_DATA(cm) + oldfds - 1;
1425 		rp = (struct file **)CMSG_DATA(cm) + oldfds - 1;
1426 		for (i = 0; i < oldfds; i++) {
1427 			fp = fdescp->fd_files[*fdp--].fp;
1428 			*rp-- = fp;
1429 			fhold(fp);
1430 			spin_lock(&unp_spin);
1431 			fp->f_msgcount++;
1432 			unp_rights++;
1433 			spin_unlock(&unp_spin);
1434 		}
1435 	} else {
1436 		fdp = (int *)CMSG_DATA(cm);
1437 		rp = (struct file **)CMSG_DATA(cm);
1438 		for (i = 0; i < oldfds; i++) {
1439 			fp = fdescp->fd_files[*fdp++].fp;
1440 			*rp++ = fp;
1441 			fhold(fp);
1442 			spin_lock(&unp_spin);
1443 			fp->f_msgcount++;
1444 			unp_rights++;
1445 			spin_unlock(&unp_spin);
1446 		}
1447 	}
1448 	error = 0;
1449 done:
1450 	lwkt_reltoken(&unp_token);
1451 	return error;
1452 }
1453 
1454 /*
1455  * Garbage collect in-transit file descriptors that get lost due to
1456  * loops (i.e. when a socket is sent to another process over itself,
1457  * and more complex situations).
1458  *
1459  * NOT MPSAFE - TODO socket flush code and maybe closef.  Rest is MPSAFE.
1460  */
1461 
/*
 * State shared between unp_gc() and its file scan callbacks.
 */
struct unp_gc_info {
	struct file **extra_ref;	/* files collected by unp_gc_checkrefs() */
	struct file *locked_fp;		/* file being examined by unp_gc_checkmarks() */
	int defer;			/* deferrals counted during one mark pass */
	int index;			/* entries currently used in extra_ref[] */
	int maxindex;			/* capacity of extra_ref[] */
};
1469 
1470 static void
1471 unp_gc(void)
1472 {
1473 	struct unp_gc_info info;
1474 	static boolean_t unp_gcing;
1475 	struct file **fpp;
1476 	int i;
1477 
1478 	/*
1479 	 * Only one gc can be in-progress at any given moment
1480 	 */
1481 	spin_lock(&unp_spin);
1482 	if (unp_gcing) {
1483 		spin_unlock(&unp_spin);
1484 		return;
1485 	}
1486 	unp_gcing = TRUE;
1487 	spin_unlock(&unp_spin);
1488 
1489 	lwkt_gettoken(&unp_token);
1490 
1491 	/*
1492 	 * Before going through all this, set all FDs to be NOT defered
1493 	 * and NOT externally accessible (not marked).  During the scan
1494 	 * a fd can be marked externally accessible but we may or may not
1495 	 * be able to immediately process it (controlled by FDEFER).
1496 	 *
1497 	 * If we loop sleep a bit.  The complexity of the topology can cause
1498 	 * multiple loops.  Also failure to acquire the socket's so_rcv
1499 	 * token can cause us to loop.
1500 	 */
1501 	allfiles_scan_exclusive(unp_gc_clearmarks, NULL);
1502 	do {
1503 		info.defer = 0;
1504 		allfiles_scan_exclusive(unp_gc_checkmarks, &info);
1505 		if (info.defer)
1506 			tsleep(&info, 0, "gcagain", 1);
1507 	} while (info.defer);
1508 
1509 	/*
1510 	 * We grab an extra reference to each of the file table entries
1511 	 * that are not otherwise accessible and then free the rights
1512 	 * that are stored in messages on them.
1513 	 *
1514 	 * The bug in the orginal code is a little tricky, so I'll describe
1515 	 * what's wrong with it here.
1516 	 *
1517 	 * It is incorrect to simply unp_discard each entry for f_msgcount
1518 	 * times -- consider the case of sockets A and B that contain
1519 	 * references to each other.  On a last close of some other socket,
1520 	 * we trigger a gc since the number of outstanding rights (unp_rights)
1521 	 * is non-zero.  If during the sweep phase the gc code un_discards,
1522 	 * we end up doing a (full) closef on the descriptor.  A closef on A
1523 	 * results in the following chain.  Closef calls soo_close, which
1524 	 * calls soclose.   Soclose calls first (through the switch
1525 	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
1526 	 * returns because the previous instance had set unp_gcing, and
1527 	 * we return all the way back to soclose, which marks the socket
1528 	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
1529 	 * to free up the rights that are queued in messages on the socket A,
1530 	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
1531 	 * switch unp_dispose, which unp_scans with unp_discard.  This second
1532 	 * instance of unp_discard just calls closef on B.
1533 	 *
1534 	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1535 	 * which results in another closef on A.  Unfortunately, A is already
1536 	 * being closed, and the descriptor has already been marked with
1537 	 * SS_NOFDREF, and soclose panics at this point.
1538 	 *
1539 	 * Here, we first take an extra reference to each inaccessible
1540 	 * descriptor.  Then, we call sorflush ourself, since we know
1541 	 * it is a Unix domain socket anyhow.  After we destroy all the
1542 	 * rights carried in messages, we do a last closef to get rid
1543 	 * of our extra reference.  This is the last close, and the
1544 	 * unp_detach etc will shut down the socket.
1545 	 *
1546 	 * 91/09/19, bsy@cs.cmu.edu
1547 	 */
1548 	info.extra_ref = kmalloc(256 * sizeof(struct file *), M_FILE, M_WAITOK);
1549 	info.maxindex = 256;
1550 
1551 	do {
1552 		/*
1553 		 * Look for matches
1554 		 */
1555 		info.index = 0;
1556 		allfiles_scan_exclusive(unp_gc_checkrefs, &info);
1557 
1558 		/*
1559 		 * For each FD on our hit list, do the following two things
1560 		 */
1561 		for (i = info.index, fpp = info.extra_ref; --i >= 0; ++fpp) {
1562 			struct file *tfp = *fpp;
1563 			if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL)
1564 				sorflush((struct socket *)(tfp->f_data));
1565 		}
1566 		for (i = info.index, fpp = info.extra_ref; --i >= 0; ++fpp)
1567 			closef(*fpp, NULL);
1568 	} while (info.index == info.maxindex);
1569 
1570 	lwkt_reltoken(&unp_token);
1571 
1572 	kfree((caddr_t)info.extra_ref, M_FILE);
1573 	unp_gcing = FALSE;
1574 }
1575 
1576 /*
1577  * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
1578  */
1579 static int
1580 unp_gc_checkrefs(struct file *fp, void *data)
1581 {
1582 	struct unp_gc_info *info = data;
1583 
1584 	if (fp->f_count == 0)
1585 		return(0);
1586 	if (info->index == info->maxindex)
1587 		return(-1);
1588 
1589 	/*
1590 	 * If all refs are from msgs, and it's not marked accessible
1591 	 * then it must be referenced from some unreachable cycle
1592 	 * of (shut-down) FDs, so include it in our
1593 	 * list of FDs to remove
1594 	 */
1595 	if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
1596 		info->extra_ref[info->index++] = fp;
1597 		fhold(fp);
1598 	}
1599 	return(0);
1600 }
1601 
/*
 * File scan callback: atomically clears both FMARK and FDEFER in
 * fp->f_flag.  The data argument is unused.  Always returns 0.
 *
 * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
 */
static int
unp_gc_clearmarks(struct file *fp, void *data __unused)
{
	atomic_clear_int(&fp->f_flag, FMARK | FDEFER);
	return(0);
}
1611 
1612 /*
1613  * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
1614  */
1615 static int
1616 unp_gc_checkmarks(struct file *fp, void *data)
1617 {
1618 	struct unp_gc_info *info = data;
1619 	struct socket *so;
1620 
1621 	/*
1622 	 * If the file is not open, skip it.  Make sure it isn't marked
1623 	 * defered or we could loop forever, in case we somehow race
1624 	 * something.
1625 	 */
1626 	if (fp->f_count == 0) {
1627 		if (fp->f_flag & FDEFER)
1628 			atomic_clear_int(&fp->f_flag, FDEFER);
1629 		return(0);
1630 	}
1631 	/*
1632 	 * If we already marked it as 'defer'  in a
1633 	 * previous pass, then try process it this time
1634 	 * and un-mark it
1635 	 */
1636 	if (fp->f_flag & FDEFER) {
1637 		atomic_clear_int(&fp->f_flag, FDEFER);
1638 	} else {
1639 		/*
1640 		 * if it's not defered, then check if it's
1641 		 * already marked.. if so skip it
1642 		 */
1643 		if (fp->f_flag & FMARK)
1644 			return(0);
1645 		/*
1646 		 * If all references are from messages
1647 		 * in transit, then skip it. it's not
1648 		 * externally accessible.
1649 		 */
1650 		if (fp->f_count == fp->f_msgcount)
1651 			return(0);
1652 		/*
1653 		 * If it got this far then it must be
1654 		 * externally accessible.
1655 		 */
1656 		atomic_set_int(&fp->f_flag, FMARK);
1657 	}
1658 
1659 	/*
1660 	 * either it was defered, or it is externally
1661 	 * accessible and not already marked so.
1662 	 * Now check if it is possibly one of OUR sockets.
1663 	 */
1664 	if (fp->f_type != DTYPE_SOCKET ||
1665 	    (so = (struct socket *)fp->f_data) == NULL) {
1666 		return(0);
1667 	}
1668 	if (so->so_proto->pr_domain != &localdomain ||
1669 	    !(so->so_proto->pr_flags & PR_RIGHTS)) {
1670 		return(0);
1671 	}
1672 
1673 	/*
1674 	 * So, Ok, it's one of our sockets and it IS externally accessible
1675 	 * (or was defered).  Now we look to see if we hold any file
1676 	 * descriptors in its message buffers.  Follow those links and mark
1677 	 * them as accessible too.
1678 	 *
1679 	 * We are holding multiple spinlocks here, if we cannot get the
1680 	 * token non-blocking defer until the next loop.
1681 	 */
1682 	info->locked_fp = fp;
1683 	if (lwkt_trytoken(&so->so_rcv.ssb_token)) {
1684 		unp_scan(so->so_rcv.ssb_mb, unp_mark, info);
1685 		lwkt_reltoken(&so->so_rcv.ssb_token);
1686 	} else {
1687 		atomic_set_int(&fp->f_flag, FDEFER);
1688 		++info->defer;
1689 	}
1690 	return (0);
1691 }
1692 
1693 /*
1694  * Scan all unix domain sockets and replace any revoked file pointers
1695  * found with the dummy file pointer fx.  We don't worry about races
1696  * against file pointers being read out as those are handled in the
1697  * externalize code.
1698  */
1699 
/* Number of revoked file pointers collected per scan pass. */
#define REVOKE_GC_MAXFILES	32

/*
 * State shared between unp_revoke_gc() and unp_revoke_gc_check().
 */
struct unp_revoke_gc_info {
	struct file	*fx;	/* replacement stored in place of revoked fps */
	struct file	*fary[REVOKE_GC_MAXFILES]; /* revoked fps removed from queues */
	int		fcount;	/* number of valid entries in fary[] */
};
1707 
1708 void
1709 unp_revoke_gc(struct file *fx)
1710 {
1711 	struct unp_revoke_gc_info info;
1712 	int i;
1713 
1714 	lwkt_gettoken(&unp_token);
1715 	info.fx = fx;
1716 	do {
1717 		info.fcount = 0;
1718 		allfiles_scan_exclusive(unp_revoke_gc_check, &info);
1719 		for (i = 0; i < info.fcount; ++i)
1720 			unp_fp_externalize(NULL, info.fary[i], -1);
1721 	} while (info.fcount == REVOKE_GC_MAXFILES);
1722 	lwkt_reltoken(&unp_token);
1723 }
1724 
1725 /*
1726  * Check for and replace revoked descriptors.
1727  *
1728  * WARNING:  This routine is not allowed to block.
1729  */
1730 static int
1731 unp_revoke_gc_check(struct file *fps, void *vinfo)
1732 {
1733 	struct unp_revoke_gc_info *info = vinfo;
1734 	struct file *fp;
1735 	struct socket *so;
1736 	struct mbuf *m0;
1737 	struct mbuf *m;
1738 	struct file **rp;
1739 	struct cmsghdr *cm;
1740 	int i;
1741 	int qfds;
1742 
1743 	/*
1744 	 * Is this a unix domain socket with rights-passing abilities?
1745 	 */
1746 	if (fps->f_type != DTYPE_SOCKET)
1747 		return (0);
1748 	if ((so = (struct socket *)fps->f_data) == NULL)
1749 		return(0);
1750 	if (so->so_proto->pr_domain != &localdomain)
1751 		return(0);
1752 	if ((so->so_proto->pr_flags & PR_RIGHTS) == 0)
1753 		return(0);
1754 
1755 	/*
1756 	 * Scan the mbufs for control messages and replace any revoked
1757 	 * descriptors we find.
1758 	 */
1759 	lwkt_gettoken(&so->so_rcv.ssb_token);
1760 	m0 = so->so_rcv.ssb_mb;
1761 	while (m0) {
1762 		for (m = m0; m; m = m->m_next) {
1763 			if (m->m_type != MT_CONTROL)
1764 				continue;
1765 			if (m->m_len < sizeof(*cm))
1766 				continue;
1767 			cm = mtod(m, struct cmsghdr *);
1768 			if (cm->cmsg_level != SOL_SOCKET ||
1769 			    cm->cmsg_type != SCM_RIGHTS) {
1770 				continue;
1771 			}
1772 			qfds = (cm->cmsg_len - CMSG_LEN(0)) / sizeof(void *);
1773 			rp = (struct file **)CMSG_DATA(cm);
1774 			for (i = 0; i < qfds; i++) {
1775 				fp = rp[i];
1776 				if (fp->f_flag & FREVOKED) {
1777 					kprintf("Warning: Removing revoked fp from unix domain socket queue\n");
1778 					fhold(info->fx);
1779 					info->fx->f_msgcount++;
1780 					unp_rights++;
1781 					rp[i] = info->fx;
1782 					info->fary[info->fcount++] = fp;
1783 				}
1784 				if (info->fcount == REVOKE_GC_MAXFILES)
1785 					break;
1786 			}
1787 			if (info->fcount == REVOKE_GC_MAXFILES)
1788 				break;
1789 		}
1790 		m0 = m0->m_nextpkt;
1791 		if (info->fcount == REVOKE_GC_MAXFILES)
1792 			break;
1793 	}
1794 	lwkt_reltoken(&so->so_rcv.ssb_token);
1795 
1796 	/*
1797 	 * Stop the scan if we filled up our array.
1798 	 */
1799 	if (info->fcount == REVOKE_GC_MAXFILES)
1800 		return(-1);
1801 	return(0);
1802 }
1803 
1804 /*
1805  * Dispose of the fp's stored in a mbuf.
1806  *
1807  * The dds loop can cause additional fps to be entered onto the
1808  * list while it is running, flattening out the operation and avoiding
1809  * a deep kernel stack recursion.
1810  */
1811 void
1812 unp_dispose(struct mbuf *m)
1813 {
1814 	unp_defdiscard_t dds;
1815 
1816 	lwkt_gettoken(&unp_token);
1817 	++unp_defdiscard_nest;
1818 	if (m) {
1819 		unp_scan(m, unp_discard, NULL);
1820 	}
1821 	if (unp_defdiscard_nest == 1) {
1822 		while ((dds = unp_defdiscard_base) != NULL) {
1823 			unp_defdiscard_base = dds->next;
1824 			closef(dds->fp, NULL);
1825 			kfree(dds, M_UNPCB);
1826 		}
1827 	}
1828 	--unp_defdiscard_nest;
1829 	lwkt_reltoken(&unp_token);
1830 }
1831 
1832 static int
1833 unp_listen(struct unpcb *unp, struct thread *td)
1834 {
1835 	struct proc *p = td->td_proc;
1836 
1837 	KKASSERT(p);
1838 	lwkt_gettoken(&unp_token);
1839 	cru2x(p->p_ucred, &unp->unp_peercred);
1840 	unp->unp_flags |= UNP_HAVEPCCACHED;
1841 	lwkt_reltoken(&unp_token);
1842 	return (0);
1843 }
1844 
1845 static void
1846 unp_scan(struct mbuf *m0, void (*op)(struct file *, void *), void *data)
1847 {
1848 	struct mbuf *m;
1849 	struct file **rp;
1850 	struct cmsghdr *cm;
1851 	int i;
1852 	int qfds;
1853 
1854 	while (m0) {
1855 		for (m = m0; m; m = m->m_next) {
1856 			if (m->m_type == MT_CONTROL &&
1857 			    m->m_len >= sizeof(*cm)) {
1858 				cm = mtod(m, struct cmsghdr *);
1859 				if (cm->cmsg_level != SOL_SOCKET ||
1860 				    cm->cmsg_type != SCM_RIGHTS)
1861 					continue;
1862 				qfds = (cm->cmsg_len - CMSG_LEN(0)) /
1863 					sizeof(void *);
1864 				rp = (struct file **)CMSG_DATA(cm);
1865 				for (i = 0; i < qfds; i++)
1866 					(*op)(*rp++, data);
1867 				break;		/* XXX, but saves time */
1868 			}
1869 		}
1870 		m0 = m0->m_nextpkt;
1871 	}
1872 }
1873 
1874 /*
1875  * Mark visibility.  info->defer is recalculated on every pass.
1876  */
1877 static void
1878 unp_mark(struct file *fp, void *data)
1879 {
1880 	struct unp_gc_info *info = data;
1881 
1882 	if ((fp->f_flag & FMARK) == 0) {
1883 		++info->defer;
1884 		atomic_set_int(&fp->f_flag, FMARK | FDEFER);
1885 	} else if (fp->f_flag & FDEFER) {
1886 		++info->defer;
1887 	}
1888 }
1889 
1890 /*
1891  * Discard a fp previously held in a unix domain socket mbuf.  To
1892  * avoid blowing out the kernel stack due to contrived chain-reactions
1893  * we may have to defer the operation to a higher procedural level.
1894  *
1895  * Caller holds unp_token
1896  */
1897 static void
1898 unp_discard(struct file *fp, void *data __unused)
1899 {
1900 	unp_defdiscard_t dds;
1901 
1902 	spin_lock(&unp_spin);
1903 	fp->f_msgcount--;
1904 	unp_rights--;
1905 	spin_unlock(&unp_spin);
1906 
1907 	if (unp_defdiscard_nest) {
1908 		dds = kmalloc(sizeof(*dds), M_UNPCB, M_WAITOK|M_ZERO);
1909 		dds->fp = fp;
1910 		dds->next = unp_defdiscard_base;
1911 		unp_defdiscard_base = dds;
1912 	} else {
1913 		closef(fp, NULL);
1914 	}
1915 }
1916 
1917