/*	$OpenBSD: uipc_usrreq.c,v 1.212 2025/01/01 13:44:22 bluhm Exp $	*/
/*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/unpcb.h>
#include <sys/un.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mbuf.h>
#include <sys/task.h>
#include <sys/pledge.h>
#include <sys/pool.h>
#include <sys/rwlock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/refcnt.h>

#include "kcov.h"
#if NKCOV > 0
#include <sys/kcov.h>
#endif

/*
 * Locks used to protect global data and struct members:
 *      I       immutable after creation
 *      D       unp_df_lock
 *      G       unp_gc_lock
 *      M       unp_ino_mtx
 *      R       unp_rights_mtx
 *      a       atomic
 *      s       socket lock
 */

struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");

struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
struct mutex unp_ino_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);

/*
 * Stack of sets of files that were passed over a socket but were
 * not received and need to be closed.
 */
struct	unp_deferral {
	SLIST_ENTRY(unp_deferral)	ud_link;	/* [D] */
	int				ud_n;		/* [I] */
	/* followed by ud_n struct fdpass */
	struct fdpass			ud_fp[];	/* [I] */
};

void	uipc_setaddr(const struct unpcb *, struct mbuf *);
void	unp_discard(struct fdpass *, int);
void	unp_remove_gcrefs(struct fdpass *, int);
void	unp_restore_gcrefs(struct fdpass *, int);
void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
int	unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
static inline void unp_ref(struct unpcb *);
static inline void unp_rele(struct unpcb *);
struct socket *unp_solock_peer(struct socket *);

struct pool unpcb_pool;
struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);

/*
 * Unix communications domain.
 *
 * TODO:
 *	RDM
 *	rethink name space problems
 *	need a proper out-of-band
 */
const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };

/* [G] list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb)	unp_head =
	LIST_HEAD_INITIALIZER(unp_head);
/* [D] list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral)	unp_deferred =
	SLIST_HEAD_INITIALIZER(unp_deferred);
ino_t	unp_ino;	/* [M] prototype for fake inode numbers */
int	unp_rights;	/* [R] file descriptors in flight */
int	unp_defer;	/* [G] number of deferred fp to close by the GC task */
int	unp_gcing;	/* [G] GC task currently running */

const struct pr_usrreqs uipc_usrreqs = {
	.pru_attach	= uipc_attach,
	.pru_detach	= uipc_detach,
	.pru_bind	= uipc_bind,
	.pru_listen	= uipc_listen,
	.pru_connect	= uipc_connect,
	.pru_accept	= uipc_accept,
	.pru_disconnect	= uipc_disconnect,
	.pru_shutdown	= uipc_shutdown,
	.pru_rcvd	= uipc_rcvd,
	.pru_send	= uipc_send,
	.pru_abort	= uipc_abort,
	.pru_sense	= uipc_sense,
	.pru_sockaddr	= uipc_sockaddr,
	.pru_peeraddr	= uipc_peeraddr,
	.pru_connect2	= uipc_connect2,
};

const struct pr_usrreqs uipc_dgram_usrreqs = {
	.pru_attach	= uipc_attach,
	.pru_detach	= uipc_detach,
	.pru_bind	= uipc_bind,
	.pru_listen	= uipc_listen,
	.pru_connect	= uipc_connect,
	.pru_disconnect	= uipc_disconnect,
	.pru_shutdown	= uipc_dgram_shutdown,
	.pru_send	= uipc_dgram_send,
	.pru_sense	= uipc_sense,
	.pru_sockaddr	= uipc_sockaddr,
	.pru_peeraddr	= uipc_peeraddr,
	.pru_connect2	= uipc_connect2,
};

void
unp_init(void)
{
	pool_init(&unpcb_pool, sizeof(struct unpcb), 0,
	    IPL_SOFTNET, 0, "unpcb", NULL);
}

static inline void
unp_ref(struct unpcb *unp)
{
	refcnt_take(&unp->unp_refcnt);
}

static inline void
unp_rele(struct unpcb *unp)
{
	refcnt_rele_wake(&unp->unp_refcnt);
}

struct socket *
unp_solock_peer(struct socket *so)
{
	struct unpcb *unp, *unp2;
	struct socket *so2;

	unp = so->so_pcb;

again:
	if ((unp2 = unp->unp_conn) == NULL)
		return NULL;

	so2 = unp2->unp_socket;

	if (so < so2)
		solock(so2);
	else if (so > so2) {
		unp_ref(unp2);
		sounlock(so);
		solock(so2);
		solock(so);

		/* Datagram socket could be reconnected due to re-lock. */
		if (unp->unp_conn != unp2) {
			sounlock(so2);
			unp_rele(unp2);
			goto again;
		}

		unp_rele(unp2);
	}

	return so2;
}

void
uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
{
	if (unp != NULL && unp->unp_addr != NULL) {
		nam->m_len = unp->unp_addr->m_len;
		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
		    nam->m_len);
	} else {
		nam->m_len = sizeof(sun_noname);
		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
		    nam->m_len);
	}
}

/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 */
#define	PIPSIZ	32768
u_int	unpst_sendspace = PIPSIZ;	/* [a] */
u_int	unpst_recvspace = PIPSIZ;	/* [a] */
u_int	unpsq_sendspace = PIPSIZ;	/* [a] */
u_int	unpsq_recvspace = PIPSIZ;	/* [a] */
u_int	unpdg_sendspace = 8192;		/* [a] really max datagram size */
u_int	unpdg_recvspace = PIPSIZ;	/* [a] */

const struct sysctl_bounded_args unpstctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpst_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpst_sendspace, 0, SB_MAX },
};
const struct sysctl_bounded_args unpsqctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpsq_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpsq_sendspace, 0, SB_MAX },
};
const struct sysctl_bounded_args unpdgctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpdg_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpdg_sendspace, 0, SB_MAX },
};
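
/*
 * Editor's illustrative userland sketch, not part of the original file:
 * since unpdg_sendspace acts as the maximum datagram size, a send larger
 * than that limit is expected to fail with EMSGSIZE.  The path and the
 * 64k payload are hypothetical values chosen for the example.
 */
#if 0
#include <sys/socket.h>
#include <sys/un.h>
#include <err.h>
#include <string.h>

int
main(void)
{
	static char big[65536];
	struct sockaddr_un sun;
	int s;

	if ((s = socket(AF_UNIX, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");
	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNIX;
	strlcpy(sun.sun_path, "/tmp/dg.sock", sizeof(sun.sun_path));

	/* 64k exceeds the default 8k unpdg_sendspace: expect EMSGSIZE */
	if (sendto(s, big, sizeof(big), 0, (struct sockaddr *)&sun,
	    sizeof(sun)) == -1)
		warn("sendto");
	return 0;
}
#endif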

int
uipc_attach(struct socket *so, int proto, int wait)
{
	struct unpcb *unp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		switch (so->so_type) {

		case SOCK_STREAM:
			error = soreserve(so,
			    atomic_load_int(&unpst_sendspace),
			    atomic_load_int(&unpst_recvspace));
			break;

		case SOCK_SEQPACKET:
			error = soreserve(so,
			    atomic_load_int(&unpsq_sendspace),
			    atomic_load_int(&unpsq_recvspace));
			break;

		case SOCK_DGRAM:
			error = soreserve(so,
			    atomic_load_int(&unpdg_sendspace),
			    atomic_load_int(&unpdg_recvspace));
			break;

		default:
			panic("unp_attach");
		}
		if (error)
			return (error);
	}
	unp = pool_get(&unpcb_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (unp == NULL)
		return (ENOBUFS);
	refcnt_init(&unp->unp_refcnt);
	unp->unp_socket = so;
	so->so_pcb = unp;
	getnanotime(&unp->unp_ctime);

	rw_enter_write(&unp_gc_lock);
	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
	rw_exit_write(&unp_gc_lock);

	return (0);
}

int
uipc_detach(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp == NULL)
		return (EINVAL);

	unp_detach(unp);

	return (0);
}

int
uipc_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct sockaddr_un *soun;
	struct mbuf *nam2;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	size_t pathlen;

	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EINVAL);
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((error = unp_nam2sun(nam, &soun, &pathlen)))
		return (error);

	unp->unp_flags |= UNP_BINDING;

	/*
	 * Enforce `i_lock' -> `solock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(unp->unp_socket);

	nam2 = m_getclr(M_WAITOK, MT_SONAME);
	nam2->m_len = sizeof(struct sockaddr_un);
	memcpy(mtod(nam2, struct sockaddr_un *), soun,
	    offsetof(struct sockaddr_un, sun_path) + pathlen);
	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */

	soun = mtod(nam2, struct sockaddr_un *);

	/* Fixup sun_len to keep it in sync with m_len. */
	soun->sun_len = nam2->m_len;

	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
	nd.ni_unveil = UNVEIL_CREATE;

	KERNEL_LOCK();
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error != 0) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	vp = nd.ni_vp;
	if (vp != NULL) {
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		m_freem(nam2);
		error = EADDRINUSE;
		solock(unp->unp_socket);
		goto out;
	}
	vattr_null(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	vput(nd.ni_dvp);
	if (error) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	solock(unp->unp_socket);
	unp->unp_addr = nam2;
	vp = nd.ni_vp;
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_connid.uid = p->p_ucred->cr_uid;
	unp->unp_connid.gid = p->p_ucred->cr_gid;
	unp->unp_connid.pid = p->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp);
out:
	KERNEL_UNLOCK();
	unp->unp_flags &= ~UNP_BINDING;

	return (error);
}
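
/*
 * Editor's illustrative userland sketch, not part of the original file,
 * of the bind(2) request serviced by uipc_bind() above.  The socket path
 * is hypothetical.  Binding creates a VSOCK vnode, so a leftover socket
 * file must be unlink(2)ed first or bind() fails with EADDRINUSE; the
 * path must also fit in sun_path with a NUL, as unp_nam2sun() enforces.
 */
#if 0
#include <sys/socket.h>
#include <sys/un.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_un sun;
	int s;

	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
		err(1, "socket");

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNIX;
	strlcpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path));

	(void)unlink(sun.sun_path);
	if (bind(s, (struct sockaddr *)&sun, sizeof(sun)) == -1)
		err(1, "bind");
	if (listen(s, 5) == -1)
		err(1, "listen");
	return 0;
}
#endif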

int
uipc_listen(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EINVAL);
	if (unp->unp_vnode == NULL)
		return (EINVAL);
	return (0);
}

int
uipc_connect(struct socket *so, struct mbuf *nam)
{
	return unp_connect(so, nam, curproc);
}

int
uipc_accept(struct socket *so, struct mbuf *nam)
{
	struct socket *so2;
	struct unpcb *unp = sotounpcb(so);

	/*
	 * Pass back name of connected socket, if it was bound and
	 * we are still connected (our peer may have closed already!).
	 */
	so2 = unp_solock_peer(so);
	uipc_setaddr(unp->unp_conn, nam);

	if (so2 != NULL && so2 != so)
		sounlock(so2);
	return (0);
}

int
uipc_disconnect(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	unp_disconnect(unp);
	return (0);
}

int
uipc_shutdown(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;

	socantsendmore(so);

	if (unp->unp_conn != NULL) {
		so2 = unp->unp_conn->unp_socket;
		socantrcvmore(so2);
	}

	return (0);
}

int
uipc_dgram_shutdown(struct socket *so)
{
	socantsendmore(so);
	return (0);
}

void
uipc_rcvd(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;

	if (unp->unp_conn == NULL)
		return;
	so2 = unp->unp_conn->unp_socket;
	/*
	 * Adjust backpressure on the sender and
	 * wake up any process waiting to write.
	 */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&so2->so_snd.sb_mtx);
	so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
	so2->so_snd.sb_cc = so->so_rcv.sb_cc;
	mtx_leave(&so2->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	sowwakeup(so2);
}

int
uipc_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	int error = 0, dowakeup = 0;

	if (control) {
		sounlock(so);
		error = unp_internalize(control, curproc);
		solock(so);
		if (error)
			goto out;
	}

	/*
	 * We hold both solock() and `sb_mtx' mutex while modifying
	 * SS_CANTSENDMORE flag. solock() is enough to check it.
	 */
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto dispose;
	}
	if (unp->unp_conn == NULL) {
		error = ENOTCONN;
		goto dispose;
	}

	so2 = unp->unp_conn->unp_socket;

	/*
	 * Send to paired receive port and then raise send buffer
	 * counts to maintain backpressure.  Wake up readers.
	 * sbappend*() should be serialized together with so_snd
	 * modification.
	 */
	mtx_enter(&so2->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	if (control) {
		if (sbappendcontrol(so2, &so2->so_rcv, m, control)) {
			control = NULL;
		} else {
			mtx_leave(&so->so_snd.sb_mtx);
			mtx_leave(&so2->so_rcv.sb_mtx);
			error = ENOBUFS;
			goto dispose;
		}
	} else if (so->so_type == SOCK_SEQPACKET)
		sbappendrecord(so2, &so2->so_rcv, m);
	else
		sbappend(so2, &so2->so_rcv, m);
	so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
	so->so_snd.sb_cc = so2->so_rcv.sb_cc;
	if (so2->so_rcv.sb_cc > 0)
		dowakeup = 1;
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so2->so_rcv.sb_mtx);

	if (dowakeup)
		sorwakeup(so2);

	m = NULL;

dispose:
	/* we need to undo unp_internalize in case of errors */
	if (control && error)
		unp_dispose(control);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}

int
uipc_dgram_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	const struct sockaddr *from;
	int error = 0, dowakeup = 0;

	if (control) {
		sounlock(so);
		error = unp_internalize(control, curproc);
		solock(so);
		if (error)
			goto out;
	}

	if (nam) {
		if (unp->unp_conn) {
			error = EISCONN;
			goto dispose;
		}
		error = unp_connect(so, nam, curproc);
		if (error)
			goto dispose;
	}

	if (unp->unp_conn == NULL) {
		if (nam != NULL)
			error = ECONNREFUSED;
		else
			error = ENOTCONN;
		goto dispose;
	}

	so2 = unp->unp_conn->unp_socket;

	if (unp->unp_addr)
		from = mtod(unp->unp_addr, struct sockaddr *);
	else
		from = &sun_noname;

	mtx_enter(&so2->so_rcv.sb_mtx);
	if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
		dowakeup = 1;
		m = NULL;
		control = NULL;
	} else
		error = ENOBUFS;
	mtx_leave(&so2->so_rcv.sb_mtx);

	if (dowakeup)
		sorwakeup(so2);
	if (nam)
		unp_disconnect(unp);

dispose:
	/* we need to undo unp_internalize in case of errors */
	if (control && error)
		unp_dispose(control);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}

void
uipc_abort(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	unp_detach(unp);
	sofree(so, 0);
}

int
uipc_sense(struct socket *so, struct stat *sb)
{
	struct unpcb *unp = sotounpcb(so);

	sb->st_blksize = so->so_snd.sb_hiwat;
	sb->st_dev = NODEV;
	mtx_enter(&unp_ino_mtx);
	if (unp->unp_ino == 0)
		unp->unp_ino = unp_ino++;
	mtx_leave(&unp_ino_mtx);
	sb->st_atim.tv_sec =
	    sb->st_mtim.tv_sec =
	    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
	sb->st_atim.tv_nsec =
	    sb->st_mtim.tv_nsec =
	    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
	sb->st_ino = unp->unp_ino;

	return (0);
}
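
/*
 * Editor's illustrative userland sketch, not part of the original file:
 * fstat(2) on a UNIX socket lands in uipc_sense() above, which reports
 * a fake inode number (assigned on first use) and the socket creation
 * time.  The helper name is hypothetical.
 */
#if 0
#include <sys/stat.h>
#include <err.h>
#include <stdio.h>

void
show_sock_stat(int s)
{
	struct stat st;

	if (fstat(s, &st) == -1)
		err(1, "fstat");
	/* st_blksize mirrors the send buffer high-water mark */
	printf("ino %llu blksize %d\n",
	    (unsigned long long)st.st_ino, (int)st.st_blksize);
}
#endif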

int
uipc_sockaddr(struct socket *so, struct mbuf *nam)
{
	struct unpcb *unp = sotounpcb(so);

	uipc_setaddr(unp, nam);
	return (0);
}

int
uipc_peeraddr(struct socket *so, struct mbuf *nam)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;

	so2 = unp_solock_peer(so);
	uipc_setaddr(unp->unp_conn, nam);
	if (so2 != NULL && so2 != so)
		sounlock(so2);
	return (0);
}

int
uipc_connect2(struct socket *so, struct socket *so2)
{
	struct unpcb *unp = sotounpcb(so), *unp2;
	int error;

	if ((error = unp_connect2(so, so2)))
		return (error);

	unp->unp_connid.uid = curproc->p_ucred->cr_uid;
	unp->unp_connid.gid = curproc->p_ucred->cr_gid;
	unp->unp_connid.pid = curproc->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDS;
	unp2 = sotounpcb(so2);
	unp2->unp_connid.uid = curproc->p_ucred->cr_uid;
	unp2->unp_connid.gid = curproc->p_ucred->cr_gid;
	unp2->unp_connid.pid = curproc->p_p->ps_pid;
	unp2->unp_flags |= UNP_FEIDS;

	return (0);
}

int
uipc_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int *valp = &unp_defer;

	/* All sysctl names at this level are terminal. */
	switch (name[0]) {
	case SOCK_STREAM:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpstctl_vars, nitems(unpstctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case SOCK_SEQPACKET:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpsqctl_vars, nitems(unpsqctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case SOCK_DGRAM:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpdgctl_vars, nitems(unpdgctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case NET_UNIX_INFLIGHT:
		valp = &unp_rights;
		/* FALLTHROUGH */
	case NET_UNIX_DEFERRED:
		if (namelen != 1)
			return (ENOTDIR);
		return sysctl_rdint(oldp, oldlenp, newp, *valp);
	default:
		return (ENOPROTOOPT);
	}
}
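
/*
 * Editor's illustrative userland sketch, not part of the original file,
 * of reading one of the bounded variables served by uipc_sysctl() above
 * (e.g. net.unix.stream.sendspace).  The CTL_NET/PF_UNIX mib layout is
 * an assumption made for this example.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/unpcb.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int mib[4] = { CTL_NET, PF_UNIX, SOCK_STREAM, UNPCTL_SENDSPACE };
	int space;
	size_t len = sizeof(space);

	if (sysctl(mib, 4, &space, &len, NULL, 0) == -1)
		err(1, "sysctl");
	printf("net.unix.stream.sendspace = %d\n", space);
	return 0;
}
#endif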

void
unp_detach(struct unpcb *unp)
{
	struct socket *so = unp->unp_socket;
	struct vnode *vp = unp->unp_vnode;
	struct unpcb *unp2;

	unp->unp_vnode = NULL;

	rw_enter_write(&unp_gc_lock);
	LIST_REMOVE(unp, unp_link);
	rw_exit_write(&unp_gc_lock);

	if (vp != NULL) {
		/* Enforce `i_lock' -> solock() lock order. */
		sounlock(so);
		VOP_LOCK(vp, LK_EXCLUSIVE);
		vp->v_socket = NULL;

		KERNEL_LOCK();
		vput(vp);
		KERNEL_UNLOCK();
		solock(so);
	}

	if (unp->unp_conn != NULL) {
		/*
		 * A datagram socket could be connected to itself.
		 * Such a socket will be disconnected here.
		 */
		unp_disconnect(unp);
	}

	while ((unp2 = SLIST_FIRST(&unp->unp_refs)) != NULL) {
		struct socket *so2 = unp2->unp_socket;

		if (so < so2)
			solock(so2);
		else {
			unp_ref(unp2);
			sounlock(so);
			solock(so2);
			solock(so);

			if (unp2->unp_conn != unp) {
				/* `unp2' was disconnected due to re-lock. */
				sounlock(so2);
				unp_rele(unp2);
				continue;
			}

			unp_rele(unp2);
		}

		unp2->unp_conn = NULL;
		SLIST_REMOVE(&unp->unp_refs, unp2, unpcb, unp_nextref);
		so2->so_error = ECONNRESET;
		so2->so_state &= ~SS_ISCONNECTED;

		sounlock(so2);
	}

	sounlock(so);
	refcnt_finalize(&unp->unp_refcnt, "unpfinal");
	solock(so);

	soisdisconnected(so);
	so->so_pcb = NULL;
	m_freem(unp->unp_addr);
	pool_put(&unpcb_pool, unp);
	if (unp_rights)
		task_add(systqmp, &unp_gc_task);
}

int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	int error;

	unp = sotounpcb(so);
	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EISCONN);
	if ((error = unp_nam2sun(nam, &soun, NULL)))
		return (error);

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
	nd.ni_unveil = UNVEIL_WRITE;

	unp->unp_flags |= UNP_CONNECTING;

	/*
	 * Enforce `i_lock' -> `solock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(so);

	KERNEL_LOCK();
	error = namei(&nd);
	if (error != 0)
		goto unlock;
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto put;
	}
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto put;
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto put;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto put;
	}

	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		solock(so2);

		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0, M_WAIT)) == NULL) {
			error = ECONNREFUSED;
		}

		sounlock(so2);

		if (error != 0)
			goto put;

		/*
		 * Since `so2' is protected by vnode(9) lock, `so3'
		 * can't be PRU_ABORT'ed here.
		 */
		solock_pair(so, so3);

		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);

		/*
		 * `unp_addr', `unp_connid' and 'UNP_FEIDSBIND' flag
		 * are immutable since we set them in uipc_bind().
		 */
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
		unp3->unp_connid.uid = p->p_ucred->cr_uid;
		unp3->unp_connid.gid = p->p_ucred->cr_gid;
		unp3->unp_connid.pid = p->p_p->ps_pid;
		unp3->unp_flags |= UNP_FEIDS;

		if (unp2->unp_flags & UNP_FEIDSBIND) {
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_FEIDS;
		}

		so2 = so3;
	} else {
		if (so2 != so)
			solock_pair(so, so2);
		else
			solock(so);
	}

	error = unp_connect2(so, so2);

	sounlock(so);

	/*
	 * `so2' can't be PRU_ABORT'ed concurrently.
	 */
	if (so2 != so)
		sounlock(so2);
put:
	vput(vp);
unlock:
	KERNEL_UNLOCK();
	solock(so);
	unp->unp_flags &= ~UNP_CONNECTING;

	/*
	 * The peer socket could be closed by a concurrent thread
	 * while `so' and `vp' are unlocked.
	 */
	if (error == 0 && unp->unp_conn == NULL)
		error = ECONNREFUSED;

	return (error);
}

int
unp_connect2(struct socket *so, struct socket *so2)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;

	soassertlocked(so);
	soassertlocked(so2);

	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);
	unp2 = sotounpcb(so2);
	unp->unp_conn = unp2;
	switch (so->so_type) {

	case SOCK_DGRAM:
		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
		soisconnected(so);
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		unp2->unp_conn = unp;
		soisconnected(so);
		soisconnected(so2);
		break;

	default:
		panic("unp_connect2");
	}
	return (0);
}
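
/*
 * Editor's illustrative userland sketch, not part of the original file:
 * socketpair(2) reaches uipc_connect2() above, which links both unpcbs
 * and stamps unp_connid, so the peer's credentials are immediately
 * queryable, e.g. via getpeereid(3).  Header choice for getpeereid()
 * is an assumption here.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int sv[2];
	uid_t uid;
	gid_t gid;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		err(1, "socketpair");
	if (getpeereid(sv[0], &uid, &gid) == -1)
		err(1, "getpeereid");
	printf("peer uid %d gid %d\n", (int)uid, (int)gid);
	return 0;
}
#endif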

void
unp_disconnect(struct unpcb *unp)
{
	struct socket *so2;
	struct unpcb *unp2;

	if ((so2 = unp_solock_peer(unp->unp_socket)) == NULL)
		return;

	unp2 = unp->unp_conn;
	unp->unp_conn = NULL;

	switch (unp->unp_socket->so_type) {

	case SOCK_DGRAM:
		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		unp->unp_socket->so_snd.sb_mbcnt = 0;
		unp->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		unp2->unp_socket->so_snd.sb_mbcnt = 0;
		unp2->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp2->unp_socket);
		break;
	}

	if (so2 != unp->unp_socket)
		sounlock(so2);
}

static struct unpcb *
fptounp(struct file *fp)
{
	struct socket *so;

	if (fp->f_type != DTYPE_SOCKET)
		return (NULL);
	if ((so = fp->f_data) == NULL)
		return (NULL);
	if (so->so_proto->pr_domain != &unixdomain)
		return (NULL);
	return (sotounpcb(so));
}

int
unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct filedesc *fdp = p->p_fd;
	int i, *fds = NULL;
	struct fdpass *rp;
	struct file *fp;
	int nfds, error = 0;

	/*
	 * This code only works because SCM_RIGHTS is the only supported
	 * control message type on unix sockets. Enforce this here.
	 */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
		return EINVAL;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct fdpass);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	if (nfds > controllen / sizeof(int)) {
		error = EMSGSIZE;
		goto out;
	}

	/* Make sure the recipient is allowed to see the descriptors. */
	rp = (struct fdpass *)CMSG_DATA(cm);

	/* fdp->fd_rdir requires KERNEL_LOCK() */
	KERNEL_LOCK();

	for (i = 0; i < nfds; i++) {
		fp = rp->fp;
		rp++;
		error = pledge_recvfd(p, fp);
		if (error)
			break;

		/*
		 * Block devices may not be passed.  If passing a
		 * directory, make sure that it is underneath the root.
		 */
		if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = (struct vnode *)fp->f_data;

			if (vp->v_type == VBLK ||
			    (vp->v_type == VDIR &&
			    !vn_isunder(vp, fdp->fd_rdir, p))) {
				error = EPERM;
				break;
			}
		}
	}

	KERNEL_UNLOCK();

	if (error)
		goto out;

	fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);

	fdplock(fdp);
restart:
	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct fdpass *)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		if ((error = fdalloc(p, 0, &fds[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(fdp, fds[i]);

			if (error == ENOSPC) {
				fdexpand(p);
				goto restart;
			}

			fdpunlock(fdp);

			/*
			 * This is the error that has historically
			 * been returned, and some callers may
			 * expect it.
			 */

			error = EMSGSIZE;
			goto out;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.  We finalize it all
		 * in the loop below.
		 */
		mtx_enter(&fdp->fd_fplock);
		KASSERT(fdp->fd_ofiles[fds[i]] == NULL);
		fdp->fd_ofiles[fds[i]] = rp->fp;
		mtx_leave(&fdp->fd_fplock);

		fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
		if (flags & MSG_CMSG_CLOEXEC)
			fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;

		rp++;
	}

	/*
	 * Keep `fdp' locked to prevent concurrent close() of just
	 * inserted descriptors. Such descriptors could have the only
	 * `f_count' reference which is now shared between control
	 * message and `fdp'.
	 */

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct fdpass *)CMSG_DATA(cm);

	for (i = 0; i < nfds; i++) {
		struct unpcb *unp;

		fp = rp->fp;
		rp++;
		if ((unp = fptounp(fp)) != NULL) {
			rw_enter_write(&unp_gc_lock);
			unp->unp_msgcount--;
			rw_exit_write(&unp_gc_lock);
		}
	}
	fdpunlock(fdp);

	mtx_enter(&unp_rights_mtx);
	unp_rights -= nfds;
	mtx_leave(&unp_rights_mtx);

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	if (fds != NULL)
		free(fds, M_TEMP, nfds * sizeof(int));

	if (error) {
		if (nfds > 0) {
			/*
			 * No lock required. We are the only `cm' holder.
			 */
			rp = ((struct fdpass *)CMSG_DATA(cm));
			unp_discard(rp, nfds);
		}
	}

	return (error);
}
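
/*
 * Editor's illustrative userland sketch, not part of the original file,
 * of the receive side that unp_externalize() serves above: recvmsg(2)
 * with SCM_RIGHTS turns the kernel's struct fdpass entries back into
 * plain descriptors, and MSG_CMSG_CLOEXEC sets UF_EXCLOSE on the new
 * slots.  The helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <err.h>
#include <string.h>

int
recv_fd(int s)
{
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct msghdr msg;
	struct cmsghdr *cm;
	struct iovec iov;
	char c;
	int fd = -1;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &c;
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = &cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	if (recvmsg(s, &msg, MSG_CMSG_CLOEXEC) == -1)
		err(1, "recvmsg");
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
	    cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
	}
	return fd;
}
#endif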

int
unp_internalize(struct mbuf *control, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct fdpass *rp;
	struct file *fp;
	struct unpcb *unp;
	int i, error;
	int nfds, *ip, fd, neededspace;

	/*
	 * Check for two potential msg_controllen values because
	 * IETF stuck their nose in a place it does not belong.
	 */
	if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
		return (EINVAL);
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    !(cm->cmsg_len == control->m_len ||
	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
		return (EINVAL);
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);

	mtx_enter(&unp_rights_mtx);
	if (unp_rights + nfds > maxfiles / 10) {
		mtx_leave(&unp_rights_mtx);
		return (EMFILE);
	}
	unp_rights += nfds;
	mtx_leave(&unp_rights_mtx);

	/* Make sure we have room for the struct file pointers */
morespace:
	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
	    control->m_len;
	if (neededspace > m_trailingspace(control)) {
		char *tmp;
		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT) {
			error = E2BIG;
			goto nospace;
		}

		/* copy cmsg data temporarily out of the mbuf */
		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
		memcpy(tmp, mtod(control, caddr_t), control->m_len);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0) {
			free(tmp, M_TEMP, control->m_len);
			error = ENOBUFS;       /* allocation failed */
			goto nospace;
		}

		/* copy the data back into the cluster */
		cm = mtod(control, struct cmsghdr *);
		memcpy(cm, tmp, control->m_len);
		free(tmp, M_TEMP, control->m_len);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));

	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
	fdplock(fdp);
	for (i = 0; i < nfds; i++) {
		memcpy(&fd, ip, sizeof fd);
		ip--;
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto fail;
		}
		if (fp->f_count >= FDUP_MAX_COUNT) {
			error = EDEADLK;
			goto fail;
		}
		error = pledge_sendfd(p, fp);
		if (error)
			goto fail;

		/* kqueue descriptors cannot be copied */
		if (fp->f_type == DTYPE_KQUEUE) {
			error = EINVAL;
			goto fail;
		}
#if NKCOV > 0
		/* kcov descriptors cannot be copied */
		if (fp->f_type == DTYPE_VNODE && kcov_vnode(fp->f_data)) {
			error = EINVAL;
			goto fail;
		}
#endif
		rp->fp = fp;
		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
		rp--;
		if ((unp = fptounp(fp)) != NULL) {
			rw_enter_write(&unp_gc_lock);
			unp->unp_msgcount++;
			unp->unp_file = fp;
			rw_exit_write(&unp_gc_lock);
		}
	}
	fdpunlock(fdp);
	return (0);
fail:
	fdpunlock(fdp);
	if (fp != NULL)
		FRELE(fp, p);
	/* Back out what we just did. */
	for ( ; i > 0; i--) {
		rp++;
		fp = rp->fp;
		if ((unp = fptounp(fp)) != NULL) {
			rw_enter_write(&unp_gc_lock);
			unp->unp_msgcount--;
			rw_exit_write(&unp_gc_lock);
		}
		FRELE(fp, p);
	}

nospace:
	mtx_enter(&unp_rights_mtx);
	unp_rights -= nfds;
	mtx_leave(&unp_rights_mtx);

	return (error);
}
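
/*
 * Editor's illustrative userland sketch, not part of the original file,
 * of the send side handled by unp_internalize() above: sendmsg(2) with
 * SCM_RIGHTS passes a descriptor; the kernel rewrites the int into a
 * struct fdpass and accounts for it in unp_rights until it is received.
 * The helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <err.h>
#include <string.h>

void
send_fd(int s, int fd)
{
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct msghdr msg;
	struct cmsghdr *cm;
	struct iovec iov;
	char c = 0;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &c;
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = &cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cm), &fd, sizeof(fd));

	if (sendmsg(s, &msg, 0) == -1)
		err(1, "sendmsg");
}
#endif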

void
unp_gc(void *arg __unused)
{
	struct unp_deferral *defer;
	struct file *fp;
	struct socket *so;
	struct unpcb *unp;
	int nunref, i;

	rw_enter_write(&unp_gc_lock);
	if (unp_gcing)
		goto unlock;
	unp_gcing = 1;
	rw_exit_write(&unp_gc_lock);

	rw_enter_write(&unp_df_lock);
	/* close any fds on the deferred list */
	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
		rw_exit_write(&unp_df_lock);
		for (i = 0; i < defer->ud_n; i++) {
			fp = defer->ud_fp[i].fp;
			if (fp == NULL)
				continue;
			if ((unp = fptounp(fp)) != NULL) {
				rw_enter_write(&unp_gc_lock);
				unp->unp_msgcount--;
				rw_exit_write(&unp_gc_lock);
			}
			mtx_enter(&unp_rights_mtx);
			unp_rights--;
			mtx_leave(&unp_rights_mtx);
			/* closef() expects a refcount of 2 */
			FREF(fp);
			(void) closef(fp, NULL);
		}
		free(defer, M_TEMP, sizeof(*defer) +
		    sizeof(struct fdpass) * defer->ud_n);
		rw_enter_write(&unp_df_lock);
	}
	rw_exit_write(&unp_df_lock);

	nunref = 0;

	rw_enter_write(&unp_gc_lock);

	/*
	 * Determine sockets which may be prospectively dead. Such
	 * sockets have their `unp_msgcount' equal to the `f_count'.
	 * If `unp_msgcount' is 0, the socket has not been passed
	 * and can't be unreferenced.
	 */
	LIST_FOREACH(unp, &unp_head, unp_link) {
		unp->unp_gcflags = 0;

		if (unp->unp_msgcount == 0)
			continue;
		if ((fp = unp->unp_file) == NULL)
			continue;
		if (fp->f_count == unp->unp_msgcount) {
			unp->unp_gcflags |= UNP_GCDEAD;
			unp->unp_gcrefs = unp->unp_msgcount;
			nunref++;
		}
	}

	/*
	 * Scan all sockets previously marked as dead. Remove
	 * the `unp_gcrefs' reference each socket holds on any
	 * dead socket in its buffer.
	 */
	LIST_FOREACH(unp, &unp_head, unp_link) {
		if ((unp->unp_gcflags & UNP_GCDEAD) == 0)
			continue;
		so = unp->unp_socket;
		mtx_enter(&so->so_rcv.sb_mtx);
		unp_scan(so->so_rcv.sb_mb, unp_remove_gcrefs);
		mtx_leave(&so->so_rcv.sb_mtx);
	}

	/*
	 * If a dead socket has a `unp_gcrefs' reference counter
	 * greater than 0, it can't be unreferenced. Mark it as
	 * alive and increment the `unp_gcrefs' reference for each
	 * dead socket within its buffer. Repeat this until no
	 * new alive sockets are found.
	 */
	do {
		unp_defer = 0;

		LIST_FOREACH(unp, &unp_head, unp_link) {
			if ((unp->unp_gcflags & UNP_GCDEAD) == 0)
				continue;
			if (unp->unp_gcrefs == 0)
				continue;

			unp->unp_gcflags &= ~UNP_GCDEAD;

			so = unp->unp_socket;
			mtx_enter(&so->so_rcv.sb_mtx);
			unp_scan(so->so_rcv.sb_mb, unp_restore_gcrefs);
			mtx_leave(&so->so_rcv.sb_mtx);

			KASSERT(nunref > 0);
			nunref--;
		}
	} while (unp_defer > 0);
	/*
	 * If there are any unreferenced sockets, then for each one
	 * dispose of the files in its receive buffer and close it.
	 */
	if (nunref) {
		LIST_FOREACH(unp, &unp_head, unp_link) {
			if (unp->unp_gcflags & UNP_GCDEAD) {
				struct sockbuf *sb = &unp->unp_socket->so_rcv;
				struct mbuf *m;

				/*
				 * This socket could still be connected
				 * and if so its `so_rcv' is still
				 * accessible by a concurrent PRU_SEND
				 * thread.
				 */
1467 
1468 				mtx_enter(&sb->sb_mtx);
1469 				m = sb->sb_mb;
1470 				memset(&sb->sb_startzero, 0,
1471 				    (caddr_t)&sb->sb_endzero -
1472 				    (caddr_t)&sb->sb_startzero);
1473 				sb->sb_timeo_nsecs = INFSLP;
1474 				mtx_leave(&sb->sb_mtx);
1475 
1476 				unp_scan(m, unp_discard);
1477 				m_purge(m);
1478 			}
1479 		}
1480 	}
1481 
1482 	unp_gcing = 0;
1483 unlock:
1484 	rw_exit_write(&unp_gc_lock);
1485 }
1486 
1487 void
1488 unp_dispose(struct mbuf *m)
1489 {
1490 
1491 	if (m)
1492 		unp_scan(m, unp_discard);
1493 }
1494 
1495 void
1496 unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
1497 {
1498 	struct mbuf *m;
1499 	struct fdpass *rp;
1500 	struct cmsghdr *cm;
1501 	int qfds;
1502 
1503 	while (m0) {
1504 		for (m = m0; m; m = m->m_next) {
1505 			if (m->m_type == MT_CONTROL &&
1506 			    m->m_len >= sizeof(*cm)) {
1507 				cm = mtod(m, struct cmsghdr *);
1508 				if (cm->cmsg_level != SOL_SOCKET ||
1509 				    cm->cmsg_type != SCM_RIGHTS)
1510 					continue;
1511 				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
1512 				    / sizeof(struct fdpass);
1513 				if (qfds > 0) {
1514 					rp = (struct fdpass *)CMSG_DATA(cm);
1515 					op(rp, qfds);
1516 				}
1517 				break;		/* XXX, but saves time */
1518 			}
1519 		}
1520 		m0 = m0->m_nextpkt;
1521 	}
1522 }
1523 
1524 void
1525 unp_discard(struct fdpass *rp, int nfds)
1526 {
1527 	struct unp_deferral *defer;
1528 
1529 	/* copy the file pointers to a deferral structure */
1530 	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
1531 	defer->ud_n = nfds;
1532 	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
1533 	memset(rp, 0, sizeof(*rp) * nfds);
1534 
1535 	rw_enter_write(&unp_df_lock);
1536 	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
1537 	rw_exit_write(&unp_df_lock);
1538 
1539 	task_add(systqmp, &unp_gc_task);
1540 }
1541 
1542 void
1543 unp_remove_gcrefs(struct fdpass *rp, int nfds)
1544 {
1545 	struct unpcb *unp;
1546 	int i;
1547 
1548 	rw_assert_wrlock(&unp_gc_lock);
1549 
1550 	for (i = 0; i < nfds; i++) {
1551 		if (rp[i].fp == NULL)
1552 			continue;
1553 		if ((unp = fptounp(rp[i].fp)) == NULL)
1554 			continue;
1555 		if (unp->unp_gcflags & UNP_GCDEAD) {
1556 			KASSERT(unp->unp_gcrefs > 0);
1557 			unp->unp_gcrefs--;
1558 		}
1559 	}
1560 }
1561 
1562 void
1563 unp_restore_gcrefs(struct fdpass *rp, int nfds)
1564 {
1565 	struct unpcb *unp;
1566 	int i;
1567 
1568 	rw_assert_wrlock(&unp_gc_lock);
1569 
1570 	for (i = 0; i < nfds; i++) {
1571 		if (rp[i].fp == NULL)
1572 			continue;
1573 		if ((unp = fptounp(rp[i].fp)) == NULL)
1574 			continue;
1575 		if (unp->unp_gcflags & UNP_GCDEAD) {
1576 			unp->unp_gcrefs++;
1577 			unp_defer++;
1578 		}
1579 	}
1580 }
1581 
1582 int
1583 unp_nam2sun(struct mbuf *nam, struct sockaddr_un **sun, size_t *pathlen)
1584 {
1585 	struct sockaddr *sa = mtod(nam, struct sockaddr *);
1586 	size_t size, len;
1587 
1588 	if (nam->m_len < offsetof(struct sockaddr, sa_data))
1589 		return EINVAL;
1590 	if (sa->sa_family != AF_UNIX)
1591 		return EAFNOSUPPORT;
1592 	if (sa->sa_len != nam->m_len)
1593 		return EINVAL;
1594 	if (sa->sa_len > sizeof(struct sockaddr_un))
1595 		return EINVAL;
1596 	*sun = (struct sockaddr_un *)sa;
1597 
1598 	/* ensure that sun_path is NUL terminated and fits */
1599 	size = (*sun)->sun_len - offsetof(struct sockaddr_un, sun_path);
1600 	len = strnlen((*sun)->sun_path, size);
1601 	if (len == sizeof((*sun)->sun_path))
1602 		return EINVAL;
1603 	if (len == size) {
1604 		if (m_trailingspace(nam) == 0)
1605 			return EINVAL;
1606 		nam->m_len++;
1607 		(*sun)->sun_len++;
1608 		(*sun)->sun_path[len] = '\0';
1609 	}
1610 	if (pathlen != NULL)
1611 		*pathlen = len;
1612 
1613 	return 0;
1614 }