xref: /linux/net/unix/af_unix.c (revision 8a34d4e8)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					is been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	if (hash1 == hash2) {
163 		spin_lock(&net->unx.table.locks[hash1]);
164 		return;
165 	}
166 
167 	if (hash1 > hash2)
168 		swap(hash1, hash2);
169 
170 	spin_lock(&net->unx.table.locks[hash1]);
171 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173 
174 static void unix_table_double_unlock(struct net *net,
175 				     unsigned int hash1, unsigned int hash2)
176 {
177 	if (hash1 == hash2) {
178 		spin_unlock(&net->unx.table.locks[hash1]);
179 		return;
180 	}
181 
182 	spin_unlock(&net->unx.table.locks[hash1]);
183 	spin_unlock(&net->unx.table.locks[hash2]);
184 }
185 
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189 	UNIXCB(skb).secid = scm->secid;
190 }
191 
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194 	scm->secid = UNIXCB(skb).secid;
195 }
196 
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199 	return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204 
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207 
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210 	return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213 
214 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215 {
216 	return unix_peer(osk) == sk;
217 }
218 
219 static inline int unix_may_send(struct sock *sk, struct sock *osk)
220 {
221 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222 }
223 
224 static inline int unix_recvq_full(const struct sock *sk)
225 {
226 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228 
229 static inline int unix_recvq_full_lockless(const struct sock *sk)
230 {
231 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
232 		READ_ONCE(sk->sk_max_ack_backlog);
233 }
234 
235 struct sock *unix_peer_get(struct sock *s)
236 {
237 	struct sock *peer;
238 
239 	unix_state_lock(s);
240 	peer = unix_peer(s);
241 	if (peer)
242 		sock_hold(peer);
243 	unix_state_unlock(s);
244 	return peer;
245 }
246 EXPORT_SYMBOL_GPL(unix_peer_get);
247 
248 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
249 					     int addr_len)
250 {
251 	struct unix_address *addr;
252 
253 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
254 	if (!addr)
255 		return NULL;
256 
257 	refcount_set(&addr->refcnt, 1);
258 	addr->len = addr_len;
259 	memcpy(addr->name, sunaddr, addr_len);
260 
261 	return addr;
262 }
263 
264 static inline void unix_release_addr(struct unix_address *addr)
265 {
266 	if (refcount_dec_and_test(&addr->refcnt))
267 		kfree(addr);
268 }
269 
270 /*
271  *	Check unix socket name:
272  *		- should be not zero length.
273  *	        - if started by not zero, should be NULL terminated (FS object)
274  *		- if started by zero, it is abstract name.
275  */
276 
277 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
278 {
279 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
280 	    addr_len > sizeof(*sunaddr))
281 		return -EINVAL;
282 
283 	if (sunaddr->sun_family != AF_UNIX)
284 		return -EINVAL;
285 
286 	return 0;
287 }
288 
289 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
290 {
291 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
292 	short offset = offsetof(struct sockaddr_storage, __data);
293 
294 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
295 
296 	/* This may look like an off by one error but it is a bit more
297 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
298 	 * sun_path[108] doesn't as such exist.  However in kernel space
299 	 * we are guaranteed that it is a valid memory location in our
300 	 * kernel address buffer because syscall functions always pass
301 	 * a pointer of struct sockaddr_storage which has a bigger buffer
302 	 * than 108.  Also, we must terminate sun_path for strlen() in
303 	 * getname_kernel().
304 	 */
305 	addr->__data[addr_len - offset] = 0;
306 
307 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
308 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
309 	 * know the actual buffer.
310 	 */
311 	return strlen(addr->__data) + offset + 1;
312 }
313 
314 static void __unix_remove_socket(struct sock *sk)
315 {
316 	sk_del_node_init(sk);
317 }
318 
319 static void __unix_insert_socket(struct net *net, struct sock *sk)
320 {
321 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
322 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
323 }
324 
325 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
326 				 struct unix_address *addr, unsigned int hash)
327 {
328 	__unix_remove_socket(sk);
329 	smp_store_release(&unix_sk(sk)->addr, addr);
330 
331 	sk->sk_hash = hash;
332 	__unix_insert_socket(net, sk);
333 }
334 
335 static void unix_remove_socket(struct net *net, struct sock *sk)
336 {
337 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
338 	__unix_remove_socket(sk);
339 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340 }
341 
342 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
343 {
344 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
345 	__unix_insert_socket(net, sk);
346 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
347 }
348 
349 static void unix_insert_bsd_socket(struct sock *sk)
350 {
351 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
352 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
353 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
354 }
355 
356 static void unix_remove_bsd_socket(struct sock *sk)
357 {
358 	if (!hlist_unhashed(&sk->sk_bind_node)) {
359 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
360 		__sk_del_bind_node(sk);
361 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
362 
363 		sk_node_init(&sk->sk_bind_node);
364 	}
365 }
366 
367 static struct sock *__unix_find_socket_byname(struct net *net,
368 					      struct sockaddr_un *sunname,
369 					      int len, unsigned int hash)
370 {
371 	struct sock *s;
372 
373 	sk_for_each(s, &net->unx.table.buckets[hash]) {
374 		struct unix_sock *u = unix_sk(s);
375 
376 		if (u->addr->len == len &&
377 		    !memcmp(u->addr->name, sunname, len))
378 			return s;
379 	}
380 	return NULL;
381 }
382 
383 static inline struct sock *unix_find_socket_byname(struct net *net,
384 						   struct sockaddr_un *sunname,
385 						   int len, unsigned int hash)
386 {
387 	struct sock *s;
388 
389 	spin_lock(&net->unx.table.locks[hash]);
390 	s = __unix_find_socket_byname(net, sunname, len, hash);
391 	if (s)
392 		sock_hold(s);
393 	spin_unlock(&net->unx.table.locks[hash]);
394 	return s;
395 }
396 
397 static struct sock *unix_find_socket_byinode(struct inode *i)
398 {
399 	unsigned int hash = unix_bsd_hash(i);
400 	struct sock *s;
401 
402 	spin_lock(&bsd_socket_locks[hash]);
403 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
404 		struct dentry *dentry = unix_sk(s)->path.dentry;
405 
406 		if (dentry && d_backing_inode(dentry) == i) {
407 			sock_hold(s);
408 			spin_unlock(&bsd_socket_locks[hash]);
409 			return s;
410 		}
411 	}
412 	spin_unlock(&bsd_socket_locks[hash]);
413 	return NULL;
414 }
415 
416 /* Support code for asymmetrically connected dgram sockets
417  *
418  * If a datagram socket is connected to a socket not itself connected
419  * to the first socket (eg, /dev/log), clients may only enqueue more
420  * messages if the present receive queue of the server socket is not
421  * "too large". This means there's a second writeability condition
422  * poll and sendmsg need to test. The dgram recv code will do a wake
423  * up on the peer_wait wait queue of a socket upon reception of a
424  * datagram which needs to be propagated to sleeping would-be writers
425  * since these might not have sent anything so far. This can't be
426  * accomplished via poll_wait because the lifetime of the server
427  * socket might be less than that of its clients if these break their
428  * association with it or if the server socket is closed while clients
429  * are still connected to it and there's no way to inform "a polling
430  * implementation" that it should let go of a certain wait queue
431  *
432  * In order to propagate a wake up, a wait_queue_entry_t of the client
433  * socket is enqueued on the peer_wait queue of the server socket
434  * whose wake function does a wake_up on the ordinary client socket
435  * wait queue. This connection is established whenever a write (or
436  * poll for write) hit the flow control condition and broken when the
437  * association to the server socket is dissolved or after a wake up
438  * was relayed.
439  */
440 
441 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
442 				      void *key)
443 {
444 	struct unix_sock *u;
445 	wait_queue_head_t *u_sleep;
446 
447 	u = container_of(q, struct unix_sock, peer_wake);
448 
449 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
450 			    q);
451 	u->peer_wake.private = NULL;
452 
453 	/* relaying can only happen while the wq still exists */
454 	u_sleep = sk_sleep(&u->sk);
455 	if (u_sleep)
456 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
457 
458 	return 0;
459 }
460 
461 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
462 {
463 	struct unix_sock *u, *u_other;
464 	int rc;
465 
466 	u = unix_sk(sk);
467 	u_other = unix_sk(other);
468 	rc = 0;
469 	spin_lock(&u_other->peer_wait.lock);
470 
471 	if (!u->peer_wake.private) {
472 		u->peer_wake.private = other;
473 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
474 
475 		rc = 1;
476 	}
477 
478 	spin_unlock(&u_other->peer_wait.lock);
479 	return rc;
480 }
481 
482 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
483 					    struct sock *other)
484 {
485 	struct unix_sock *u, *u_other;
486 
487 	u = unix_sk(sk);
488 	u_other = unix_sk(other);
489 	spin_lock(&u_other->peer_wait.lock);
490 
491 	if (u->peer_wake.private == other) {
492 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
493 		u->peer_wake.private = NULL;
494 	}
495 
496 	spin_unlock(&u_other->peer_wait.lock);
497 }
498 
499 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
500 						   struct sock *other)
501 {
502 	unix_dgram_peer_wake_disconnect(sk, other);
503 	wake_up_interruptible_poll(sk_sleep(sk),
504 				   EPOLLOUT |
505 				   EPOLLWRNORM |
506 				   EPOLLWRBAND);
507 }
508 
509 /* preconditions:
510  *	- unix_peer(sk) == other
511  *	- association is stable
512  */
513 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
514 {
515 	int connected;
516 
517 	connected = unix_dgram_peer_wake_connect(sk, other);
518 
519 	/* If other is SOCK_DEAD, we want to make sure we signal
520 	 * POLLOUT, such that a subsequent write() can get a
521 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
522 	 * to other and its full, we will hang waiting for POLLOUT.
523 	 */
524 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
525 		return 1;
526 
527 	if (connected)
528 		unix_dgram_peer_wake_disconnect(sk, other);
529 
530 	return 0;
531 }
532 
533 static int unix_writable(const struct sock *sk, unsigned char state)
534 {
535 	return state != TCP_LISTEN &&
536 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
537 }
538 
539 static void unix_write_space(struct sock *sk)
540 {
541 	struct socket_wq *wq;
542 
543 	rcu_read_lock();
544 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
545 		wq = rcu_dereference(sk->sk_wq);
546 		if (skwq_has_sleeper(wq))
547 			wake_up_interruptible_sync_poll(&wq->wait,
548 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
549 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
550 	}
551 	rcu_read_unlock();
552 }
553 
554 /* When dgram socket disconnects (or changes its peer), we clear its receive
555  * queue of packets arrived from previous peer. First, it allows to do
556  * flow control based only on wmem_alloc; second, sk connected to peer
557  * may receive messages only from that peer. */
558 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
559 {
560 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
561 		skb_queue_purge(&sk->sk_receive_queue);
562 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
563 
564 		/* If one link of bidirectional dgram pipe is disconnected,
565 		 * we signal error. Messages are lost. Do not make this,
566 		 * when peer was not connected to us.
567 		 */
568 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
569 			WRITE_ONCE(other->sk_err, ECONNRESET);
570 			sk_error_report(other);
571 		}
572 	}
573 }
574 
575 static void unix_sock_destructor(struct sock *sk)
576 {
577 	struct unix_sock *u = unix_sk(sk);
578 
579 	skb_queue_purge(&sk->sk_receive_queue);
580 
581 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
582 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
583 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
584 	if (!sock_flag(sk, SOCK_DEAD)) {
585 		pr_info("Attempt to release alive unix socket: %p\n", sk);
586 		return;
587 	}
588 
589 	if (u->addr)
590 		unix_release_addr(u->addr);
591 
592 	atomic_long_dec(&unix_nr_socks);
593 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
594 #ifdef UNIX_REFCNT_DEBUG
595 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
596 		atomic_long_read(&unix_nr_socks));
597 #endif
598 }
599 
600 static void unix_release_sock(struct sock *sk, int embrion)
601 {
602 	struct unix_sock *u = unix_sk(sk);
603 	struct sock *skpair;
604 	struct sk_buff *skb;
605 	struct path path;
606 	int state;
607 
608 	unix_remove_socket(sock_net(sk), sk);
609 	unix_remove_bsd_socket(sk);
610 
611 	/* Clear state */
612 	unix_state_lock(sk);
613 	sock_orphan(sk);
614 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
615 	path	     = u->path;
616 	u->path.dentry = NULL;
617 	u->path.mnt = NULL;
618 	state = sk->sk_state;
619 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
620 
621 	skpair = unix_peer(sk);
622 	unix_peer(sk) = NULL;
623 
624 	unix_state_unlock(sk);
625 
626 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
627 	if (u->oob_skb) {
628 		kfree_skb(u->oob_skb);
629 		u->oob_skb = NULL;
630 	}
631 #endif
632 
633 	wake_up_interruptible_all(&u->peer_wait);
634 
635 	if (skpair != NULL) {
636 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
637 			unix_state_lock(skpair);
638 			/* No more writes */
639 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
640 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
641 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
642 			unix_state_unlock(skpair);
643 			skpair->sk_state_change(skpair);
644 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
645 		}
646 
647 		unix_dgram_peer_wake_disconnect(sk, skpair);
648 		sock_put(skpair); /* It may now die */
649 	}
650 
651 	/* Try to flush out this socket. Throw out buffers at least */
652 
653 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
654 		if (state == TCP_LISTEN)
655 			unix_release_sock(skb->sk, 1);
656 		/* passed fds are erased in the kfree_skb hook	      */
657 		UNIXCB(skb).consumed = skb->len;
658 		kfree_skb(skb);
659 	}
660 
661 	if (path.dentry)
662 		path_put(&path);
663 
664 	sock_put(sk);
665 
666 	/* ---- Socket is dead now and most probably destroyed ---- */
667 
668 	/*
669 	 * Fixme: BSD difference: In BSD all sockets connected to us get
670 	 *	  ECONNRESET and we die on the spot. In Linux we behave
671 	 *	  like files and pipes do and wait for the last
672 	 *	  dereference.
673 	 *
674 	 * Can't we simply set sock->err?
675 	 *
676 	 *	  What the above comment does talk about? --ANK(980817)
677 	 */
678 
679 	if (READ_ONCE(unix_tot_inflight))
680 		unix_gc();		/* Garbage collect fds */
681 }
682 
683 static void init_peercred(struct sock *sk)
684 {
685 	const struct cred *old_cred;
686 	struct pid *old_pid;
687 
688 	spin_lock(&sk->sk_peer_lock);
689 	old_pid = sk->sk_peer_pid;
690 	old_cred = sk->sk_peer_cred;
691 	sk->sk_peer_pid  = get_pid(task_tgid(current));
692 	sk->sk_peer_cred = get_current_cred();
693 	spin_unlock(&sk->sk_peer_lock);
694 
695 	put_pid(old_pid);
696 	put_cred(old_cred);
697 }
698 
699 static void copy_peercred(struct sock *sk, struct sock *peersk)
700 {
701 	const struct cred *old_cred;
702 	struct pid *old_pid;
703 
704 	if (sk < peersk) {
705 		spin_lock(&sk->sk_peer_lock);
706 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
707 	} else {
708 		spin_lock(&peersk->sk_peer_lock);
709 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
710 	}
711 	old_pid = sk->sk_peer_pid;
712 	old_cred = sk->sk_peer_cred;
713 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
714 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
715 
716 	spin_unlock(&sk->sk_peer_lock);
717 	spin_unlock(&peersk->sk_peer_lock);
718 
719 	put_pid(old_pid);
720 	put_cred(old_cred);
721 }
722 
723 static int unix_listen(struct socket *sock, int backlog)
724 {
725 	int err;
726 	struct sock *sk = sock->sk;
727 	struct unix_sock *u = unix_sk(sk);
728 
729 	err = -EOPNOTSUPP;
730 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
731 		goto out;	/* Only stream/seqpacket sockets accept */
732 	err = -EINVAL;
733 	if (!READ_ONCE(u->addr))
734 		goto out;	/* No listens on an unbound socket */
735 	unix_state_lock(sk);
736 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
737 		goto out_unlock;
738 	if (backlog > sk->sk_max_ack_backlog)
739 		wake_up_interruptible_all(&u->peer_wait);
740 	sk->sk_max_ack_backlog	= backlog;
741 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
742 
743 	/* set credentials so connect can copy them */
744 	init_peercred(sk);
745 	err = 0;
746 
747 out_unlock:
748 	unix_state_unlock(sk);
749 out:
750 	return err;
751 }
752 
753 static int unix_release(struct socket *);
754 static int unix_bind(struct socket *, struct sockaddr *, int);
755 static int unix_stream_connect(struct socket *, struct sockaddr *,
756 			       int addr_len, int flags);
757 static int unix_socketpair(struct socket *, struct socket *);
758 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
759 static int unix_getname(struct socket *, struct sockaddr *, int);
760 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
761 static __poll_t unix_dgram_poll(struct file *, struct socket *,
762 				    poll_table *);
763 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
764 #ifdef CONFIG_COMPAT
765 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
766 #endif
767 static int unix_shutdown(struct socket *, int);
768 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
771 				       struct pipe_inode_info *, size_t size,
772 				       unsigned int flags);
773 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
774 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
775 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
776 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
777 static int unix_dgram_connect(struct socket *, struct sockaddr *,
778 			      int, int);
779 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
780 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
781 				  int);
782 
783 #ifdef CONFIG_PROC_FS
784 static int unix_count_nr_fds(struct sock *sk)
785 {
786 	struct sk_buff *skb;
787 	struct unix_sock *u;
788 	int nr_fds = 0;
789 
790 	spin_lock(&sk->sk_receive_queue.lock);
791 	skb = skb_peek(&sk->sk_receive_queue);
792 	while (skb) {
793 		u = unix_sk(skb->sk);
794 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
795 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
796 	}
797 	spin_unlock(&sk->sk_receive_queue.lock);
798 
799 	return nr_fds;
800 }
801 
802 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
803 {
804 	struct sock *sk = sock->sk;
805 	unsigned char s_state;
806 	struct unix_sock *u;
807 	int nr_fds = 0;
808 
809 	if (sk) {
810 		s_state = READ_ONCE(sk->sk_state);
811 		u = unix_sk(sk);
812 
813 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
814 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
815 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
816 		 */
817 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
818 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
819 		else if (s_state == TCP_LISTEN)
820 			nr_fds = unix_count_nr_fds(sk);
821 
822 		seq_printf(m, "scm_fds: %u\n", nr_fds);
823 	}
824 }
825 #else
826 #define unix_show_fdinfo NULL
827 #endif
828 
829 static const struct proto_ops unix_stream_ops = {
830 	.family =	PF_UNIX,
831 	.owner =	THIS_MODULE,
832 	.release =	unix_release,
833 	.bind =		unix_bind,
834 	.connect =	unix_stream_connect,
835 	.socketpair =	unix_socketpair,
836 	.accept =	unix_accept,
837 	.getname =	unix_getname,
838 	.poll =		unix_poll,
839 	.ioctl =	unix_ioctl,
840 #ifdef CONFIG_COMPAT
841 	.compat_ioctl =	unix_compat_ioctl,
842 #endif
843 	.listen =	unix_listen,
844 	.shutdown =	unix_shutdown,
845 	.sendmsg =	unix_stream_sendmsg,
846 	.recvmsg =	unix_stream_recvmsg,
847 	.read_skb =	unix_stream_read_skb,
848 	.mmap =		sock_no_mmap,
849 	.splice_read =	unix_stream_splice_read,
850 	.set_peek_off =	sk_set_peek_off,
851 	.show_fdinfo =	unix_show_fdinfo,
852 };
853 
854 static const struct proto_ops unix_dgram_ops = {
855 	.family =	PF_UNIX,
856 	.owner =	THIS_MODULE,
857 	.release =	unix_release,
858 	.bind =		unix_bind,
859 	.connect =	unix_dgram_connect,
860 	.socketpair =	unix_socketpair,
861 	.accept =	sock_no_accept,
862 	.getname =	unix_getname,
863 	.poll =		unix_dgram_poll,
864 	.ioctl =	unix_ioctl,
865 #ifdef CONFIG_COMPAT
866 	.compat_ioctl =	unix_compat_ioctl,
867 #endif
868 	.listen =	sock_no_listen,
869 	.shutdown =	unix_shutdown,
870 	.sendmsg =	unix_dgram_sendmsg,
871 	.read_skb =	unix_read_skb,
872 	.recvmsg =	unix_dgram_recvmsg,
873 	.mmap =		sock_no_mmap,
874 	.set_peek_off =	sk_set_peek_off,
875 	.show_fdinfo =	unix_show_fdinfo,
876 };
877 
878 static const struct proto_ops unix_seqpacket_ops = {
879 	.family =	PF_UNIX,
880 	.owner =	THIS_MODULE,
881 	.release =	unix_release,
882 	.bind =		unix_bind,
883 	.connect =	unix_stream_connect,
884 	.socketpair =	unix_socketpair,
885 	.accept =	unix_accept,
886 	.getname =	unix_getname,
887 	.poll =		unix_dgram_poll,
888 	.ioctl =	unix_ioctl,
889 #ifdef CONFIG_COMPAT
890 	.compat_ioctl =	unix_compat_ioctl,
891 #endif
892 	.listen =	unix_listen,
893 	.shutdown =	unix_shutdown,
894 	.sendmsg =	unix_seqpacket_sendmsg,
895 	.recvmsg =	unix_seqpacket_recvmsg,
896 	.mmap =		sock_no_mmap,
897 	.set_peek_off =	sk_set_peek_off,
898 	.show_fdinfo =	unix_show_fdinfo,
899 };
900 
901 static void unix_close(struct sock *sk, long timeout)
902 {
903 	/* Nothing to do here, unix socket does not need a ->close().
904 	 * This is merely for sockmap.
905 	 */
906 }
907 
908 static void unix_unhash(struct sock *sk)
909 {
910 	/* Nothing to do here, unix socket does not need a ->unhash().
911 	 * This is merely for sockmap.
912 	 */
913 }
914 
915 static bool unix_bpf_bypass_getsockopt(int level, int optname)
916 {
917 	if (level == SOL_SOCKET) {
918 		switch (optname) {
919 		case SO_PEERPIDFD:
920 			return true;
921 		default:
922 			return false;
923 		}
924 	}
925 
926 	return false;
927 }
928 
929 struct proto unix_dgram_proto = {
930 	.name			= "UNIX",
931 	.owner			= THIS_MODULE,
932 	.obj_size		= sizeof(struct unix_sock),
933 	.close			= unix_close,
934 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
935 #ifdef CONFIG_BPF_SYSCALL
936 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
937 #endif
938 };
939 
940 struct proto unix_stream_proto = {
941 	.name			= "UNIX-STREAM",
942 	.owner			= THIS_MODULE,
943 	.obj_size		= sizeof(struct unix_sock),
944 	.close			= unix_close,
945 	.unhash			= unix_unhash,
946 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
947 #ifdef CONFIG_BPF_SYSCALL
948 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
949 #endif
950 };
951 
952 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
953 {
954 	struct unix_sock *u;
955 	struct sock *sk;
956 	int err;
957 
958 	atomic_long_inc(&unix_nr_socks);
959 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
960 		err = -ENFILE;
961 		goto err;
962 	}
963 
964 	if (type == SOCK_STREAM)
965 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
966 	else /*dgram and  seqpacket */
967 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
968 
969 	if (!sk) {
970 		err = -ENOMEM;
971 		goto err;
972 	}
973 
974 	sock_init_data(sock, sk);
975 
976 	sk->sk_hash		= unix_unbound_hash(sk);
977 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
978 	sk->sk_write_space	= unix_write_space;
979 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
980 	sk->sk_destruct		= unix_sock_destructor;
981 	u = unix_sk(sk);
982 	u->listener = NULL;
983 	u->vertex = NULL;
984 	u->path.dentry = NULL;
985 	u->path.mnt = NULL;
986 	spin_lock_init(&u->lock);
987 	mutex_init(&u->iolock); /* single task reading lock */
988 	mutex_init(&u->bindlock); /* single task binding lock */
989 	init_waitqueue_head(&u->peer_wait);
990 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
991 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
992 	unix_insert_unbound_socket(net, sk);
993 
994 	sock_prot_inuse_add(net, sk->sk_prot, 1);
995 
996 	return sk;
997 
998 err:
999 	atomic_long_dec(&unix_nr_socks);
1000 	return ERR_PTR(err);
1001 }
1002 
1003 static int unix_create(struct net *net, struct socket *sock, int protocol,
1004 		       int kern)
1005 {
1006 	struct sock *sk;
1007 
1008 	if (protocol && protocol != PF_UNIX)
1009 		return -EPROTONOSUPPORT;
1010 
1011 	sock->state = SS_UNCONNECTED;
1012 
1013 	switch (sock->type) {
1014 	case SOCK_STREAM:
1015 		sock->ops = &unix_stream_ops;
1016 		break;
1017 		/*
1018 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1019 		 *	nothing uses it.
1020 		 */
1021 	case SOCK_RAW:
1022 		sock->type = SOCK_DGRAM;
1023 		fallthrough;
1024 	case SOCK_DGRAM:
1025 		sock->ops = &unix_dgram_ops;
1026 		break;
1027 	case SOCK_SEQPACKET:
1028 		sock->ops = &unix_seqpacket_ops;
1029 		break;
1030 	default:
1031 		return -ESOCKTNOSUPPORT;
1032 	}
1033 
1034 	sk = unix_create1(net, sock, kern, sock->type);
1035 	if (IS_ERR(sk))
1036 		return PTR_ERR(sk);
1037 
1038 	return 0;
1039 }
1040 
1041 static int unix_release(struct socket *sock)
1042 {
1043 	struct sock *sk = sock->sk;
1044 
1045 	if (!sk)
1046 		return 0;
1047 
1048 	sk->sk_prot->close(sk, 0);
1049 	unix_release_sock(sk, 0);
1050 	sock->sk = NULL;
1051 
1052 	return 0;
1053 }
1054 
1055 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1056 				  int type)
1057 {
1058 	struct inode *inode;
1059 	struct path path;
1060 	struct sock *sk;
1061 	int err;
1062 
1063 	unix_mkname_bsd(sunaddr, addr_len);
1064 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1065 	if (err)
1066 		goto fail;
1067 
1068 	err = path_permission(&path, MAY_WRITE);
1069 	if (err)
1070 		goto path_put;
1071 
1072 	err = -ECONNREFUSED;
1073 	inode = d_backing_inode(path.dentry);
1074 	if (!S_ISSOCK(inode->i_mode))
1075 		goto path_put;
1076 
1077 	sk = unix_find_socket_byinode(inode);
1078 	if (!sk)
1079 		goto path_put;
1080 
1081 	err = -EPROTOTYPE;
1082 	if (sk->sk_type == type)
1083 		touch_atime(&path);
1084 	else
1085 		goto sock_put;
1086 
1087 	path_put(&path);
1088 
1089 	return sk;
1090 
1091 sock_put:
1092 	sock_put(sk);
1093 path_put:
1094 	path_put(&path);
1095 fail:
1096 	return ERR_PTR(err);
1097 }
1098 
1099 static struct sock *unix_find_abstract(struct net *net,
1100 				       struct sockaddr_un *sunaddr,
1101 				       int addr_len, int type)
1102 {
1103 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1104 	struct dentry *dentry;
1105 	struct sock *sk;
1106 
1107 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1108 	if (!sk)
1109 		return ERR_PTR(-ECONNREFUSED);
1110 
1111 	dentry = unix_sk(sk)->path.dentry;
1112 	if (dentry)
1113 		touch_atime(&unix_sk(sk)->path);
1114 
1115 	return sk;
1116 }
1117 
1118 static struct sock *unix_find_other(struct net *net,
1119 				    struct sockaddr_un *sunaddr,
1120 				    int addr_len, int type)
1121 {
1122 	struct sock *sk;
1123 
1124 	if (sunaddr->sun_path[0])
1125 		sk = unix_find_bsd(sunaddr, addr_len, type);
1126 	else
1127 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1128 
1129 	return sk;
1130 }
1131 
1132 static int unix_autobind(struct sock *sk)
1133 {
1134 	struct unix_sock *u = unix_sk(sk);
1135 	unsigned int new_hash, old_hash;
1136 	struct net *net = sock_net(sk);
1137 	struct unix_address *addr;
1138 	u32 lastnum, ordernum;
1139 	int err;
1140 
1141 	err = mutex_lock_interruptible(&u->bindlock);
1142 	if (err)
1143 		return err;
1144 
1145 	if (u->addr)
1146 		goto out;
1147 
1148 	err = -ENOMEM;
1149 	addr = kzalloc(sizeof(*addr) +
1150 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1151 	if (!addr)
1152 		goto out;
1153 
1154 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1155 	addr->name->sun_family = AF_UNIX;
1156 	refcount_set(&addr->refcnt, 1);
1157 
1158 	old_hash = sk->sk_hash;
1159 	ordernum = get_random_u32();
1160 	lastnum = ordernum & 0xFFFFF;
1161 retry:
1162 	ordernum = (ordernum + 1) & 0xFFFFF;
1163 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1164 
1165 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1166 	unix_table_double_lock(net, old_hash, new_hash);
1167 
1168 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1169 		unix_table_double_unlock(net, old_hash, new_hash);
1170 
1171 		/* __unix_find_socket_byname() may take long time if many names
1172 		 * are already in use.
1173 		 */
1174 		cond_resched();
1175 
1176 		if (ordernum == lastnum) {
1177 			/* Give up if all names seems to be in use. */
1178 			err = -ENOSPC;
1179 			unix_release_addr(addr);
1180 			goto out;
1181 		}
1182 
1183 		goto retry;
1184 	}
1185 
1186 	__unix_set_addr_hash(net, sk, addr, new_hash);
1187 	unix_table_double_unlock(net, old_hash, new_hash);
1188 	err = 0;
1189 
1190 out:	mutex_unlock(&u->bindlock);
1191 	return err;
1192 }
1193 
1194 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1195 			 int addr_len)
1196 {
1197 	umode_t mode = S_IFSOCK |
1198 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1199 	struct unix_sock *u = unix_sk(sk);
1200 	unsigned int new_hash, old_hash;
1201 	struct net *net = sock_net(sk);
1202 	struct mnt_idmap *idmap;
1203 	struct unix_address *addr;
1204 	struct dentry *dentry;
1205 	struct path parent;
1206 	int err;
1207 
1208 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1209 	addr = unix_create_addr(sunaddr, addr_len);
1210 	if (!addr)
1211 		return -ENOMEM;
1212 
1213 	/*
1214 	 * Get the parent directory, calculate the hash for last
1215 	 * component.
1216 	 */
1217 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1218 	if (IS_ERR(dentry)) {
1219 		err = PTR_ERR(dentry);
1220 		goto out;
1221 	}
1222 
1223 	/*
1224 	 * All right, let's create it.
1225 	 */
1226 	idmap = mnt_idmap(parent.mnt);
1227 	err = security_path_mknod(&parent, dentry, mode, 0);
1228 	if (!err)
1229 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1230 	if (err)
1231 		goto out_path;
1232 	err = mutex_lock_interruptible(&u->bindlock);
1233 	if (err)
1234 		goto out_unlink;
1235 	if (u->addr)
1236 		goto out_unlock;
1237 
1238 	old_hash = sk->sk_hash;
1239 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1240 	unix_table_double_lock(net, old_hash, new_hash);
1241 	u->path.mnt = mntget(parent.mnt);
1242 	u->path.dentry = dget(dentry);
1243 	__unix_set_addr_hash(net, sk, addr, new_hash);
1244 	unix_table_double_unlock(net, old_hash, new_hash);
1245 	unix_insert_bsd_socket(sk);
1246 	mutex_unlock(&u->bindlock);
1247 	done_path_create(&parent, dentry);
1248 	return 0;
1249 
1250 out_unlock:
1251 	mutex_unlock(&u->bindlock);
1252 	err = -EINVAL;
1253 out_unlink:
1254 	/* failed after successful mknod?  unlink what we'd created... */
1255 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1256 out_path:
1257 	done_path_create(&parent, dentry);
1258 out:
1259 	unix_release_addr(addr);
1260 	return err == -EEXIST ? -EADDRINUSE : err;
1261 }
1262 
1263 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1264 			      int addr_len)
1265 {
1266 	struct unix_sock *u = unix_sk(sk);
1267 	unsigned int new_hash, old_hash;
1268 	struct net *net = sock_net(sk);
1269 	struct unix_address *addr;
1270 	int err;
1271 
1272 	addr = unix_create_addr(sunaddr, addr_len);
1273 	if (!addr)
1274 		return -ENOMEM;
1275 
1276 	err = mutex_lock_interruptible(&u->bindlock);
1277 	if (err)
1278 		goto out;
1279 
1280 	if (u->addr) {
1281 		err = -EINVAL;
1282 		goto out_mutex;
1283 	}
1284 
1285 	old_hash = sk->sk_hash;
1286 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1287 	unix_table_double_lock(net, old_hash, new_hash);
1288 
1289 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1290 		goto out_spin;
1291 
1292 	__unix_set_addr_hash(net, sk, addr, new_hash);
1293 	unix_table_double_unlock(net, old_hash, new_hash);
1294 	mutex_unlock(&u->bindlock);
1295 	return 0;
1296 
1297 out_spin:
1298 	unix_table_double_unlock(net, old_hash, new_hash);
1299 	err = -EADDRINUSE;
1300 out_mutex:
1301 	mutex_unlock(&u->bindlock);
1302 out:
1303 	unix_release_addr(addr);
1304 	return err;
1305 }
1306 
1307 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1308 {
1309 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1310 	struct sock *sk = sock->sk;
1311 	int err;
1312 
1313 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1314 	    sunaddr->sun_family == AF_UNIX)
1315 		return unix_autobind(sk);
1316 
1317 	err = unix_validate_addr(sunaddr, addr_len);
1318 	if (err)
1319 		return err;
1320 
1321 	if (sunaddr->sun_path[0])
1322 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1323 	else
1324 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1325 
1326 	return err;
1327 }
1328 
1329 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1330 {
1331 	if (unlikely(sk1 == sk2) || !sk2) {
1332 		unix_state_lock(sk1);
1333 		return;
1334 	}
1335 	if (sk1 > sk2)
1336 		swap(sk1, sk2);
1337 
1338 	unix_state_lock(sk1);
1339 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1340 }
1341 
1342 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1343 {
1344 	if (unlikely(sk1 == sk2) || !sk2) {
1345 		unix_state_unlock(sk1);
1346 		return;
1347 	}
1348 	unix_state_unlock(sk1);
1349 	unix_state_unlock(sk2);
1350 }
1351 
1352 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1353 			      int alen, int flags)
1354 {
1355 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1356 	struct sock *sk = sock->sk;
1357 	struct sock *other;
1358 	int err;
1359 
1360 	err = -EINVAL;
1361 	if (alen < offsetofend(struct sockaddr, sa_family))
1362 		goto out;
1363 
1364 	if (addr->sa_family != AF_UNSPEC) {
1365 		err = unix_validate_addr(sunaddr, alen);
1366 		if (err)
1367 			goto out;
1368 
1369 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1370 		if (err)
1371 			goto out;
1372 
1373 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1374 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1375 		    !READ_ONCE(unix_sk(sk)->addr)) {
1376 			err = unix_autobind(sk);
1377 			if (err)
1378 				goto out;
1379 		}
1380 
1381 restart:
1382 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1383 		if (IS_ERR(other)) {
1384 			err = PTR_ERR(other);
1385 			goto out;
1386 		}
1387 
1388 		unix_state_double_lock(sk, other);
1389 
1390 		/* Apparently VFS overslept socket death. Retry. */
1391 		if (sock_flag(other, SOCK_DEAD)) {
1392 			unix_state_double_unlock(sk, other);
1393 			sock_put(other);
1394 			goto restart;
1395 		}
1396 
1397 		err = -EPERM;
1398 		if (!unix_may_send(sk, other))
1399 			goto out_unlock;
1400 
1401 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1402 		if (err)
1403 			goto out_unlock;
1404 
1405 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1406 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1407 	} else {
1408 		/*
1409 		 *	1003.1g breaking connected state with AF_UNSPEC
1410 		 */
1411 		other = NULL;
1412 		unix_state_double_lock(sk, other);
1413 	}
1414 
1415 	/*
1416 	 * If it was connected, reconnect.
1417 	 */
1418 	if (unix_peer(sk)) {
1419 		struct sock *old_peer = unix_peer(sk);
1420 
1421 		unix_peer(sk) = other;
1422 		if (!other)
1423 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1424 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1425 
1426 		unix_state_double_unlock(sk, other);
1427 
1428 		if (other != old_peer) {
1429 			unix_dgram_disconnected(sk, old_peer);
1430 
1431 			unix_state_lock(old_peer);
1432 			if (!unix_peer(old_peer))
1433 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1434 			unix_state_unlock(old_peer);
1435 		}
1436 
1437 		sock_put(old_peer);
1438 	} else {
1439 		unix_peer(sk) = other;
1440 		unix_state_double_unlock(sk, other);
1441 	}
1442 
1443 	return 0;
1444 
1445 out_unlock:
1446 	unix_state_double_unlock(sk, other);
1447 	sock_put(other);
1448 out:
1449 	return err;
1450 }
1451 
1452 static long unix_wait_for_peer(struct sock *other, long timeo)
1453 	__releases(&unix_sk(other)->lock)
1454 {
1455 	struct unix_sock *u = unix_sk(other);
1456 	int sched;
1457 	DEFINE_WAIT(wait);
1458 
1459 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1460 
1461 	sched = !sock_flag(other, SOCK_DEAD) &&
1462 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1463 		unix_recvq_full_lockless(other);
1464 
1465 	unix_state_unlock(other);
1466 
1467 	if (sched)
1468 		timeo = schedule_timeout(timeo);
1469 
1470 	finish_wait(&u->peer_wait, &wait);
1471 	return timeo;
1472 }
1473 
1474 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1475 			       int addr_len, int flags)
1476 {
1477 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1478 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1479 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1480 	struct net *net = sock_net(sk);
1481 	struct sk_buff *skb = NULL;
1482 	long timeo;
1483 	int err;
1484 
1485 	err = unix_validate_addr(sunaddr, addr_len);
1486 	if (err)
1487 		goto out;
1488 
1489 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1490 	if (err)
1491 		goto out;
1492 
1493 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1494 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1495 	    !READ_ONCE(u->addr)) {
1496 		err = unix_autobind(sk);
1497 		if (err)
1498 			goto out;
1499 	}
1500 
1501 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1502 
1503 	/* First of all allocate resources.
1504 	   If we will make it after state is locked,
1505 	   we will have to recheck all again in any case.
1506 	 */
1507 
1508 	/* create new sock for complete connection */
1509 	newsk = unix_create1(net, NULL, 0, sock->type);
1510 	if (IS_ERR(newsk)) {
1511 		err = PTR_ERR(newsk);
1512 		newsk = NULL;
1513 		goto out;
1514 	}
1515 
1516 	err = -ENOMEM;
1517 
1518 	/* Allocate skb for sending to listening sock */
1519 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1520 	if (skb == NULL)
1521 		goto out;
1522 
1523 restart:
1524 	/*  Find listening sock. */
1525 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1526 	if (IS_ERR(other)) {
1527 		err = PTR_ERR(other);
1528 		other = NULL;
1529 		goto out;
1530 	}
1531 
1532 	/* Latch state of peer */
1533 	unix_state_lock(other);
1534 
1535 	/* Apparently VFS overslept socket death. Retry. */
1536 	if (sock_flag(other, SOCK_DEAD)) {
1537 		unix_state_unlock(other);
1538 		sock_put(other);
1539 		goto restart;
1540 	}
1541 
1542 	err = -ECONNREFUSED;
1543 	if (other->sk_state != TCP_LISTEN)
1544 		goto out_unlock;
1545 	if (other->sk_shutdown & RCV_SHUTDOWN)
1546 		goto out_unlock;
1547 
1548 	if (unix_recvq_full(other)) {
1549 		err = -EAGAIN;
1550 		if (!timeo)
1551 			goto out_unlock;
1552 
1553 		timeo = unix_wait_for_peer(other, timeo);
1554 
1555 		err = sock_intr_errno(timeo);
1556 		if (signal_pending(current))
1557 			goto out;
1558 		sock_put(other);
1559 		goto restart;
1560 	}
1561 
1562 	/* Latch our state.
1563 
1564 	   It is tricky place. We need to grab our state lock and cannot
1565 	   drop lock on peer. It is dangerous because deadlock is
1566 	   possible. Connect to self case and simultaneous
1567 	   attempt to connect are eliminated by checking socket
1568 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1569 	   check this before attempt to grab lock.
1570 
1571 	   Well, and we have to recheck the state after socket locked.
1572 	 */
1573 	switch (READ_ONCE(sk->sk_state)) {
1574 	case TCP_CLOSE:
1575 		/* This is ok... continue with connect */
1576 		break;
1577 	case TCP_ESTABLISHED:
1578 		/* Socket is already connected */
1579 		err = -EISCONN;
1580 		goto out_unlock;
1581 	default:
1582 		err = -EINVAL;
1583 		goto out_unlock;
1584 	}
1585 
1586 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1587 
1588 	if (sk->sk_state != TCP_CLOSE) {
1589 		unix_state_unlock(sk);
1590 		unix_state_unlock(other);
1591 		sock_put(other);
1592 		goto restart;
1593 	}
1594 
1595 	err = security_unix_stream_connect(sk, other, newsk);
1596 	if (err) {
1597 		unix_state_unlock(sk);
1598 		goto out_unlock;
1599 	}
1600 
1601 	/* The way is open! Fastly set all the necessary fields... */
1602 
1603 	sock_hold(sk);
1604 	unix_peer(newsk)	= sk;
1605 	newsk->sk_state		= TCP_ESTABLISHED;
1606 	newsk->sk_type		= sk->sk_type;
1607 	init_peercred(newsk);
1608 	newu = unix_sk(newsk);
1609 	newu->listener = other;
1610 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1611 	otheru = unix_sk(other);
1612 
1613 	/* copy address information from listening to new sock
1614 	 *
1615 	 * The contents of *(otheru->addr) and otheru->path
1616 	 * are seen fully set up here, since we have found
1617 	 * otheru in hash under its lock.  Insertion into the
1618 	 * hash chain we'd found it in had been done in an
1619 	 * earlier critical area protected by the chain's lock,
1620 	 * the same one where we'd set *(otheru->addr) contents,
1621 	 * as well as otheru->path and otheru->addr itself.
1622 	 *
1623 	 * Using smp_store_release() here to set newu->addr
1624 	 * is enough to make those stores, as well as stores
1625 	 * to newu->path visible to anyone who gets newu->addr
1626 	 * by smp_load_acquire().  IOW, the same warranties
1627 	 * as for unix_sock instances bound in unix_bind() or
1628 	 * in unix_autobind().
1629 	 */
1630 	if (otheru->path.dentry) {
1631 		path_get(&otheru->path);
1632 		newu->path = otheru->path;
1633 	}
1634 	refcount_inc(&otheru->addr->refcnt);
1635 	smp_store_release(&newu->addr, otheru->addr);
1636 
1637 	/* Set credentials */
1638 	copy_peercred(sk, other);
1639 
1640 	sock->state	= SS_CONNECTED;
1641 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1642 	sock_hold(newsk);
1643 
1644 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1645 	unix_peer(sk)	= newsk;
1646 
1647 	unix_state_unlock(sk);
1648 
1649 	/* take ten and send info to listening sock */
1650 	spin_lock(&other->sk_receive_queue.lock);
1651 	__skb_queue_tail(&other->sk_receive_queue, skb);
1652 	spin_unlock(&other->sk_receive_queue.lock);
1653 	unix_state_unlock(other);
1654 	other->sk_data_ready(other);
1655 	sock_put(other);
1656 	return 0;
1657 
1658 out_unlock:
1659 	if (other)
1660 		unix_state_unlock(other);
1661 
1662 out:
1663 	kfree_skb(skb);
1664 	if (newsk)
1665 		unix_release_sock(newsk, 0);
1666 	if (other)
1667 		sock_put(other);
1668 	return err;
1669 }
1670 
1671 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1672 {
1673 	struct sock *ska = socka->sk, *skb = sockb->sk;
1674 
1675 	/* Join our sockets back to back */
1676 	sock_hold(ska);
1677 	sock_hold(skb);
1678 	unix_peer(ska) = skb;
1679 	unix_peer(skb) = ska;
1680 	init_peercred(ska);
1681 	init_peercred(skb);
1682 
1683 	ska->sk_state = TCP_ESTABLISHED;
1684 	skb->sk_state = TCP_ESTABLISHED;
1685 	socka->state  = SS_CONNECTED;
1686 	sockb->state  = SS_CONNECTED;
1687 	return 0;
1688 }
1689 
1690 static void unix_sock_inherit_flags(const struct socket *old,
1691 				    struct socket *new)
1692 {
1693 	if (test_bit(SOCK_PASSCRED, &old->flags))
1694 		set_bit(SOCK_PASSCRED, &new->flags);
1695 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1696 		set_bit(SOCK_PASSPIDFD, &new->flags);
1697 	if (test_bit(SOCK_PASSSEC, &old->flags))
1698 		set_bit(SOCK_PASSSEC, &new->flags);
1699 }
1700 
1701 static int unix_accept(struct socket *sock, struct socket *newsock,
1702 		       struct proto_accept_arg *arg)
1703 {
1704 	struct sock *sk = sock->sk;
1705 	struct sk_buff *skb;
1706 	struct sock *tsk;
1707 
1708 	arg->err = -EOPNOTSUPP;
1709 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1710 		goto out;
1711 
1712 	arg->err = -EINVAL;
1713 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1714 		goto out;
1715 
1716 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1717 	 * so that no locks are necessary.
1718 	 */
1719 
1720 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1721 				&arg->err);
1722 	if (!skb) {
1723 		/* This means receive shutdown. */
1724 		if (arg->err == 0)
1725 			arg->err = -EINVAL;
1726 		goto out;
1727 	}
1728 
1729 	tsk = skb->sk;
1730 	skb_free_datagram(sk, skb);
1731 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1732 
1733 	/* attach accepted sock to socket */
1734 	unix_state_lock(tsk);
1735 	unix_update_edges(unix_sk(tsk));
1736 	newsock->state = SS_CONNECTED;
1737 	unix_sock_inherit_flags(sock, newsock);
1738 	sock_graft(tsk, newsock);
1739 	unix_state_unlock(tsk);
1740 	return 0;
1741 
1742 out:
1743 	return arg->err;
1744 }
1745 
1746 
1747 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1748 {
1749 	struct sock *sk = sock->sk;
1750 	struct unix_address *addr;
1751 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1752 	int err = 0;
1753 
1754 	if (peer) {
1755 		sk = unix_peer_get(sk);
1756 
1757 		err = -ENOTCONN;
1758 		if (!sk)
1759 			goto out;
1760 		err = 0;
1761 	} else {
1762 		sock_hold(sk);
1763 	}
1764 
1765 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1766 	if (!addr) {
1767 		sunaddr->sun_family = AF_UNIX;
1768 		sunaddr->sun_path[0] = 0;
1769 		err = offsetof(struct sockaddr_un, sun_path);
1770 	} else {
1771 		err = addr->len;
1772 		memcpy(sunaddr, addr->name, addr->len);
1773 
1774 		if (peer)
1775 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1776 					       CGROUP_UNIX_GETPEERNAME);
1777 		else
1778 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1779 					       CGROUP_UNIX_GETSOCKNAME);
1780 	}
1781 	sock_put(sk);
1782 out:
1783 	return err;
1784 }
1785 
1786 /* The "user->unix_inflight" variable is protected by the garbage
1787  * collection lock, and we just read it locklessly here. If you go
1788  * over the limit, there might be a tiny race in actually noticing
1789  * it across threads. Tough.
1790  */
1791 static inline bool too_many_unix_fds(struct task_struct *p)
1792 {
1793 	struct user_struct *user = current_user();
1794 
1795 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1796 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1797 	return false;
1798 }
1799 
1800 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1801 {
1802 	if (too_many_unix_fds(current))
1803 		return -ETOOMANYREFS;
1804 
1805 	UNIXCB(skb).fp = scm->fp;
1806 	scm->fp = NULL;
1807 
1808 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1809 		return -ENOMEM;
1810 
1811 	return 0;
1812 }
1813 
1814 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1815 {
1816 	scm->fp = UNIXCB(skb).fp;
1817 	UNIXCB(skb).fp = NULL;
1818 
1819 	unix_destroy_fpl(scm->fp);
1820 }
1821 
1822 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1823 {
1824 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1825 }
1826 
1827 static void unix_destruct_scm(struct sk_buff *skb)
1828 {
1829 	struct scm_cookie scm;
1830 
1831 	memset(&scm, 0, sizeof(scm));
1832 	scm.pid  = UNIXCB(skb).pid;
1833 	if (UNIXCB(skb).fp)
1834 		unix_detach_fds(&scm, skb);
1835 
1836 	/* Alas, it calls VFS */
1837 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1838 	scm_destroy(&scm);
1839 	sock_wfree(skb);
1840 }
1841 
1842 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1843 {
1844 	int err = 0;
1845 
1846 	UNIXCB(skb).pid  = get_pid(scm->pid);
1847 	UNIXCB(skb).uid = scm->creds.uid;
1848 	UNIXCB(skb).gid = scm->creds.gid;
1849 	UNIXCB(skb).fp = NULL;
1850 	unix_get_secdata(scm, skb);
1851 	if (scm->fp && send_fds)
1852 		err = unix_attach_fds(scm, skb);
1853 
1854 	skb->destructor = unix_destruct_scm;
1855 	return err;
1856 }
1857 
1858 static bool unix_passcred_enabled(const struct socket *sock,
1859 				  const struct sock *other)
1860 {
1861 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1862 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1863 	       !other->sk_socket ||
1864 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1865 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1866 }
1867 
1868 /*
1869  * Some apps rely on write() giving SCM_CREDENTIALS
1870  * We include credentials if source or destination socket
1871  * asserted SOCK_PASSCRED.
1872  */
1873 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1874 			    const struct sock *other)
1875 {
1876 	if (UNIXCB(skb).pid)
1877 		return;
1878 	if (unix_passcred_enabled(sock, other)) {
1879 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1880 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1881 	}
1882 }
1883 
1884 static bool unix_skb_scm_eq(struct sk_buff *skb,
1885 			    struct scm_cookie *scm)
1886 {
1887 	return UNIXCB(skb).pid == scm->pid &&
1888 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1889 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1890 	       unix_secdata_eq(scm, skb);
1891 }
1892 
1893 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1894 {
1895 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1896 	struct unix_sock *u = unix_sk(sk);
1897 
1898 	if (unlikely(fp && fp->count)) {
1899 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1900 		unix_add_edges(fp, u);
1901 	}
1902 }
1903 
1904 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1905 {
1906 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1907 	struct unix_sock *u = unix_sk(sk);
1908 
1909 	if (unlikely(fp && fp->count)) {
1910 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1911 		unix_del_edges(fp);
1912 	}
1913 }
1914 
1915 /*
1916  *	Send AF_UNIX data.
1917  */
1918 
1919 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1920 			      size_t len)
1921 {
1922 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1923 	struct sock *sk = sock->sk, *other = NULL;
1924 	struct unix_sock *u = unix_sk(sk);
1925 	struct scm_cookie scm;
1926 	struct sk_buff *skb;
1927 	int data_len = 0;
1928 	int sk_locked;
1929 	long timeo;
1930 	int err;
1931 
1932 	err = scm_send(sock, msg, &scm, false);
1933 	if (err < 0)
1934 		return err;
1935 
1936 	wait_for_unix_gc(scm.fp);
1937 
1938 	err = -EOPNOTSUPP;
1939 	if (msg->msg_flags&MSG_OOB)
1940 		goto out;
1941 
1942 	if (msg->msg_namelen) {
1943 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1944 		if (err)
1945 			goto out;
1946 
1947 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1948 							    msg->msg_name,
1949 							    &msg->msg_namelen,
1950 							    NULL);
1951 		if (err)
1952 			goto out;
1953 	} else {
1954 		sunaddr = NULL;
1955 		err = -ENOTCONN;
1956 		other = unix_peer_get(sk);
1957 		if (!other)
1958 			goto out;
1959 	}
1960 
1961 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1962 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1963 	    !READ_ONCE(u->addr)) {
1964 		err = unix_autobind(sk);
1965 		if (err)
1966 			goto out;
1967 	}
1968 
1969 	err = -EMSGSIZE;
1970 	if (len > sk->sk_sndbuf - 32)
1971 		goto out;
1972 
1973 	if (len > SKB_MAX_ALLOC) {
1974 		data_len = min_t(size_t,
1975 				 len - SKB_MAX_ALLOC,
1976 				 MAX_SKB_FRAGS * PAGE_SIZE);
1977 		data_len = PAGE_ALIGN(data_len);
1978 
1979 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1980 	}
1981 
1982 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1983 				   msg->msg_flags & MSG_DONTWAIT, &err,
1984 				   PAGE_ALLOC_COSTLY_ORDER);
1985 	if (skb == NULL)
1986 		goto out;
1987 
1988 	err = unix_scm_to_skb(&scm, skb, true);
1989 	if (err < 0)
1990 		goto out_free;
1991 
1992 	skb_put(skb, len - data_len);
1993 	skb->data_len = data_len;
1994 	skb->len = len;
1995 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1996 	if (err)
1997 		goto out_free;
1998 
1999 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2000 
2001 restart:
2002 	if (!other) {
2003 		err = -ECONNRESET;
2004 		if (sunaddr == NULL)
2005 			goto out_free;
2006 
2007 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2008 					sk->sk_type);
2009 		if (IS_ERR(other)) {
2010 			err = PTR_ERR(other);
2011 			other = NULL;
2012 			goto out_free;
2013 		}
2014 	}
2015 
2016 	if (sk_filter(other, skb) < 0) {
2017 		/* Toss the packet but do not return any error to the sender */
2018 		err = len;
2019 		goto out_free;
2020 	}
2021 
2022 	sk_locked = 0;
2023 	unix_state_lock(other);
2024 restart_locked:
2025 	err = -EPERM;
2026 	if (!unix_may_send(sk, other))
2027 		goto out_unlock;
2028 
2029 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2030 		/*
2031 		 *	Check with 1003.1g - what should
2032 		 *	datagram error
2033 		 */
2034 		unix_state_unlock(other);
2035 		sock_put(other);
2036 
2037 		if (!sk_locked)
2038 			unix_state_lock(sk);
2039 
2040 		err = 0;
2041 		if (sk->sk_type == SOCK_SEQPACKET) {
2042 			/* We are here only when racing with unix_release_sock()
2043 			 * is clearing @other. Never change state to TCP_CLOSE
2044 			 * unlike SOCK_DGRAM wants.
2045 			 */
2046 			unix_state_unlock(sk);
2047 			err = -EPIPE;
2048 		} else if (unix_peer(sk) == other) {
2049 			unix_peer(sk) = NULL;
2050 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2051 
2052 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2053 			unix_state_unlock(sk);
2054 
2055 			unix_dgram_disconnected(sk, other);
2056 			sock_put(other);
2057 			err = -ECONNREFUSED;
2058 		} else {
2059 			unix_state_unlock(sk);
2060 		}
2061 
2062 		other = NULL;
2063 		if (err)
2064 			goto out_free;
2065 		goto restart;
2066 	}
2067 
2068 	err = -EPIPE;
2069 	if (other->sk_shutdown & RCV_SHUTDOWN)
2070 		goto out_unlock;
2071 
2072 	if (sk->sk_type != SOCK_SEQPACKET) {
2073 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2074 		if (err)
2075 			goto out_unlock;
2076 	}
2077 
2078 	/* other == sk && unix_peer(other) != sk if
2079 	 * - unix_peer(sk) == NULL, destination address bound to sk
2080 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2081 	 */
2082 	if (other != sk &&
2083 	    unlikely(unix_peer(other) != sk &&
2084 	    unix_recvq_full_lockless(other))) {
2085 		if (timeo) {
2086 			timeo = unix_wait_for_peer(other, timeo);
2087 
2088 			err = sock_intr_errno(timeo);
2089 			if (signal_pending(current))
2090 				goto out_free;
2091 
2092 			goto restart;
2093 		}
2094 
2095 		if (!sk_locked) {
2096 			unix_state_unlock(other);
2097 			unix_state_double_lock(sk, other);
2098 		}
2099 
2100 		if (unix_peer(sk) != other ||
2101 		    unix_dgram_peer_wake_me(sk, other)) {
2102 			err = -EAGAIN;
2103 			sk_locked = 1;
2104 			goto out_unlock;
2105 		}
2106 
2107 		if (!sk_locked) {
2108 			sk_locked = 1;
2109 			goto restart_locked;
2110 		}
2111 	}
2112 
2113 	if (unlikely(sk_locked))
2114 		unix_state_unlock(sk);
2115 
2116 	if (sock_flag(other, SOCK_RCVTSTAMP))
2117 		__net_timestamp(skb);
2118 	maybe_add_creds(skb, sock, other);
2119 	scm_stat_add(other, skb);
2120 	skb_queue_tail(&other->sk_receive_queue, skb);
2121 	unix_state_unlock(other);
2122 	other->sk_data_ready(other);
2123 	sock_put(other);
2124 	scm_destroy(&scm);
2125 	return len;
2126 
2127 out_unlock:
2128 	if (sk_locked)
2129 		unix_state_unlock(sk);
2130 	unix_state_unlock(other);
2131 out_free:
2132 	kfree_skb(skb);
2133 out:
2134 	if (other)
2135 		sock_put(other);
2136 	scm_destroy(&scm);
2137 	return err;
2138 }
2139 
2140 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2141  * bytes, and a minimum of a full page.
2142  */
2143 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2144 
2145 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2146 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2147 		     struct scm_cookie *scm, bool fds_sent)
2148 {
2149 	struct unix_sock *ousk = unix_sk(other);
2150 	struct sk_buff *skb;
2151 	int err = 0;
2152 
2153 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2154 
2155 	if (!skb)
2156 		return err;
2157 
2158 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2159 	if (err < 0) {
2160 		kfree_skb(skb);
2161 		return err;
2162 	}
2163 	skb_put(skb, 1);
2164 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2165 
2166 	if (err) {
2167 		kfree_skb(skb);
2168 		return err;
2169 	}
2170 
2171 	unix_state_lock(other);
2172 
2173 	if (sock_flag(other, SOCK_DEAD) ||
2174 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2175 		unix_state_unlock(other);
2176 		kfree_skb(skb);
2177 		return -EPIPE;
2178 	}
2179 
2180 	maybe_add_creds(skb, sock, other);
2181 	skb_get(skb);
2182 
2183 	scm_stat_add(other, skb);
2184 
2185 	spin_lock(&other->sk_receive_queue.lock);
2186 	if (ousk->oob_skb)
2187 		consume_skb(ousk->oob_skb);
2188 	WRITE_ONCE(ousk->oob_skb, skb);
2189 	__skb_queue_tail(&other->sk_receive_queue, skb);
2190 	spin_unlock(&other->sk_receive_queue.lock);
2191 
2192 	sk_send_sigurg(other);
2193 	unix_state_unlock(other);
2194 	other->sk_data_ready(other);
2195 
2196 	return err;
2197 }
2198 #endif
2199 
2200 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2201 			       size_t len)
2202 {
2203 	struct sock *sk = sock->sk;
2204 	struct sock *other = NULL;
2205 	int err, size;
2206 	struct sk_buff *skb;
2207 	int sent = 0;
2208 	struct scm_cookie scm;
2209 	bool fds_sent = false;
2210 	int data_len;
2211 
2212 	err = scm_send(sock, msg, &scm, false);
2213 	if (err < 0)
2214 		return err;
2215 
2216 	wait_for_unix_gc(scm.fp);
2217 
2218 	err = -EOPNOTSUPP;
2219 	if (msg->msg_flags & MSG_OOB) {
2220 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2221 		if (len)
2222 			len--;
2223 		else
2224 #endif
2225 			goto out_err;
2226 	}
2227 
2228 	if (msg->msg_namelen) {
2229 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2230 		goto out_err;
2231 	} else {
2232 		err = -ENOTCONN;
2233 		other = unix_peer(sk);
2234 		if (!other)
2235 			goto out_err;
2236 	}
2237 
2238 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2239 		goto pipe_err;
2240 
2241 	while (sent < len) {
2242 		size = len - sent;
2243 
2244 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2245 			skb = sock_alloc_send_pskb(sk, 0, 0,
2246 						   msg->msg_flags & MSG_DONTWAIT,
2247 						   &err, 0);
2248 		} else {
2249 			/* Keep two messages in the pipe so it schedules better */
2250 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2251 
2252 			/* allow fallback to order-0 allocations */
2253 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2254 
2255 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2256 
2257 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2258 
2259 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2260 						   msg->msg_flags & MSG_DONTWAIT, &err,
2261 						   get_order(UNIX_SKB_FRAGS_SZ));
2262 		}
2263 		if (!skb)
2264 			goto out_err;
2265 
2266 		/* Only send the fds in the first buffer */
2267 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2268 		if (err < 0) {
2269 			kfree_skb(skb);
2270 			goto out_err;
2271 		}
2272 		fds_sent = true;
2273 
2274 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2275 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2276 						   sk->sk_allocation);
2277 			if (err < 0) {
2278 				kfree_skb(skb);
2279 				goto out_err;
2280 			}
2281 			size = err;
2282 			refcount_add(size, &sk->sk_wmem_alloc);
2283 		} else {
2284 			skb_put(skb, size - data_len);
2285 			skb->data_len = data_len;
2286 			skb->len = size;
2287 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2288 			if (err) {
2289 				kfree_skb(skb);
2290 				goto out_err;
2291 			}
2292 		}
2293 
2294 		unix_state_lock(other);
2295 
2296 		if (sock_flag(other, SOCK_DEAD) ||
2297 		    (other->sk_shutdown & RCV_SHUTDOWN))
2298 			goto pipe_err_free;
2299 
2300 		maybe_add_creds(skb, sock, other);
2301 		scm_stat_add(other, skb);
2302 		skb_queue_tail(&other->sk_receive_queue, skb);
2303 		unix_state_unlock(other);
2304 		other->sk_data_ready(other);
2305 		sent += size;
2306 	}
2307 
2308 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2309 	if (msg->msg_flags & MSG_OOB) {
2310 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2311 		if (err)
2312 			goto out_err;
2313 		sent++;
2314 	}
2315 #endif
2316 
2317 	scm_destroy(&scm);
2318 
2319 	return sent;
2320 
2321 pipe_err_free:
2322 	unix_state_unlock(other);
2323 	kfree_skb(skb);
2324 pipe_err:
2325 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2326 		send_sig(SIGPIPE, current, 0);
2327 	err = -EPIPE;
2328 out_err:
2329 	scm_destroy(&scm);
2330 	return sent ? : err;
2331 }
2332 
2333 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2334 				  size_t len)
2335 {
2336 	int err;
2337 	struct sock *sk = sock->sk;
2338 
2339 	err = sock_error(sk);
2340 	if (err)
2341 		return err;
2342 
2343 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2344 		return -ENOTCONN;
2345 
2346 	if (msg->msg_namelen)
2347 		msg->msg_namelen = 0;
2348 
2349 	return unix_dgram_sendmsg(sock, msg, len);
2350 }
2351 
2352 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2353 				  size_t size, int flags)
2354 {
2355 	struct sock *sk = sock->sk;
2356 
2357 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2358 		return -ENOTCONN;
2359 
2360 	return unix_dgram_recvmsg(sock, msg, size, flags);
2361 }
2362 
2363 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2364 {
2365 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2366 
2367 	if (addr) {
2368 		msg->msg_namelen = addr->len;
2369 		memcpy(msg->msg_name, addr->name, addr->len);
2370 	}
2371 }
2372 
2373 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2374 			 int flags)
2375 {
2376 	struct scm_cookie scm;
2377 	struct socket *sock = sk->sk_socket;
2378 	struct unix_sock *u = unix_sk(sk);
2379 	struct sk_buff *skb, *last;
2380 	long timeo;
2381 	int skip;
2382 	int err;
2383 
2384 	err = -EOPNOTSUPP;
2385 	if (flags&MSG_OOB)
2386 		goto out;
2387 
2388 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2389 
2390 	do {
2391 		mutex_lock(&u->iolock);
2392 
2393 		skip = sk_peek_offset(sk, flags);
2394 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2395 					      &skip, &err, &last);
2396 		if (skb) {
2397 			if (!(flags & MSG_PEEK))
2398 				scm_stat_del(sk, skb);
2399 			break;
2400 		}
2401 
2402 		mutex_unlock(&u->iolock);
2403 
2404 		if (err != -EAGAIN)
2405 			break;
2406 	} while (timeo &&
2407 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2408 					      &err, &timeo, last));
2409 
2410 	if (!skb) { /* implies iolock unlocked */
2411 		unix_state_lock(sk);
2412 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2413 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2414 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2415 			err = 0;
2416 		unix_state_unlock(sk);
2417 		goto out;
2418 	}
2419 
2420 	if (wq_has_sleeper(&u->peer_wait))
2421 		wake_up_interruptible_sync_poll(&u->peer_wait,
2422 						EPOLLOUT | EPOLLWRNORM |
2423 						EPOLLWRBAND);
2424 
2425 	if (msg->msg_name) {
2426 		unix_copy_addr(msg, skb->sk);
2427 
2428 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2429 						      msg->msg_name,
2430 						      &msg->msg_namelen);
2431 	}
2432 
2433 	if (size > skb->len - skip)
2434 		size = skb->len - skip;
2435 	else if (size < skb->len - skip)
2436 		msg->msg_flags |= MSG_TRUNC;
2437 
2438 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2439 	if (err)
2440 		goto out_free;
2441 
2442 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2443 		__sock_recv_timestamp(msg, sk, skb);
2444 
2445 	memset(&scm, 0, sizeof(scm));
2446 
2447 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2448 	unix_set_secdata(&scm, skb);
2449 
2450 	if (!(flags & MSG_PEEK)) {
2451 		if (UNIXCB(skb).fp)
2452 			unix_detach_fds(&scm, skb);
2453 
2454 		sk_peek_offset_bwd(sk, skb->len);
2455 	} else {
2456 		/* It is questionable: on PEEK we could:
2457 		   - do not return fds - good, but too simple 8)
2458 		   - return fds, and do not return them on read (old strategy,
2459 		     apparently wrong)
2460 		   - clone fds (I chose it for now, it is the most universal
2461 		     solution)
2462 
2463 		   POSIX 1003.1g does not actually define this clearly
2464 		   at all. POSIX 1003.1g doesn't define a lot of things
2465 		   clearly however!
2466 
2467 		*/
2468 
2469 		sk_peek_offset_fwd(sk, size);
2470 
2471 		if (UNIXCB(skb).fp)
2472 			unix_peek_fds(&scm, skb);
2473 	}
2474 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2475 
2476 	scm_recv_unix(sock, msg, &scm, flags);
2477 
2478 out_free:
2479 	skb_free_datagram(sk, skb);
2480 	mutex_unlock(&u->iolock);
2481 out:
2482 	return err;
2483 }
2484 
2485 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2486 			      int flags)
2487 {
2488 	struct sock *sk = sock->sk;
2489 
2490 #ifdef CONFIG_BPF_SYSCALL
2491 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2492 
2493 	if (prot != &unix_dgram_proto)
2494 		return prot->recvmsg(sk, msg, size, flags, NULL);
2495 #endif
2496 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2497 }
2498 
2499 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2500 {
2501 	struct unix_sock *u = unix_sk(sk);
2502 	struct sk_buff *skb;
2503 	int err;
2504 
2505 	mutex_lock(&u->iolock);
2506 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2507 	mutex_unlock(&u->iolock);
2508 	if (!skb)
2509 		return err;
2510 
2511 	return recv_actor(sk, skb);
2512 }
2513 
2514 /*
2515  *	Sleep until more data has arrived. But check for races..
2516  */
2517 static long unix_stream_data_wait(struct sock *sk, long timeo,
2518 				  struct sk_buff *last, unsigned int last_len,
2519 				  bool freezable)
2520 {
2521 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2522 	struct sk_buff *tail;
2523 	DEFINE_WAIT(wait);
2524 
2525 	unix_state_lock(sk);
2526 
2527 	for (;;) {
2528 		prepare_to_wait(sk_sleep(sk), &wait, state);
2529 
2530 		tail = skb_peek_tail(&sk->sk_receive_queue);
2531 		if (tail != last ||
2532 		    (tail && tail->len != last_len) ||
2533 		    sk->sk_err ||
2534 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2535 		    signal_pending(current) ||
2536 		    !timeo)
2537 			break;
2538 
2539 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2540 		unix_state_unlock(sk);
2541 		timeo = schedule_timeout(timeo);
2542 		unix_state_lock(sk);
2543 
2544 		if (sock_flag(sk, SOCK_DEAD))
2545 			break;
2546 
2547 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2548 	}
2549 
2550 	finish_wait(sk_sleep(sk), &wait);
2551 	unix_state_unlock(sk);
2552 	return timeo;
2553 }
2554 
2555 static unsigned int unix_skb_len(const struct sk_buff *skb)
2556 {
2557 	return skb->len - UNIXCB(skb).consumed;
2558 }
2559 
2560 struct unix_stream_read_state {
2561 	int (*recv_actor)(struct sk_buff *, int, int,
2562 			  struct unix_stream_read_state *);
2563 	struct socket *socket;
2564 	struct msghdr *msg;
2565 	struct pipe_inode_info *pipe;
2566 	size_t size;
2567 	int flags;
2568 	unsigned int splice_flags;
2569 };
2570 
2571 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2572 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2573 {
2574 	struct socket *sock = state->socket;
2575 	struct sock *sk = sock->sk;
2576 	struct unix_sock *u = unix_sk(sk);
2577 	int chunk = 1;
2578 	struct sk_buff *oob_skb;
2579 
2580 	mutex_lock(&u->iolock);
2581 	unix_state_lock(sk);
2582 	spin_lock(&sk->sk_receive_queue.lock);
2583 
2584 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2585 		spin_unlock(&sk->sk_receive_queue.lock);
2586 		unix_state_unlock(sk);
2587 		mutex_unlock(&u->iolock);
2588 		return -EINVAL;
2589 	}
2590 
2591 	oob_skb = u->oob_skb;
2592 
2593 	if (!(state->flags & MSG_PEEK))
2594 		WRITE_ONCE(u->oob_skb, NULL);
2595 	else
2596 		skb_get(oob_skb);
2597 
2598 	spin_unlock(&sk->sk_receive_queue.lock);
2599 	unix_state_unlock(sk);
2600 
2601 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2602 
2603 	if (!(state->flags & MSG_PEEK))
2604 		UNIXCB(oob_skb).consumed += 1;
2605 
2606 	consume_skb(oob_skb);
2607 
2608 	mutex_unlock(&u->iolock);
2609 
2610 	if (chunk < 0)
2611 		return -EFAULT;
2612 
2613 	state->msg->msg_flags |= MSG_OOB;
2614 	return 1;
2615 }
2616 
2617 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2618 				  int flags, int copied)
2619 {
2620 	struct unix_sock *u = unix_sk(sk);
2621 
2622 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2623 		skb_unlink(skb, &sk->sk_receive_queue);
2624 		consume_skb(skb);
2625 		skb = NULL;
2626 	} else {
2627 		struct sk_buff *unlinked_skb = NULL;
2628 
2629 		spin_lock(&sk->sk_receive_queue.lock);
2630 
2631 		if (skb == u->oob_skb) {
2632 			if (copied) {
2633 				skb = NULL;
2634 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2635 				if (!(flags & MSG_PEEK)) {
2636 					WRITE_ONCE(u->oob_skb, NULL);
2637 					consume_skb(skb);
2638 				}
2639 			} else if (flags & MSG_PEEK) {
2640 				skb = NULL;
2641 			} else {
2642 				__skb_unlink(skb, &sk->sk_receive_queue);
2643 				WRITE_ONCE(u->oob_skb, NULL);
2644 				unlinked_skb = skb;
2645 				skb = skb_peek(&sk->sk_receive_queue);
2646 			}
2647 		}
2648 
2649 		spin_unlock(&sk->sk_receive_queue.lock);
2650 
2651 		if (unlinked_skb) {
2652 			WARN_ON_ONCE(skb_unref(unlinked_skb));
2653 			kfree_skb(unlinked_skb);
2654 		}
2655 	}
2656 	return skb;
2657 }
2658 #endif
2659 
2660 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2661 {
2662 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2663 		return -ENOTCONN;
2664 
2665 	return unix_read_skb(sk, recv_actor);
2666 }
2667 
2668 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2669 				    bool freezable)
2670 {
2671 	struct scm_cookie scm;
2672 	struct socket *sock = state->socket;
2673 	struct sock *sk = sock->sk;
2674 	struct unix_sock *u = unix_sk(sk);
2675 	int copied = 0;
2676 	int flags = state->flags;
2677 	int noblock = flags & MSG_DONTWAIT;
2678 	bool check_creds = false;
2679 	int target;
2680 	int err = 0;
2681 	long timeo;
2682 	int skip;
2683 	size_t size = state->size;
2684 	unsigned int last_len;
2685 
2686 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2687 		err = -EINVAL;
2688 		goto out;
2689 	}
2690 
2691 	if (unlikely(flags & MSG_OOB)) {
2692 		err = -EOPNOTSUPP;
2693 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2694 		err = unix_stream_recv_urg(state);
2695 #endif
2696 		goto out;
2697 	}
2698 
2699 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2700 	timeo = sock_rcvtimeo(sk, noblock);
2701 
2702 	memset(&scm, 0, sizeof(scm));
2703 
2704 	/* Lock the socket to prevent queue disordering
2705 	 * while sleeps in memcpy_tomsg
2706 	 */
2707 	mutex_lock(&u->iolock);
2708 
2709 	skip = max(sk_peek_offset(sk, flags), 0);
2710 
2711 	do {
2712 		int chunk;
2713 		bool drop_skb;
2714 		struct sk_buff *skb, *last;
2715 
2716 redo:
2717 		unix_state_lock(sk);
2718 		if (sock_flag(sk, SOCK_DEAD)) {
2719 			err = -ECONNRESET;
2720 			goto unlock;
2721 		}
2722 		last = skb = skb_peek(&sk->sk_receive_queue);
2723 		last_len = last ? last->len : 0;
2724 
2725 again:
2726 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2727 		if (skb) {
2728 			skb = manage_oob(skb, sk, flags, copied);
2729 			if (!skb && copied) {
2730 				unix_state_unlock(sk);
2731 				break;
2732 			}
2733 		}
2734 #endif
2735 		if (skb == NULL) {
2736 			if (copied >= target)
2737 				goto unlock;
2738 
2739 			/*
2740 			 *	POSIX 1003.1g mandates this order.
2741 			 */
2742 
2743 			err = sock_error(sk);
2744 			if (err)
2745 				goto unlock;
2746 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2747 				goto unlock;
2748 
2749 			unix_state_unlock(sk);
2750 			if (!timeo) {
2751 				err = -EAGAIN;
2752 				break;
2753 			}
2754 
2755 			mutex_unlock(&u->iolock);
2756 
2757 			timeo = unix_stream_data_wait(sk, timeo, last,
2758 						      last_len, freezable);
2759 
2760 			if (signal_pending(current)) {
2761 				err = sock_intr_errno(timeo);
2762 				scm_destroy(&scm);
2763 				goto out;
2764 			}
2765 
2766 			mutex_lock(&u->iolock);
2767 			goto redo;
2768 unlock:
2769 			unix_state_unlock(sk);
2770 			break;
2771 		}
2772 
2773 		while (skip >= unix_skb_len(skb)) {
2774 			skip -= unix_skb_len(skb);
2775 			last = skb;
2776 			last_len = skb->len;
2777 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2778 			if (!skb)
2779 				goto again;
2780 		}
2781 
2782 		unix_state_unlock(sk);
2783 
2784 		if (check_creds) {
2785 			/* Never glue messages from different writers */
2786 			if (!unix_skb_scm_eq(skb, &scm))
2787 				break;
2788 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2789 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2790 			/* Copy credentials */
2791 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2792 			unix_set_secdata(&scm, skb);
2793 			check_creds = true;
2794 		}
2795 
2796 		/* Copy address just once */
2797 		if (state->msg && state->msg->msg_name) {
2798 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2799 					 state->msg->msg_name);
2800 			unix_copy_addr(state->msg, skb->sk);
2801 
2802 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2803 							      state->msg->msg_name,
2804 							      &state->msg->msg_namelen);
2805 
2806 			sunaddr = NULL;
2807 		}
2808 
2809 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2810 		skb_get(skb);
2811 		chunk = state->recv_actor(skb, skip, chunk, state);
2812 		drop_skb = !unix_skb_len(skb);
2813 		/* skb is only safe to use if !drop_skb */
2814 		consume_skb(skb);
2815 		if (chunk < 0) {
2816 			if (copied == 0)
2817 				copied = -EFAULT;
2818 			break;
2819 		}
2820 		copied += chunk;
2821 		size -= chunk;
2822 
2823 		if (drop_skb) {
2824 			/* the skb was touched by a concurrent reader;
2825 			 * we should not expect anything from this skb
2826 			 * anymore and assume it invalid - we can be
2827 			 * sure it was dropped from the socket queue
2828 			 *
2829 			 * let's report a short read
2830 			 */
2831 			err = 0;
2832 			break;
2833 		}
2834 
2835 		/* Mark read part of skb as used */
2836 		if (!(flags & MSG_PEEK)) {
2837 			UNIXCB(skb).consumed += chunk;
2838 
2839 			sk_peek_offset_bwd(sk, chunk);
2840 
2841 			if (UNIXCB(skb).fp) {
2842 				scm_stat_del(sk, skb);
2843 				unix_detach_fds(&scm, skb);
2844 			}
2845 
2846 			if (unix_skb_len(skb))
2847 				break;
2848 
2849 			skb_unlink(skb, &sk->sk_receive_queue);
2850 			consume_skb(skb);
2851 
2852 			if (scm.fp)
2853 				break;
2854 		} else {
2855 			/* It is questionable, see note in unix_dgram_recvmsg.
2856 			 */
2857 			if (UNIXCB(skb).fp)
2858 				unix_peek_fds(&scm, skb);
2859 
2860 			sk_peek_offset_fwd(sk, chunk);
2861 
2862 			if (UNIXCB(skb).fp)
2863 				break;
2864 
2865 			skip = 0;
2866 			last = skb;
2867 			last_len = skb->len;
2868 			unix_state_lock(sk);
2869 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2870 			if (skb)
2871 				goto again;
2872 			unix_state_unlock(sk);
2873 			break;
2874 		}
2875 	} while (size);
2876 
2877 	mutex_unlock(&u->iolock);
2878 	if (state->msg)
2879 		scm_recv_unix(sock, state->msg, &scm, flags);
2880 	else
2881 		scm_destroy(&scm);
2882 out:
2883 	return copied ? : err;
2884 }
2885 
2886 static int unix_stream_read_actor(struct sk_buff *skb,
2887 				  int skip, int chunk,
2888 				  struct unix_stream_read_state *state)
2889 {
2890 	int ret;
2891 
2892 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2893 				    state->msg, chunk);
2894 	return ret ?: chunk;
2895 }
2896 
2897 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2898 			  size_t size, int flags)
2899 {
2900 	struct unix_stream_read_state state = {
2901 		.recv_actor = unix_stream_read_actor,
2902 		.socket = sk->sk_socket,
2903 		.msg = msg,
2904 		.size = size,
2905 		.flags = flags
2906 	};
2907 
2908 	return unix_stream_read_generic(&state, true);
2909 }
2910 
2911 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2912 			       size_t size, int flags)
2913 {
2914 	struct unix_stream_read_state state = {
2915 		.recv_actor = unix_stream_read_actor,
2916 		.socket = sock,
2917 		.msg = msg,
2918 		.size = size,
2919 		.flags = flags
2920 	};
2921 
2922 #ifdef CONFIG_BPF_SYSCALL
2923 	struct sock *sk = sock->sk;
2924 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2925 
2926 	if (prot != &unix_stream_proto)
2927 		return prot->recvmsg(sk, msg, size, flags, NULL);
2928 #endif
2929 	return unix_stream_read_generic(&state, true);
2930 }
2931 
2932 static int unix_stream_splice_actor(struct sk_buff *skb,
2933 				    int skip, int chunk,
2934 				    struct unix_stream_read_state *state)
2935 {
2936 	return skb_splice_bits(skb, state->socket->sk,
2937 			       UNIXCB(skb).consumed + skip,
2938 			       state->pipe, chunk, state->splice_flags);
2939 }
2940 
2941 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2942 				       struct pipe_inode_info *pipe,
2943 				       size_t size, unsigned int flags)
2944 {
2945 	struct unix_stream_read_state state = {
2946 		.recv_actor = unix_stream_splice_actor,
2947 		.socket = sock,
2948 		.pipe = pipe,
2949 		.size = size,
2950 		.splice_flags = flags,
2951 	};
2952 
2953 	if (unlikely(*ppos))
2954 		return -ESPIPE;
2955 
2956 	if (sock->file->f_flags & O_NONBLOCK ||
2957 	    flags & SPLICE_F_NONBLOCK)
2958 		state.flags = MSG_DONTWAIT;
2959 
2960 	return unix_stream_read_generic(&state, false);
2961 }
2962 
2963 static int unix_shutdown(struct socket *sock, int mode)
2964 {
2965 	struct sock *sk = sock->sk;
2966 	struct sock *other;
2967 
2968 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2969 		return -EINVAL;
2970 	/* This maps:
2971 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2972 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2973 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2974 	 */
2975 	++mode;
2976 
2977 	unix_state_lock(sk);
2978 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2979 	other = unix_peer(sk);
2980 	if (other)
2981 		sock_hold(other);
2982 	unix_state_unlock(sk);
2983 	sk->sk_state_change(sk);
2984 
2985 	if (other &&
2986 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2987 
2988 		int peer_mode = 0;
2989 		const struct proto *prot = READ_ONCE(other->sk_prot);
2990 
2991 		if (prot->unhash)
2992 			prot->unhash(other);
2993 		if (mode&RCV_SHUTDOWN)
2994 			peer_mode |= SEND_SHUTDOWN;
2995 		if (mode&SEND_SHUTDOWN)
2996 			peer_mode |= RCV_SHUTDOWN;
2997 		unix_state_lock(other);
2998 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2999 		unix_state_unlock(other);
3000 		other->sk_state_change(other);
3001 		if (peer_mode == SHUTDOWN_MASK)
3002 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3003 		else if (peer_mode & RCV_SHUTDOWN)
3004 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3005 	}
3006 	if (other)
3007 		sock_put(other);
3008 
3009 	return 0;
3010 }
3011 
3012 long unix_inq_len(struct sock *sk)
3013 {
3014 	struct sk_buff *skb;
3015 	long amount = 0;
3016 
3017 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3018 		return -EINVAL;
3019 
3020 	spin_lock(&sk->sk_receive_queue.lock);
3021 	if (sk->sk_type == SOCK_STREAM ||
3022 	    sk->sk_type == SOCK_SEQPACKET) {
3023 		skb_queue_walk(&sk->sk_receive_queue, skb)
3024 			amount += unix_skb_len(skb);
3025 	} else {
3026 		skb = skb_peek(&sk->sk_receive_queue);
3027 		if (skb)
3028 			amount = skb->len;
3029 	}
3030 	spin_unlock(&sk->sk_receive_queue.lock);
3031 
3032 	return amount;
3033 }
3034 EXPORT_SYMBOL_GPL(unix_inq_len);
3035 
3036 long unix_outq_len(struct sock *sk)
3037 {
3038 	return sk_wmem_alloc_get(sk);
3039 }
3040 EXPORT_SYMBOL_GPL(unix_outq_len);
3041 
3042 static int unix_open_file(struct sock *sk)
3043 {
3044 	struct path path;
3045 	struct file *f;
3046 	int fd;
3047 
3048 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3049 		return -EPERM;
3050 
3051 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3052 		return -ENOENT;
3053 
3054 	path = unix_sk(sk)->path;
3055 	if (!path.dentry)
3056 		return -ENOENT;
3057 
3058 	path_get(&path);
3059 
3060 	fd = get_unused_fd_flags(O_CLOEXEC);
3061 	if (fd < 0)
3062 		goto out;
3063 
3064 	f = dentry_open(&path, O_PATH, current_cred());
3065 	if (IS_ERR(f)) {
3066 		put_unused_fd(fd);
3067 		fd = PTR_ERR(f);
3068 		goto out;
3069 	}
3070 
3071 	fd_install(fd, f);
3072 out:
3073 	path_put(&path);
3074 
3075 	return fd;
3076 }
3077 
3078 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3079 {
3080 	struct sock *sk = sock->sk;
3081 	long amount = 0;
3082 	int err;
3083 
3084 	switch (cmd) {
3085 	case SIOCOUTQ:
3086 		amount = unix_outq_len(sk);
3087 		err = put_user(amount, (int __user *)arg);
3088 		break;
3089 	case SIOCINQ:
3090 		amount = unix_inq_len(sk);
3091 		if (amount < 0)
3092 			err = amount;
3093 		else
3094 			err = put_user(amount, (int __user *)arg);
3095 		break;
3096 	case SIOCUNIXFILE:
3097 		err = unix_open_file(sk);
3098 		break;
3099 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3100 	case SIOCATMARK:
3101 		{
3102 			struct sk_buff *skb;
3103 			int answ = 0;
3104 
3105 			skb = skb_peek(&sk->sk_receive_queue);
3106 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3107 				answ = 1;
3108 			err = put_user(answ, (int __user *)arg);
3109 		}
3110 		break;
3111 #endif
3112 	default:
3113 		err = -ENOIOCTLCMD;
3114 		break;
3115 	}
3116 	return err;
3117 }
3118 
3119 #ifdef CONFIG_COMPAT
3120 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3121 {
3122 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3123 }
3124 #endif
3125 
3126 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3127 {
3128 	struct sock *sk = sock->sk;
3129 	unsigned char state;
3130 	__poll_t mask;
3131 	u8 shutdown;
3132 
3133 	sock_poll_wait(file, sock, wait);
3134 	mask = 0;
3135 	shutdown = READ_ONCE(sk->sk_shutdown);
3136 	state = READ_ONCE(sk->sk_state);
3137 
3138 	/* exceptional events? */
3139 	if (READ_ONCE(sk->sk_err))
3140 		mask |= EPOLLERR;
3141 	if (shutdown == SHUTDOWN_MASK)
3142 		mask |= EPOLLHUP;
3143 	if (shutdown & RCV_SHUTDOWN)
3144 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3145 
3146 	/* readable? */
3147 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3148 		mask |= EPOLLIN | EPOLLRDNORM;
3149 	if (sk_is_readable(sk))
3150 		mask |= EPOLLIN | EPOLLRDNORM;
3151 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3152 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3153 		mask |= EPOLLPRI;
3154 #endif
3155 
3156 	/* Connection-based need to check for termination and startup */
3157 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3158 	    state == TCP_CLOSE)
3159 		mask |= EPOLLHUP;
3160 
3161 	/*
3162 	 * we set writable also when the other side has shut down the
3163 	 * connection. This prevents stuck sockets.
3164 	 */
3165 	if (unix_writable(sk, state))
3166 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3167 
3168 	return mask;
3169 }
3170 
3171 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3172 				    poll_table *wait)
3173 {
3174 	struct sock *sk = sock->sk, *other;
3175 	unsigned int writable;
3176 	unsigned char state;
3177 	__poll_t mask;
3178 	u8 shutdown;
3179 
3180 	sock_poll_wait(file, sock, wait);
3181 	mask = 0;
3182 	shutdown = READ_ONCE(sk->sk_shutdown);
3183 	state = READ_ONCE(sk->sk_state);
3184 
3185 	/* exceptional events? */
3186 	if (READ_ONCE(sk->sk_err) ||
3187 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3188 		mask |= EPOLLERR |
3189 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3190 
3191 	if (shutdown & RCV_SHUTDOWN)
3192 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3193 	if (shutdown == SHUTDOWN_MASK)
3194 		mask |= EPOLLHUP;
3195 
3196 	/* readable? */
3197 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3198 		mask |= EPOLLIN | EPOLLRDNORM;
3199 	if (sk_is_readable(sk))
3200 		mask |= EPOLLIN | EPOLLRDNORM;
3201 
3202 	/* Connection-based need to check for termination and startup */
3203 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3204 		mask |= EPOLLHUP;
3205 
3206 	/* No write status requested, avoid expensive OUT tests. */
3207 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3208 		return mask;
3209 
3210 	writable = unix_writable(sk, state);
3211 	if (writable) {
3212 		unix_state_lock(sk);
3213 
3214 		other = unix_peer(sk);
3215 		if (other && unix_peer(other) != sk &&
3216 		    unix_recvq_full_lockless(other) &&
3217 		    unix_dgram_peer_wake_me(sk, other))
3218 			writable = 0;
3219 
3220 		unix_state_unlock(sk);
3221 	}
3222 
3223 	if (writable)
3224 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3225 	else
3226 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3227 
3228 	return mask;
3229 }
3230 
3231 #ifdef CONFIG_PROC_FS
3232 
3233 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3234 
3235 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3236 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3237 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3238 
3239 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3240 {
3241 	unsigned long offset = get_offset(*pos);
3242 	unsigned long bucket = get_bucket(*pos);
3243 	unsigned long count = 0;
3244 	struct sock *sk;
3245 
3246 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3247 	     sk; sk = sk_next(sk)) {
3248 		if (++count == offset)
3249 			break;
3250 	}
3251 
3252 	return sk;
3253 }
3254 
3255 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3256 {
3257 	unsigned long bucket = get_bucket(*pos);
3258 	struct net *net = seq_file_net(seq);
3259 	struct sock *sk;
3260 
3261 	while (bucket < UNIX_HASH_SIZE) {
3262 		spin_lock(&net->unx.table.locks[bucket]);
3263 
3264 		sk = unix_from_bucket(seq, pos);
3265 		if (sk)
3266 			return sk;
3267 
3268 		spin_unlock(&net->unx.table.locks[bucket]);
3269 
3270 		*pos = set_bucket_offset(++bucket, 1);
3271 	}
3272 
3273 	return NULL;
3274 }
3275 
3276 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3277 				  loff_t *pos)
3278 {
3279 	unsigned long bucket = get_bucket(*pos);
3280 
3281 	sk = sk_next(sk);
3282 	if (sk)
3283 		return sk;
3284 
3285 
3286 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3287 
3288 	*pos = set_bucket_offset(++bucket, 1);
3289 
3290 	return unix_get_first(seq, pos);
3291 }
3292 
3293 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3294 {
3295 	if (!*pos)
3296 		return SEQ_START_TOKEN;
3297 
3298 	return unix_get_first(seq, pos);
3299 }
3300 
3301 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3302 {
3303 	++*pos;
3304 
3305 	if (v == SEQ_START_TOKEN)
3306 		return unix_get_first(seq, pos);
3307 
3308 	return unix_get_next(seq, v, pos);
3309 }
3310 
3311 static void unix_seq_stop(struct seq_file *seq, void *v)
3312 {
3313 	struct sock *sk = v;
3314 
3315 	if (sk)
3316 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3317 }
3318 
3319 static int unix_seq_show(struct seq_file *seq, void *v)
3320 {
3321 
3322 	if (v == SEQ_START_TOKEN)
3323 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3324 			 "Inode Path\n");
3325 	else {
3326 		struct sock *s = v;
3327 		struct unix_sock *u = unix_sk(s);
3328 		unix_state_lock(s);
3329 
3330 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3331 			s,
3332 			refcount_read(&s->sk_refcnt),
3333 			0,
3334 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3335 			s->sk_type,
3336 			s->sk_socket ?
3337 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3338 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3339 			sock_i_ino(s));
3340 
3341 		if (u->addr) {	// under a hash table lock here
3342 			int i, len;
3343 			seq_putc(seq, ' ');
3344 
3345 			i = 0;
3346 			len = u->addr->len -
3347 				offsetof(struct sockaddr_un, sun_path);
3348 			if (u->addr->name->sun_path[0]) {
3349 				len--;
3350 			} else {
3351 				seq_putc(seq, '@');
3352 				i++;
3353 			}
3354 			for ( ; i < len; i++)
3355 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3356 					 '@');
3357 		}
3358 		unix_state_unlock(s);
3359 		seq_putc(seq, '\n');
3360 	}
3361 
3362 	return 0;
3363 }
3364 
3365 static const struct seq_operations unix_seq_ops = {
3366 	.start  = unix_seq_start,
3367 	.next   = unix_seq_next,
3368 	.stop   = unix_seq_stop,
3369 	.show   = unix_seq_show,
3370 };
3371 
3372 #ifdef CONFIG_BPF_SYSCALL
3373 struct bpf_unix_iter_state {
3374 	struct seq_net_private p;
3375 	unsigned int cur_sk;
3376 	unsigned int end_sk;
3377 	unsigned int max_sk;
3378 	struct sock **batch;
3379 	bool st_bucket_done;
3380 };
3381 
3382 struct bpf_iter__unix {
3383 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3384 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3385 	uid_t uid __aligned(8);
3386 };
3387 
3388 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3389 			      struct unix_sock *unix_sk, uid_t uid)
3390 {
3391 	struct bpf_iter__unix ctx;
3392 
3393 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3394 	ctx.meta = meta;
3395 	ctx.unix_sk = unix_sk;
3396 	ctx.uid = uid;
3397 	return bpf_iter_run_prog(prog, &ctx);
3398 }
3399 
3400 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3401 
3402 {
3403 	struct bpf_unix_iter_state *iter = seq->private;
3404 	unsigned int expected = 1;
3405 	struct sock *sk;
3406 
3407 	sock_hold(start_sk);
3408 	iter->batch[iter->end_sk++] = start_sk;
3409 
3410 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3411 		if (iter->end_sk < iter->max_sk) {
3412 			sock_hold(sk);
3413 			iter->batch[iter->end_sk++] = sk;
3414 		}
3415 
3416 		expected++;
3417 	}
3418 
3419 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3420 
3421 	return expected;
3422 }
3423 
3424 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3425 {
3426 	while (iter->cur_sk < iter->end_sk)
3427 		sock_put(iter->batch[iter->cur_sk++]);
3428 }
3429 
3430 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3431 				       unsigned int new_batch_sz)
3432 {
3433 	struct sock **new_batch;
3434 
3435 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3436 			     GFP_USER | __GFP_NOWARN);
3437 	if (!new_batch)
3438 		return -ENOMEM;
3439 
3440 	bpf_iter_unix_put_batch(iter);
3441 	kvfree(iter->batch);
3442 	iter->batch = new_batch;
3443 	iter->max_sk = new_batch_sz;
3444 
3445 	return 0;
3446 }
3447 
3448 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3449 					loff_t *pos)
3450 {
3451 	struct bpf_unix_iter_state *iter = seq->private;
3452 	unsigned int expected;
3453 	bool resized = false;
3454 	struct sock *sk;
3455 
3456 	if (iter->st_bucket_done)
3457 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3458 
3459 again:
3460 	/* Get a new batch */
3461 	iter->cur_sk = 0;
3462 	iter->end_sk = 0;
3463 
3464 	sk = unix_get_first(seq, pos);
3465 	if (!sk)
3466 		return NULL; /* Done */
3467 
3468 	expected = bpf_iter_unix_hold_batch(seq, sk);
3469 
3470 	if (iter->end_sk == expected) {
3471 		iter->st_bucket_done = true;
3472 		return sk;
3473 	}
3474 
3475 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3476 		resized = true;
3477 		goto again;
3478 	}
3479 
3480 	return sk;
3481 }
3482 
3483 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3484 {
3485 	if (!*pos)
3486 		return SEQ_START_TOKEN;
3487 
3488 	/* bpf iter does not support lseek, so it always
3489 	 * continue from where it was stop()-ped.
3490 	 */
3491 	return bpf_iter_unix_batch(seq, pos);
3492 }
3493 
3494 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3495 {
3496 	struct bpf_unix_iter_state *iter = seq->private;
3497 	struct sock *sk;
3498 
3499 	/* Whenever seq_next() is called, the iter->cur_sk is
3500 	 * done with seq_show(), so advance to the next sk in
3501 	 * the batch.
3502 	 */
3503 	if (iter->cur_sk < iter->end_sk)
3504 		sock_put(iter->batch[iter->cur_sk++]);
3505 
3506 	++*pos;
3507 
3508 	if (iter->cur_sk < iter->end_sk)
3509 		sk = iter->batch[iter->cur_sk];
3510 	else
3511 		sk = bpf_iter_unix_batch(seq, pos);
3512 
3513 	return sk;
3514 }
3515 
3516 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3517 {
3518 	struct bpf_iter_meta meta;
3519 	struct bpf_prog *prog;
3520 	struct sock *sk = v;
3521 	uid_t uid;
3522 	bool slow;
3523 	int ret;
3524 
3525 	if (v == SEQ_START_TOKEN)
3526 		return 0;
3527 
3528 	slow = lock_sock_fast(sk);
3529 
3530 	if (unlikely(sk_unhashed(sk))) {
3531 		ret = SEQ_SKIP;
3532 		goto unlock;
3533 	}
3534 
3535 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3536 	meta.seq = seq;
3537 	prog = bpf_iter_get_info(&meta, false);
3538 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3539 unlock:
3540 	unlock_sock_fast(sk, slow);
3541 	return ret;
3542 }
3543 
3544 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3545 {
3546 	struct bpf_unix_iter_state *iter = seq->private;
3547 	struct bpf_iter_meta meta;
3548 	struct bpf_prog *prog;
3549 
3550 	if (!v) {
3551 		meta.seq = seq;
3552 		prog = bpf_iter_get_info(&meta, true);
3553 		if (prog)
3554 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3555 	}
3556 
3557 	if (iter->cur_sk < iter->end_sk)
3558 		bpf_iter_unix_put_batch(iter);
3559 }
3560 
3561 static const struct seq_operations bpf_iter_unix_seq_ops = {
3562 	.start	= bpf_iter_unix_seq_start,
3563 	.next	= bpf_iter_unix_seq_next,
3564 	.stop	= bpf_iter_unix_seq_stop,
3565 	.show	= bpf_iter_unix_seq_show,
3566 };
3567 #endif
3568 #endif
3569 
3570 static const struct net_proto_family unix_family_ops = {
3571 	.family = PF_UNIX,
3572 	.create = unix_create,
3573 	.owner	= THIS_MODULE,
3574 };
3575 
3576 
3577 static int __net_init unix_net_init(struct net *net)
3578 {
3579 	int i;
3580 
3581 	net->unx.sysctl_max_dgram_qlen = 10;
3582 	if (unix_sysctl_register(net))
3583 		goto out;
3584 
3585 #ifdef CONFIG_PROC_FS
3586 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3587 			     sizeof(struct seq_net_private)))
3588 		goto err_sysctl;
3589 #endif
3590 
3591 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3592 					      sizeof(spinlock_t), GFP_KERNEL);
3593 	if (!net->unx.table.locks)
3594 		goto err_proc;
3595 
3596 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3597 						sizeof(struct hlist_head),
3598 						GFP_KERNEL);
3599 	if (!net->unx.table.buckets)
3600 		goto free_locks;
3601 
3602 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3603 		spin_lock_init(&net->unx.table.locks[i]);
3604 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3605 	}
3606 
3607 	return 0;
3608 
3609 free_locks:
3610 	kvfree(net->unx.table.locks);
3611 err_proc:
3612 #ifdef CONFIG_PROC_FS
3613 	remove_proc_entry("unix", net->proc_net);
3614 err_sysctl:
3615 #endif
3616 	unix_sysctl_unregister(net);
3617 out:
3618 	return -ENOMEM;
3619 }
3620 
3621 static void __net_exit unix_net_exit(struct net *net)
3622 {
3623 	kvfree(net->unx.table.buckets);
3624 	kvfree(net->unx.table.locks);
3625 	unix_sysctl_unregister(net);
3626 	remove_proc_entry("unix", net->proc_net);
3627 }
3628 
3629 static struct pernet_operations unix_net_ops = {
3630 	.init = unix_net_init,
3631 	.exit = unix_net_exit,
3632 };
3633 
3634 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3635 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3636 		     struct unix_sock *unix_sk, uid_t uid)
3637 
3638 #define INIT_BATCH_SZ 16
3639 
3640 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3641 {
3642 	struct bpf_unix_iter_state *iter = priv_data;
3643 	int err;
3644 
3645 	err = bpf_iter_init_seq_net(priv_data, aux);
3646 	if (err)
3647 		return err;
3648 
3649 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3650 	if (err) {
3651 		bpf_iter_fini_seq_net(priv_data);
3652 		return err;
3653 	}
3654 
3655 	return 0;
3656 }
3657 
3658 static void bpf_iter_fini_unix(void *priv_data)
3659 {
3660 	struct bpf_unix_iter_state *iter = priv_data;
3661 
3662 	bpf_iter_fini_seq_net(priv_data);
3663 	kvfree(iter->batch);
3664 }
3665 
3666 static const struct bpf_iter_seq_info unix_seq_info = {
3667 	.seq_ops		= &bpf_iter_unix_seq_ops,
3668 	.init_seq_private	= bpf_iter_init_unix,
3669 	.fini_seq_private	= bpf_iter_fini_unix,
3670 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3671 };
3672 
3673 static const struct bpf_func_proto *
3674 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3675 			     const struct bpf_prog *prog)
3676 {
3677 	switch (func_id) {
3678 	case BPF_FUNC_setsockopt:
3679 		return &bpf_sk_setsockopt_proto;
3680 	case BPF_FUNC_getsockopt:
3681 		return &bpf_sk_getsockopt_proto;
3682 	default:
3683 		return NULL;
3684 	}
3685 }
3686 
3687 static struct bpf_iter_reg unix_reg_info = {
3688 	.target			= "unix",
3689 	.ctx_arg_info_size	= 1,
3690 	.ctx_arg_info		= {
3691 		{ offsetof(struct bpf_iter__unix, unix_sk),
3692 		  PTR_TO_BTF_ID_OR_NULL },
3693 	},
3694 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3695 	.seq_info		= &unix_seq_info,
3696 };
3697 
3698 static void __init bpf_iter_register(void)
3699 {
3700 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3701 	if (bpf_iter_reg_target(&unix_reg_info))
3702 		pr_warn("Warning: could not register bpf iterator unix\n");
3703 }
3704 #endif
3705 
3706 static int __init af_unix_init(void)
3707 {
3708 	int i, rc = -1;
3709 
3710 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3711 
3712 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3713 		spin_lock_init(&bsd_socket_locks[i]);
3714 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3715 	}
3716 
3717 	rc = proto_register(&unix_dgram_proto, 1);
3718 	if (rc != 0) {
3719 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3720 		goto out;
3721 	}
3722 
3723 	rc = proto_register(&unix_stream_proto, 1);
3724 	if (rc != 0) {
3725 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3726 		proto_unregister(&unix_dgram_proto);
3727 		goto out;
3728 	}
3729 
3730 	sock_register(&unix_family_ops);
3731 	register_pernet_subsys(&unix_net_ops);
3732 	unix_bpf_build_proto();
3733 
3734 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3735 	bpf_iter_register();
3736 #endif
3737 
3738 out:
3739 	return rc;
3740 }
3741 
3742 /* Later than subsys_initcall() because we depend on stuff initialised there */
3743 fs_initcall(af_unix_init);
3744