xref: /linux/net/unix/af_unix.c (revision 52338415)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko Eißfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid huge amounts
34  *					of sockets being hashed (this is for
35  *					unix_gc() performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skbs queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
54  *		mark and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for getsockname/getpeername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
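/* Editor's illustration (not part of the original file): the two name
 * forms described above, as seen from userspace.  A filesystem name is
 * a NUL-terminated path; an abstract name starts with a zero byte and
 * is delimited by the address length, so addrlen must cover exactly
 * the name bytes:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	strcpy(a.sun_path, "/tmp/sock");		// filesystem name
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	memcpy(a.sun_path, "\0mysock", 7);		// abstract name
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 7);
 */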
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/termios.h>
93 #include <linux/sockios.h>
94 #include <linux/net.h>
95 #include <linux/in.h>
96 #include <linux/fs.h>
97 #include <linux/slab.h>
98 #include <linux/uaccess.h>
99 #include <linux/skbuff.h>
100 #include <linux/netdevice.h>
101 #include <net/net_namespace.h>
102 #include <net/sock.h>
103 #include <net/tcp_states.h>
104 #include <net/af_unix.h>
105 #include <linux/proc_fs.h>
106 #include <linux/seq_file.h>
107 #include <net/scm.h>
108 #include <linux/init.h>
109 #include <linux/poll.h>
110 #include <linux/rtnetlink.h>
111 #include <linux/mount.h>
112 #include <net/checksum.h>
113 #include <linux/security.h>
114 #include <linux/freezer.h>
115 #include <linux/file.h>
116 
117 #include "scm.h"
118 
119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
120 EXPORT_SYMBOL_GPL(unix_socket_table);
121 DEFINE_SPINLOCK(unix_table_lock);
122 EXPORT_SYMBOL_GPL(unix_table_lock);
123 static atomic_long_t unix_nr_socks;
124 
125 
126 static struct hlist_head *unix_sockets_unbound(void *addr)
127 {
128 	unsigned long hash = (unsigned long)addr;
129 
130 	hash ^= hash >> 16;
131 	hash ^= hash >> 8;
132 	hash %= UNIX_HASH_SIZE;
133 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
134 }
135 
136 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
137 
138 #ifdef CONFIG_SECURITY_NETWORK
139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
140 {
141 	UNIXCB(skb).secid = scm->secid;
142 }
143 
144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
145 {
146 	scm->secid = UNIXCB(skb).secid;
147 }
148 
149 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
150 {
151 	return (scm->secid == UNIXCB(skb).secid);
152 }
153 #else
154 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
155 { }
156 
157 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
158 { }
159 
160 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
161 {
162 	return true;
163 }
164 #endif /* CONFIG_SECURITY_NETWORK */
165 
166 /*
167  *  SMP locking strategy:
168  *    hash table is protected with spinlock unix_table_lock
169  *    each socket state is protected by separate spin lock.
170  */
171 
172 static inline unsigned int unix_hash_fold(__wsum n)
173 {
174 	unsigned int hash = (__force unsigned int)csum_fold(n);
175 
176 	hash ^= hash>>8;
177 	return hash&(UNIX_HASH_SIZE-1);
178 }
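/* A minimal userspace mimic of the fold above (an illustrative sketch;
 * the in-kernel csum_fold() is arch-optimized, but the arithmetic is
 * the same).  It reduces a 32-bit ones'-complement sum of the name to
 * a bucket index in [0, UNIX_HASH_SIZE):
 *
 *	static unsigned int fold_to_bucket(unsigned int sum)
 *	{
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 32 -> 16 bits
 *		sum = (sum & 0xffff) + (sum >> 16);	// add the carry back in
 *		sum = ~sum & 0xffff;			// csum_fold() complement
 *		sum ^= sum >> 8;
 *		return sum & (UNIX_HASH_SIZE - 1);
 *	}
 */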
179 
180 #define unix_peer(sk) (unix_sk(sk)->peer)
181 
182 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
183 {
184 	return unix_peer(osk) == sk;
185 }
186 
187 static inline int unix_may_send(struct sock *sk, struct sock *osk)
188 {
189 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
190 }
191 
192 static inline int unix_recvq_full(struct sock const *sk)
193 {
194 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
195 }
196 
197 struct sock *unix_peer_get(struct sock *s)
198 {
199 	struct sock *peer;
200 
201 	unix_state_lock(s);
202 	peer = unix_peer(s);
203 	if (peer)
204 		sock_hold(peer);
205 	unix_state_unlock(s);
206 	return peer;
207 }
208 EXPORT_SYMBOL_GPL(unix_peer_get);
209 
210 static inline void unix_release_addr(struct unix_address *addr)
211 {
212 	if (refcount_dec_and_test(&addr->refcnt))
213 		kfree(addr);
214 }
215 
216 /*
217  *	Check unix socket name:
218  *		- it should not be zero length.
219  *	        - if it does not start with zero, it should be NUL terminated (FS object)
220  *		- if it starts with zero, it is an abstract name.
221  */
222 
223 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
224 {
225 	*hashp = 0;
226 
227 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
228 		return -EINVAL;
229 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
230 		return -EINVAL;
231 	if (sunaddr->sun_path[0]) {
232 		/*
233 		 * This may look like an off by one error but it is a bit more
234 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
235 		 * sun_path[108] doesn't as such exist.  However in kernel space
236 		 * we are guaranteed that it is a valid memory location in our
237 		 * kernel address buffer.
238 		 */
239 		((char *)sunaddr)[len] = 0;
240 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
241 		return len;
242 	}
243 
244 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
245 	return len;
246 }
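/* Worked example (editor's note): for the abstract name "\0foo" passed
 * with len = sizeof(short) + 4, unix_mkname() leaves len as-is and
 * computes *hashp over all six bytes.  For the path "/tmp/x" it writes
 * a terminating NUL, recomputes len as strlen("/tmp/x") + 1 +
 * sizeof(short) = 9, and leaves *hashp at 0 - pathname sockets are
 * hashed by inode number at bind time instead.
 */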
247 
248 static void __unix_remove_socket(struct sock *sk)
249 {
250 	sk_del_node_init(sk);
251 }
252 
253 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
254 {
255 	WARN_ON(!sk_unhashed(sk));
256 	sk_add_node(sk, list);
257 }
258 
259 static inline void unix_remove_socket(struct sock *sk)
260 {
261 	spin_lock(&unix_table_lock);
262 	__unix_remove_socket(sk);
263 	spin_unlock(&unix_table_lock);
264 }
265 
266 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
267 {
268 	spin_lock(&unix_table_lock);
269 	__unix_insert_socket(list, sk);
270 	spin_unlock(&unix_table_lock);
271 }
272 
273 static struct sock *__unix_find_socket_byname(struct net *net,
274 					      struct sockaddr_un *sunname,
275 					      int len, int type, unsigned int hash)
276 {
277 	struct sock *s;
278 
279 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
280 		struct unix_sock *u = unix_sk(s);
281 
282 		if (!net_eq(sock_net(s), net))
283 			continue;
284 
285 		if (u->addr->len == len &&
286 		    !memcmp(u->addr->name, sunname, len))
287 			goto found;
288 	}
289 	s = NULL;
290 found:
291 	return s;
292 }
293 
294 static inline struct sock *unix_find_socket_byname(struct net *net,
295 						   struct sockaddr_un *sunname,
296 						   int len, int type,
297 						   unsigned int hash)
298 {
299 	struct sock *s;
300 
301 	spin_lock(&unix_table_lock);
302 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
303 	if (s)
304 		sock_hold(s);
305 	spin_unlock(&unix_table_lock);
306 	return s;
307 }
308 
309 static struct sock *unix_find_socket_byinode(struct inode *i)
310 {
311 	struct sock *s;
312 
313 	spin_lock(&unix_table_lock);
314 	sk_for_each(s,
315 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
316 		struct dentry *dentry = unix_sk(s)->path.dentry;
317 
318 		if (dentry && d_backing_inode(dentry) == i) {
319 			sock_hold(s);
320 			goto found;
321 		}
322 	}
323 	s = NULL;
324 found:
325 	spin_unlock(&unix_table_lock);
326 	return s;
327 }
328 
329 /* Support code for asymmetrically connected dgram sockets
330  *
331  * If a datagram socket is connected to a socket that is not itself
332  * connected back to the first socket (e.g. /dev/log), clients may only
333  * enqueue more messages if the present receive queue of the server
334  * socket is not "too large". This means there's a second writeability
335  * condition that poll and sendmsg need to test. Upon reception of a
336  * datagram, the dgram recv code does a wake up on the peer_wait wait
337  * queue of the socket, which needs to be propagated to sleeping
338  * would-be writers, since these might not have sent anything so far.
339  * This can't be accomplished via poll_wait because the lifetime of the
340  * server socket might be less than that of its clients if these break
341  * their association with it or if the server socket is closed while
342  * clients are still connected to it, and there's no way to inform "a
343  * polling implementation" that it should let go of a certain wait queue.
344  *
345  * In order to propagate a wake up, a wait_queue_entry_t of the client
346  * socket is enqueued on the peer_wait queue of the server socket,
347  * whose wake function does a wake_up on the ordinary client socket
348  * wait queue. This connection is established whenever a write (or a
349  * poll for write) hits the flow control condition, and broken when the
350  * association to the server socket is dissolved or after a wake up
351  * was relayed.
352  */
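/* Userspace view of the condition described above (an illustrative
 * sketch, not part of this file): a client connected to a /dev/log
 * style dgram server can see EAGAIN even though its own send buffer is
 * empty, and then sleeps in poll() until the peer_wait relay wakes it:
 *
 *	if (send(fd, buf, n, MSG_DONTWAIT) < 0 && errno == EAGAIN) {
 *		struct pollfd p = { .fd = fd, .events = POLLOUT };
 *		poll(&p, 1, -1);	// woken when the server drains
 *	}
 */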
353 
354 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
355 				      void *key)
356 {
357 	struct unix_sock *u;
358 	wait_queue_head_t *u_sleep;
359 
360 	u = container_of(q, struct unix_sock, peer_wake);
361 
362 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
363 			    q);
364 	u->peer_wake.private = NULL;
365 
366 	/* relaying can only happen while the wq still exists */
367 	u_sleep = sk_sleep(&u->sk);
368 	if (u_sleep)
369 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
370 
371 	return 0;
372 }
373 
374 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
375 {
376 	struct unix_sock *u, *u_other;
377 	int rc;
378 
379 	u = unix_sk(sk);
380 	u_other = unix_sk(other);
381 	rc = 0;
382 	spin_lock(&u_other->peer_wait.lock);
383 
384 	if (!u->peer_wake.private) {
385 		u->peer_wake.private = other;
386 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
387 
388 		rc = 1;
389 	}
390 
391 	spin_unlock(&u_other->peer_wait.lock);
392 	return rc;
393 }
394 
395 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
396 					    struct sock *other)
397 {
398 	struct unix_sock *u, *u_other;
399 
400 	u = unix_sk(sk);
401 	u_other = unix_sk(other);
402 	spin_lock(&u_other->peer_wait.lock);
403 
404 	if (u->peer_wake.private == other) {
405 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
406 		u->peer_wake.private = NULL;
407 	}
408 
409 	spin_unlock(&u_other->peer_wait.lock);
410 }
411 
412 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
413 						   struct sock *other)
414 {
415 	unix_dgram_peer_wake_disconnect(sk, other);
416 	wake_up_interruptible_poll(sk_sleep(sk),
417 				   EPOLLOUT |
418 				   EPOLLWRNORM |
419 				   EPOLLWRBAND);
420 }
421 
422 /* preconditions:
423  *	- unix_peer(sk) == other
424  *	- association is stable
425  */
426 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
427 {
428 	int connected;
429 
430 	connected = unix_dgram_peer_wake_connect(sk, other);
431 
432 	/* If other is SOCK_DEAD, we want to make sure we signal
433 	 * POLLOUT, such that a subsequent write() can get a
434 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
435 	 * to other and it's full, we will hang waiting for POLLOUT.
436 	 */
437 	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
438 		return 1;
439 
440 	if (connected)
441 		unix_dgram_peer_wake_disconnect(sk, other);
442 
443 	return 0;
444 }
445 
446 static int unix_writable(const struct sock *sk)
447 {
448 	return sk->sk_state != TCP_LISTEN &&
449 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
450 }
451 
452 static void unix_write_space(struct sock *sk)
453 {
454 	struct socket_wq *wq;
455 
456 	rcu_read_lock();
457 	if (unix_writable(sk)) {
458 		wq = rcu_dereference(sk->sk_wq);
459 		if (skwq_has_sleeper(wq))
460 			wake_up_interruptible_sync_poll(&wq->wait,
461 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
462 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
463 	}
464 	rcu_read_unlock();
465 }
466 
467 /* When a dgram socket disconnects (or changes its peer), we clear its receive
468  * queue of packets that arrived from the previous peer. First, this allows us
469  * to do flow control based only on wmem_alloc; second, an sk connected to a
470  * peer may receive messages only from that peer. */
471 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
472 {
473 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
474 		skb_queue_purge(&sk->sk_receive_queue);
475 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
476 
477 		/* If one link of a bidirectional dgram pipe is disconnected,
478 		 * we signal an error: messages are lost. Do not do this when
479 		 * the peer was not connected to us.
480 		 */
481 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
482 			other->sk_err = ECONNRESET;
483 			other->sk_error_report(other);
484 		}
485 	}
486 }
487 
488 static void unix_sock_destructor(struct sock *sk)
489 {
490 	struct unix_sock *u = unix_sk(sk);
491 
492 	skb_queue_purge(&sk->sk_receive_queue);
493 
494 	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
495 	WARN_ON(!sk_unhashed(sk));
496 	WARN_ON(sk->sk_socket);
497 	if (!sock_flag(sk, SOCK_DEAD)) {
498 		pr_info("Attempt to release alive unix socket: %p\n", sk);
499 		return;
500 	}
501 
502 	if (u->addr)
503 		unix_release_addr(u->addr);
504 
505 	atomic_long_dec(&unix_nr_socks);
506 	local_bh_disable();
507 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
508 	local_bh_enable();
509 #ifdef UNIX_REFCNT_DEBUG
510 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
511 		atomic_long_read(&unix_nr_socks));
512 #endif
513 }
514 
515 static void unix_release_sock(struct sock *sk, int embrion)
516 {
517 	struct unix_sock *u = unix_sk(sk);
518 	struct path path;
519 	struct sock *skpair;
520 	struct sk_buff *skb;
521 	int state;
522 
523 	unix_remove_socket(sk);
524 
525 	/* Clear state */
526 	unix_state_lock(sk);
527 	sock_orphan(sk);
528 	sk->sk_shutdown = SHUTDOWN_MASK;
529 	path	     = u->path;
530 	u->path.dentry = NULL;
531 	u->path.mnt = NULL;
532 	state = sk->sk_state;
533 	sk->sk_state = TCP_CLOSE;
534 	unix_state_unlock(sk);
535 
536 	wake_up_interruptible_all(&u->peer_wait);
537 
538 	skpair = unix_peer(sk);
539 
540 	if (skpair != NULL) {
541 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
542 			unix_state_lock(skpair);
543 			/* No more writes */
544 			skpair->sk_shutdown = SHUTDOWN_MASK;
545 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
546 				skpair->sk_err = ECONNRESET;
547 			unix_state_unlock(skpair);
548 			skpair->sk_state_change(skpair);
549 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
550 		}
551 
552 		unix_dgram_peer_wake_disconnect(sk, skpair);
553 		sock_put(skpair); /* It may now die */
554 		unix_peer(sk) = NULL;
555 	}
556 
557 	/* Try to flush out this socket. Throw out buffers at least */
558 
559 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
560 		if (state == TCP_LISTEN)
561 			unix_release_sock(skb->sk, 1);
562 		/* passed fds are erased in the kfree_skb hook	      */
563 		UNIXCB(skb).consumed = skb->len;
564 		kfree_skb(skb);
565 	}
566 
567 	if (path.dentry)
568 		path_put(&path);
569 
570 	sock_put(sk);
571 
572 	/* ---- Socket is dead now and most probably destroyed ---- */
573 
574 	/*
575 	 * Fixme: BSD difference: In BSD all sockets connected to us get
576 	 *	  ECONNRESET and we die on the spot. In Linux we behave
577 	 *	  like files and pipes do and wait for the last
578 	 *	  dereference.
579 	 *
580 	 * Can't we simply set sock->err?
581 	 *
582 	 *	  What does the above comment talk about? --ANK(980817)
583 	 */
584 
585 	if (unix_tot_inflight)
586 		unix_gc();		/* Garbage collect fds */
587 }
588 
589 static void init_peercred(struct sock *sk)
590 {
591 	put_pid(sk->sk_peer_pid);
592 	if (sk->sk_peer_cred)
593 		put_cred(sk->sk_peer_cred);
594 	sk->sk_peer_pid  = get_pid(task_tgid(current));
595 	sk->sk_peer_cred = get_current_cred();
596 }
597 
598 static void copy_peercred(struct sock *sk, struct sock *peersk)
599 {
600 	put_pid(sk->sk_peer_pid);
601 	if (sk->sk_peer_cred)
602 		put_cred(sk->sk_peer_cred);
603 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
604 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
605 }
606 
607 static int unix_listen(struct socket *sock, int backlog)
608 {
609 	int err;
610 	struct sock *sk = sock->sk;
611 	struct unix_sock *u = unix_sk(sk);
612 	struct pid *old_pid = NULL;
613 
614 	err = -EOPNOTSUPP;
615 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
616 		goto out;	/* Only stream/seqpacket sockets accept */
617 	err = -EINVAL;
618 	if (!u->addr)
619 		goto out;	/* No listens on an unbound socket */
620 	unix_state_lock(sk);
621 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
622 		goto out_unlock;
623 	if (backlog > sk->sk_max_ack_backlog)
624 		wake_up_interruptible_all(&u->peer_wait);
625 	sk->sk_max_ack_backlog	= backlog;
626 	sk->sk_state		= TCP_LISTEN;
627 	/* set credentials so connect can copy them */
628 	init_peercred(sk);
629 	err = 0;
630 
631 out_unlock:
632 	unix_state_unlock(sk);
633 	put_pid(old_pid);
634 out:
635 	return err;
636 }
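/* Userspace consequence (editor's sketch): once the listener's receive
 * queue exceeds the backlog, a blocking connect(2) sleeps in
 * unix_wait_for_peer() instead of failing, and a non-blocking one
 * returns EAGAIN (not the EINPROGRESS TCP would give):
 *
 *	listen(srv, 1);
 *	connect(c1, ...);			// queued, awaiting accept()
 *	connect(c2, ...);			// queued, backlog now full
 *	fcntl(c3, F_SETFL, O_NONBLOCK);
 *	connect(c3, ...);			// fails with EAGAIN
 *
 * Note the wake_up above: re-calling listen() with a larger backlog
 * releases connectors currently blocked on a full queue.
 */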
637 
638 static int unix_release(struct socket *);
639 static int unix_bind(struct socket *, struct sockaddr *, int);
640 static int unix_stream_connect(struct socket *, struct sockaddr *,
641 			       int addr_len, int flags);
642 static int unix_socketpair(struct socket *, struct socket *);
643 static int unix_accept(struct socket *, struct socket *, int, bool);
644 static int unix_getname(struct socket *, struct sockaddr *, int);
645 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
646 static __poll_t unix_dgram_poll(struct file *, struct socket *,
647 				    poll_table *);
648 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
649 static int unix_shutdown(struct socket *, int);
650 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
651 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
652 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
653 				    size_t size, int flags);
654 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
655 				       struct pipe_inode_info *, size_t size,
656 				       unsigned int flags);
657 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
658 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
659 static int unix_dgram_connect(struct socket *, struct sockaddr *,
660 			      int, int);
661 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
662 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
663 				  int);
664 
665 static int unix_set_peek_off(struct sock *sk, int val)
666 {
667 	struct unix_sock *u = unix_sk(sk);
668 
669 	if (mutex_lock_interruptible(&u->iolock))
670 		return -EINTR;
671 
672 	sk->sk_peek_off = val;
673 	mutex_unlock(&u->iolock);
674 
675 	return 0;
676 }
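/* Userspace example for the hook above (illustrative): with a
 * non-negative SO_PEEK_OFF, successive MSG_PEEK reads advance through
 * the queued data instead of re-reading from the start:
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31
 */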
677 
678 
679 static const struct proto_ops unix_stream_ops = {
680 	.family =	PF_UNIX,
681 	.owner =	THIS_MODULE,
682 	.release =	unix_release,
683 	.bind =		unix_bind,
684 	.connect =	unix_stream_connect,
685 	.socketpair =	unix_socketpair,
686 	.accept =	unix_accept,
687 	.getname =	unix_getname,
688 	.poll =		unix_poll,
689 	.ioctl =	unix_ioctl,
690 	.listen =	unix_listen,
691 	.shutdown =	unix_shutdown,
692 	.setsockopt =	sock_no_setsockopt,
693 	.getsockopt =	sock_no_getsockopt,
694 	.sendmsg =	unix_stream_sendmsg,
695 	.recvmsg =	unix_stream_recvmsg,
696 	.mmap =		sock_no_mmap,
697 	.sendpage =	unix_stream_sendpage,
698 	.splice_read =	unix_stream_splice_read,
699 	.set_peek_off =	unix_set_peek_off,
700 };
701 
702 static const struct proto_ops unix_dgram_ops = {
703 	.family =	PF_UNIX,
704 	.owner =	THIS_MODULE,
705 	.release =	unix_release,
706 	.bind =		unix_bind,
707 	.connect =	unix_dgram_connect,
708 	.socketpair =	unix_socketpair,
709 	.accept =	sock_no_accept,
710 	.getname =	unix_getname,
711 	.poll =		unix_dgram_poll,
712 	.ioctl =	unix_ioctl,
713 	.listen =	sock_no_listen,
714 	.shutdown =	unix_shutdown,
715 	.setsockopt =	sock_no_setsockopt,
716 	.getsockopt =	sock_no_getsockopt,
717 	.sendmsg =	unix_dgram_sendmsg,
718 	.recvmsg =	unix_dgram_recvmsg,
719 	.mmap =		sock_no_mmap,
720 	.sendpage =	sock_no_sendpage,
721 	.set_peek_off =	unix_set_peek_off,
722 };
723 
724 static const struct proto_ops unix_seqpacket_ops = {
725 	.family =	PF_UNIX,
726 	.owner =	THIS_MODULE,
727 	.release =	unix_release,
728 	.bind =		unix_bind,
729 	.connect =	unix_stream_connect,
730 	.socketpair =	unix_socketpair,
731 	.accept =	unix_accept,
732 	.getname =	unix_getname,
733 	.poll =		unix_dgram_poll,
734 	.ioctl =	unix_ioctl,
735 	.listen =	unix_listen,
736 	.shutdown =	unix_shutdown,
737 	.setsockopt =	sock_no_setsockopt,
738 	.getsockopt =	sock_no_getsockopt,
739 	.sendmsg =	unix_seqpacket_sendmsg,
740 	.recvmsg =	unix_seqpacket_recvmsg,
741 	.mmap =		sock_no_mmap,
742 	.sendpage =	sock_no_sendpage,
743 	.set_peek_off =	unix_set_peek_off,
744 };
745 
746 static struct proto unix_proto = {
747 	.name			= "UNIX",
748 	.owner			= THIS_MODULE,
749 	.obj_size		= sizeof(struct unix_sock),
750 };
751 
752 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
753 {
754 	struct sock *sk = NULL;
755 	struct unix_sock *u;
756 
757 	atomic_long_inc(&unix_nr_socks);
758 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
759 		goto out;
760 
761 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
762 	if (!sk)
763 		goto out;
764 
765 	sock_init_data(sock, sk);
766 
767 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
768 	sk->sk_write_space	= unix_write_space;
769 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
770 	sk->sk_destruct		= unix_sock_destructor;
771 	u	  = unix_sk(sk);
772 	u->path.dentry = NULL;
773 	u->path.mnt = NULL;
774 	spin_lock_init(&u->lock);
775 	atomic_long_set(&u->inflight, 0);
776 	INIT_LIST_HEAD(&u->link);
777 	mutex_init(&u->iolock); /* single task reading lock */
778 	mutex_init(&u->bindlock); /* single task binding lock */
779 	init_waitqueue_head(&u->peer_wait);
780 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
781 	unix_insert_socket(unix_sockets_unbound(sk), sk);
782 out:
783 	if (sk == NULL)
784 		atomic_long_dec(&unix_nr_socks);
785 	else {
786 		local_bh_disable();
787 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
788 		local_bh_enable();
789 	}
790 	return sk;
791 }
792 
793 static int unix_create(struct net *net, struct socket *sock, int protocol,
794 		       int kern)
795 {
796 	if (protocol && protocol != PF_UNIX)
797 		return -EPROTONOSUPPORT;
798 
799 	sock->state = SS_UNCONNECTED;
800 
801 	switch (sock->type) {
802 	case SOCK_STREAM:
803 		sock->ops = &unix_stream_ops;
804 		break;
805 		/*
806 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
807 		 *	nothing uses it.
808 		 */
809 	case SOCK_RAW:
810 		sock->type = SOCK_DGRAM;
811 		/* fall through */
812 	case SOCK_DGRAM:
813 		sock->ops = &unix_dgram_ops;
814 		break;
815 	case SOCK_SEQPACKET:
816 		sock->ops = &unix_seqpacket_ops;
817 		break;
818 	default:
819 		return -ESOCKTNOSUPPORT;
820 	}
821 
822 	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
823 }
824 
825 static int unix_release(struct socket *sock)
826 {
827 	struct sock *sk = sock->sk;
828 
829 	if (!sk)
830 		return 0;
831 
832 	unix_release_sock(sk, 0);
833 	sock->sk = NULL;
834 
835 	return 0;
836 }
837 
838 static int unix_autobind(struct socket *sock)
839 {
840 	struct sock *sk = sock->sk;
841 	struct net *net = sock_net(sk);
842 	struct unix_sock *u = unix_sk(sk);
843 	static u32 ordernum = 1;
844 	struct unix_address *addr;
845 	int err;
846 	unsigned int retries = 0;
847 
848 	err = mutex_lock_interruptible(&u->bindlock);
849 	if (err)
850 		return err;
851 
852 	err = 0;
853 	if (u->addr)
854 		goto out;
855 
856 	err = -ENOMEM;
857 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
858 	if (!addr)
859 		goto out;
860 
861 	addr->name->sun_family = AF_UNIX;
862 	refcount_set(&addr->refcnt, 1);
863 
864 retry:
865 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
866 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
867 
868 	spin_lock(&unix_table_lock);
869 	ordernum = (ordernum+1)&0xFFFFF;
870 
871 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
872 				      addr->hash)) {
873 		spin_unlock(&unix_table_lock);
874 		/*
875 		 * __unix_find_socket_byname() may take a long time if many names
876 		 * are already in use.
877 		 */
878 		cond_resched();
879 		/* Give up if all names seem to be in use. */
880 		if (retries++ == 0xFFFFF) {
881 			err = -ENOSPC;
882 			kfree(addr);
883 			goto out;
884 		}
885 		goto retry;
886 	}
887 	addr->hash ^= sk->sk_type;
888 
889 	__unix_remove_socket(sk);
890 	smp_store_release(&u->addr, addr);
891 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
892 	spin_unlock(&unix_table_lock);
893 	err = 0;
894 
895 out:	mutex_unlock(&u->bindlock);
896 	return err;
897 }
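/* What autobind looks like from userspace (editor's sketch): binding
 * with only the address family makes the kernel pick an abstract name
 * of five hex digits, per the sprintf() above:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	socklen_t alen = sizeof(sa_family_t);
 *
 *	bind(fd, (struct sockaddr *)&a, alen);	// alen == sizeof(short)
 *	alen = sizeof(a);
 *	getsockname(fd, (struct sockaddr *)&a, &alen);
 *	// a.sun_path now holds a zero byte plus e.g. "0002a"
 */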
898 
899 static struct sock *unix_find_other(struct net *net,
900 				    struct sockaddr_un *sunname, int len,
901 				    int type, unsigned int hash, int *error)
902 {
903 	struct sock *u;
904 	struct path path;
905 	int err = 0;
906 
907 	if (sunname->sun_path[0]) {
908 		struct inode *inode;
909 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
910 		if (err)
911 			goto fail;
912 		inode = d_backing_inode(path.dentry);
913 		err = inode_permission(inode, MAY_WRITE);
914 		if (err)
915 			goto put_fail;
916 
917 		err = -ECONNREFUSED;
918 		if (!S_ISSOCK(inode->i_mode))
919 			goto put_fail;
920 		u = unix_find_socket_byinode(inode);
921 		if (!u)
922 			goto put_fail;
923 
924 		if (u->sk_type == type)
925 			touch_atime(&path);
926 
927 		path_put(&path);
928 
929 		err = -EPROTOTYPE;
930 		if (u->sk_type != type) {
931 			sock_put(u);
932 			goto fail;
933 		}
934 	} else {
935 		err = -ECONNREFUSED;
936 		u = unix_find_socket_byname(net, sunname, len, type, hash);
937 		if (u) {
938 			struct dentry *dentry;
939 			dentry = unix_sk(u)->path.dentry;
940 			if (dentry)
941 				touch_atime(&unix_sk(u)->path);
942 		} else
943 			goto fail;
944 	}
945 	return u;
946 
947 put_fail:
948 	path_put(&path);
949 fail:
950 	*error = err;
951 	return NULL;
952 }
953 
954 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
955 {
956 	struct dentry *dentry;
957 	struct path path;
958 	int err = 0;
959 	/*
960 	 * Get the parent directory, calculate the hash for last
961 	 * component.
962 	 */
963 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
964 	err = PTR_ERR(dentry);
965 	if (IS_ERR(dentry))
966 		return err;
967 
968 	/*
969 	 * All right, let's create it.
970 	 */
971 	err = security_path_mknod(&path, dentry, mode, 0);
972 	if (!err) {
973 		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
974 		if (!err) {
975 			res->mnt = mntget(path.mnt);
976 			res->dentry = dget(dentry);
977 		}
978 	}
979 	done_path_create(&path, dentry);
980 	return err;
981 }
982 
983 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
984 {
985 	struct sock *sk = sock->sk;
986 	struct net *net = sock_net(sk);
987 	struct unix_sock *u = unix_sk(sk);
988 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
989 	char *sun_path = sunaddr->sun_path;
990 	int err;
991 	unsigned int hash;
992 	struct unix_address *addr;
993 	struct hlist_head *list;
994 	struct path path = { };
995 
996 	err = -EINVAL;
997 	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
998 	    sunaddr->sun_family != AF_UNIX)
999 		goto out;
1000 
1001 	if (addr_len == sizeof(short)) {
1002 		err = unix_autobind(sock);
1003 		goto out;
1004 	}
1005 
1006 	err = unix_mkname(sunaddr, addr_len, &hash);
1007 	if (err < 0)
1008 		goto out;
1009 	addr_len = err;
1010 
1011 	if (sun_path[0]) {
1012 		umode_t mode = S_IFSOCK |
1013 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
1014 		err = unix_mknod(sun_path, mode, &path);
1015 		if (err) {
1016 			if (err == -EEXIST)
1017 				err = -EADDRINUSE;
1018 			goto out;
1019 		}
1020 	}
1021 
1022 	err = mutex_lock_interruptible(&u->bindlock);
1023 	if (err)
1024 		goto out_put;
1025 
1026 	err = -EINVAL;
1027 	if (u->addr)
1028 		goto out_up;
1029 
1030 	err = -ENOMEM;
1031 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1032 	if (!addr)
1033 		goto out_up;
1034 
1035 	memcpy(addr->name, sunaddr, addr_len);
1036 	addr->len = addr_len;
1037 	addr->hash = hash ^ sk->sk_type;
1038 	refcount_set(&addr->refcnt, 1);
1039 
1040 	if (sun_path[0]) {
1041 		addr->hash = UNIX_HASH_SIZE;
1042 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1043 		spin_lock(&unix_table_lock);
1044 		u->path = path;
1045 		list = &unix_socket_table[hash];
1046 	} else {
1047 		spin_lock(&unix_table_lock);
1048 		err = -EADDRINUSE;
1049 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
1050 					      sk->sk_type, hash)) {
1051 			unix_release_addr(addr);
1052 			goto out_unlock;
1053 		}
1054 
1055 		list = &unix_socket_table[addr->hash];
1056 	}
1057 
1058 	err = 0;
1059 	__unix_remove_socket(sk);
1060 	smp_store_release(&u->addr, addr);
1061 	__unix_insert_socket(list, sk);
1062 
1063 out_unlock:
1064 	spin_unlock(&unix_table_lock);
1065 out_up:
1066 	mutex_unlock(&u->bindlock);
1067 out_put:
1068 	if (err)
1069 		path_put(&path);
1070 out:
1071 	return err;
1072 }
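/* Typical userspace idiom for the pathname case above (editor's
 * sketch): bind() creates the socket inode but nothing ever removes
 * it, so servers unlink any stale node first; EEXIST is reported as
 * EADDRINUSE:
 *
 *	unlink("/tmp/srv.sock");		// clear a stale node, if any
 *	strcpy(a.sun_path, "/tmp/srv.sock");
 *	if (bind(fd, (struct sockaddr *)&a, sizeof(a)) < 0 &&
 *	    errno == EADDRINUSE)
 *		;				// the path still exists
 */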
1073 
1074 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1075 {
1076 	if (unlikely(sk1 == sk2) || !sk2) {
1077 		unix_state_lock(sk1);
1078 		return;
1079 	}
1080 	if (sk1 < sk2) {
1081 		unix_state_lock(sk1);
1082 		unix_state_lock_nested(sk2);
1083 	} else {
1084 		unix_state_lock(sk2);
1085 		unix_state_lock_nested(sk1);
1086 	}
1087 }
1088 
1089 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1090 {
1091 	if (unlikely(sk1 == sk2) || !sk2) {
1092 		unix_state_unlock(sk1);
1093 		return;
1094 	}
1095 	unix_state_unlock(sk1);
1096 	unix_state_unlock(sk2);
1097 }
1098 
1099 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1100 			      int alen, int flags)
1101 {
1102 	struct sock *sk = sock->sk;
1103 	struct net *net = sock_net(sk);
1104 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1105 	struct sock *other;
1106 	unsigned int hash;
1107 	int err;
1108 
1109 	err = -EINVAL;
1110 	if (alen < offsetofend(struct sockaddr, sa_family))
1111 		goto out;
1112 
1113 	if (addr->sa_family != AF_UNSPEC) {
1114 		err = unix_mkname(sunaddr, alen, &hash);
1115 		if (err < 0)
1116 			goto out;
1117 		alen = err;
1118 
1119 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1120 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1121 			goto out;
1122 
1123 restart:
1124 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1125 		if (!other)
1126 			goto out;
1127 
1128 		unix_state_double_lock(sk, other);
1129 
1130 		/* Apparently VFS overslept socket death. Retry. */
1131 		if (sock_flag(other, SOCK_DEAD)) {
1132 			unix_state_double_unlock(sk, other);
1133 			sock_put(other);
1134 			goto restart;
1135 		}
1136 
1137 		err = -EPERM;
1138 		if (!unix_may_send(sk, other))
1139 			goto out_unlock;
1140 
1141 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1142 		if (err)
1143 			goto out_unlock;
1144 
1145 	} else {
1146 		/*
1147 		 *	1003.1g breaking connected state with AF_UNSPEC
1148 		 */
1149 		other = NULL;
1150 		unix_state_double_lock(sk, other);
1151 	}
1152 
1153 	/*
1154 	 * If it was connected, reconnect.
1155 	 */
1156 	if (unix_peer(sk)) {
1157 		struct sock *old_peer = unix_peer(sk);
1158 		unix_peer(sk) = other;
1159 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1160 
1161 		unix_state_double_unlock(sk, other);
1162 
1163 		if (other != old_peer)
1164 			unix_dgram_disconnected(sk, old_peer);
1165 		sock_put(old_peer);
1166 	} else {
1167 		unix_peer(sk) = other;
1168 		unix_state_double_unlock(sk, other);
1169 	}
1170 	return 0;
1171 
1172 out_unlock:
1173 	unix_state_double_unlock(sk, other);
1174 	sock_put(other);
1175 out:
1176 	return err;
1177 }
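/* Illustrative userspace view of the AF_UNSPEC branch above:
 * connecting a datagram socket to an AF_UNSPEC address dissolves the
 * current association (1003.1g), after which unaddressed sends fail:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));	// drop the peer
 *	send(fd, buf, n, 0);		// now fails with ENOTCONN
 */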
1178 
1179 static long unix_wait_for_peer(struct sock *other, long timeo)
1180 {
1181 	struct unix_sock *u = unix_sk(other);
1182 	int sched;
1183 	DEFINE_WAIT(wait);
1184 
1185 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1186 
1187 	sched = !sock_flag(other, SOCK_DEAD) &&
1188 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1189 		unix_recvq_full(other);
1190 
1191 	unix_state_unlock(other);
1192 
1193 	if (sched)
1194 		timeo = schedule_timeout(timeo);
1195 
1196 	finish_wait(&u->peer_wait, &wait);
1197 	return timeo;
1198 }
1199 
1200 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1201 			       int addr_len, int flags)
1202 {
1203 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1204 	struct sock *sk = sock->sk;
1205 	struct net *net = sock_net(sk);
1206 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1207 	struct sock *newsk = NULL;
1208 	struct sock *other = NULL;
1209 	struct sk_buff *skb = NULL;
1210 	unsigned int hash;
1211 	int st;
1212 	int err;
1213 	long timeo;
1214 
1215 	err = unix_mkname(sunaddr, addr_len, &hash);
1216 	if (err < 0)
1217 		goto out;
1218 	addr_len = err;
1219 
1220 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1221 	    (err = unix_autobind(sock)) != 0)
1222 		goto out;
1223 
1224 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1225 
1226 	/* First of all, allocate resources.
1227 	   If we did this after the state was locked,
1228 	   we would have to recheck everything again in any case.
1229 	 */
1230 
1231 	err = -ENOMEM;
1232 
1233 	/* create new sock for complete connection */
1234 	newsk = unix_create1(sock_net(sk), NULL, 0);
1235 	if (newsk == NULL)
1236 		goto out;
1237 
1238 	/* Allocate skb for sending to listening sock */
1239 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1240 	if (skb == NULL)
1241 		goto out;
1242 
1243 restart:
1244 	/*  Find listening sock. */
1245 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1246 	if (!other)
1247 		goto out;
1248 
1249 	/* Latch state of peer */
1250 	unix_state_lock(other);
1251 
1252 	/* Apparently VFS overslept socket death. Retry. */
1253 	if (sock_flag(other, SOCK_DEAD)) {
1254 		unix_state_unlock(other);
1255 		sock_put(other);
1256 		goto restart;
1257 	}
1258 
1259 	err = -ECONNREFUSED;
1260 	if (other->sk_state != TCP_LISTEN)
1261 		goto out_unlock;
1262 	if (other->sk_shutdown & RCV_SHUTDOWN)
1263 		goto out_unlock;
1264 
1265 	if (unix_recvq_full(other)) {
1266 		err = -EAGAIN;
1267 		if (!timeo)
1268 			goto out_unlock;
1269 
1270 		timeo = unix_wait_for_peer(other, timeo);
1271 
1272 		err = sock_intr_errno(timeo);
1273 		if (signal_pending(current))
1274 			goto out;
1275 		sock_put(other);
1276 		goto restart;
1277 	}
1278 
1279 	/* Latch our state.
1280 
1281 	   This is a tricky place. We need to grab our state lock and cannot
1282 	   drop the lock on the peer. It is dangerous because deadlock is
1283 	   possible. The connect-to-self case and a simultaneous
1284 	   attempt to connect are eliminated by checking the socket
1285 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN, we
1286 	   check this before attempting to grab the lock.
1287 
1288 	   Well, and we have to recheck the state after the socket is locked.
1289 	 */
1290 	st = sk->sk_state;
1291 
1292 	switch (st) {
1293 	case TCP_CLOSE:
1294 		/* This is ok... continue with connect */
1295 		break;
1296 	case TCP_ESTABLISHED:
1297 		/* Socket is already connected */
1298 		err = -EISCONN;
1299 		goto out_unlock;
1300 	default:
1301 		err = -EINVAL;
1302 		goto out_unlock;
1303 	}
1304 
1305 	unix_state_lock_nested(sk);
1306 
1307 	if (sk->sk_state != st) {
1308 		unix_state_unlock(sk);
1309 		unix_state_unlock(other);
1310 		sock_put(other);
1311 		goto restart;
1312 	}
1313 
1314 	err = security_unix_stream_connect(sk, other, newsk);
1315 	if (err) {
1316 		unix_state_unlock(sk);
1317 		goto out_unlock;
1318 	}
1319 
1320 	/* The way is open! Quickly set all the necessary fields... */
1321 
1322 	sock_hold(sk);
1323 	unix_peer(newsk)	= sk;
1324 	newsk->sk_state		= TCP_ESTABLISHED;
1325 	newsk->sk_type		= sk->sk_type;
1326 	init_peercred(newsk);
1327 	newu = unix_sk(newsk);
1328 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1329 	otheru = unix_sk(other);
1330 
1331 	/* copy address information from listening to new sock
1332 	 *
1333 	 * The contents of *(otheru->addr) and otheru->path
1334 	 * are seen fully set up here, since we have found
1335 	 * otheru in hash under unix_table_lock.  Insertion
1336 	 * into the hash chain we'd found it in had been done
1337 	 * in an earlier critical area protected by unix_table_lock,
1338 	 * the same one where we'd set *(otheru->addr) contents,
1339 	 * as well as otheru->path and otheru->addr itself.
1340 	 *
1341 	 * Using smp_store_release() here to set newu->addr
1342 	 * is enough to make those stores, as well as stores
1343 	 * to newu->path visible to anyone who gets newu->addr
1344 	 * by smp_load_acquire().  IOW, the same warranties
1345 	 * as for unix_sock instances bound in unix_bind() or
1346 	 * in unix_autobind().
1347 	 */
1348 	if (otheru->path.dentry) {
1349 		path_get(&otheru->path);
1350 		newu->path = otheru->path;
1351 	}
1352 	refcount_inc(&otheru->addr->refcnt);
1353 	smp_store_release(&newu->addr, otheru->addr);
1354 
1355 	/* Set credentials */
1356 	copy_peercred(sk, other);
1357 
1358 	sock->state	= SS_CONNECTED;
1359 	sk->sk_state	= TCP_ESTABLISHED;
1360 	sock_hold(newsk);
1361 
1362 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1363 	unix_peer(sk)	= newsk;
1364 
1365 	unix_state_unlock(sk);
1366 
1367 	/* take ten and send info to the listening sock */
1368 	spin_lock(&other->sk_receive_queue.lock);
1369 	__skb_queue_tail(&other->sk_receive_queue, skb);
1370 	spin_unlock(&other->sk_receive_queue.lock);
1371 	unix_state_unlock(other);
1372 	other->sk_data_ready(other);
1373 	sock_put(other);
1374 	return 0;
1375 
1376 out_unlock:
1377 	if (other)
1378 		unix_state_unlock(other);
1379 
1380 out:
1381 	kfree_skb(skb);
1382 	if (newsk)
1383 		unix_release_sock(newsk, 0);
1384 	if (other)
1385 		sock_put(other);
1386 	return err;
1387 }
1388 
1389 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1390 {
1391 	struct sock *ska = socka->sk, *skb = sockb->sk;
1392 
1393 	/* Join our sockets back to back */
1394 	sock_hold(ska);
1395 	sock_hold(skb);
1396 	unix_peer(ska) = skb;
1397 	unix_peer(skb) = ska;
1398 	init_peercred(ska);
1399 	init_peercred(skb);
1400 
1401 	if (ska->sk_type != SOCK_DGRAM) {
1402 		ska->sk_state = TCP_ESTABLISHED;
1403 		skb->sk_state = TCP_ESTABLISHED;
1404 		socka->state  = SS_CONNECTED;
1405 		sockb->state  = SS_CONNECTED;
1406 	}
1407 	return 0;
1408 }
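/* Canonical use of the operation above (editor's sketch): socketpair()
 * returns two already-connected sockets, often used to hand one end to
 * a child process:
 *
 *	int sv[2];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	if (fork() == 0) {
 *		close(sv[0]);
 *		// the child talks on sv[1]; both ends carry the
 *		// peer credentials set by init_peercred() above
 *	}
 */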
1409 
1410 static void unix_sock_inherit_flags(const struct socket *old,
1411 				    struct socket *new)
1412 {
1413 	if (test_bit(SOCK_PASSCRED, &old->flags))
1414 		set_bit(SOCK_PASSCRED, &new->flags);
1415 	if (test_bit(SOCK_PASSSEC, &old->flags))
1416 		set_bit(SOCK_PASSSEC, &new->flags);
1417 }
1418 
1419 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1420 		       bool kern)
1421 {
1422 	struct sock *sk = sock->sk;
1423 	struct sock *tsk;
1424 	struct sk_buff *skb;
1425 	int err;
1426 
1427 	err = -EOPNOTSUPP;
1428 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1429 		goto out;
1430 
1431 	err = -EINVAL;
1432 	if (sk->sk_state != TCP_LISTEN)
1433 		goto out;
1434 
1435 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1436 	 * so no locks are necessary.
1437 	 */
1438 
1439 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1440 	if (!skb) {
1441 		/* This means receive shutdown. */
1442 		if (err == 0)
1443 			err = -EINVAL;
1444 		goto out;
1445 	}
1446 
1447 	tsk = skb->sk;
1448 	skb_free_datagram(sk, skb);
1449 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1450 
1451 	/* attach accepted sock to socket */
1452 	unix_state_lock(tsk);
1453 	newsock->state = SS_CONNECTED;
1454 	unix_sock_inherit_flags(sock, newsock);
1455 	sock_graft(tsk, newsock);
1456 	unix_state_unlock(tsk);
1457 	return 0;
1458 
1459 out:
1460 	return err;
1461 }
1462 
1463 
1464 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1465 {
1466 	struct sock *sk = sock->sk;
1467 	struct unix_address *addr;
1468 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1469 	int err = 0;
1470 
1471 	if (peer) {
1472 		sk = unix_peer_get(sk);
1473 
1474 		err = -ENOTCONN;
1475 		if (!sk)
1476 			goto out;
1477 		err = 0;
1478 	} else {
1479 		sock_hold(sk);
1480 	}
1481 
1482 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1483 	if (!addr) {
1484 		sunaddr->sun_family = AF_UNIX;
1485 		sunaddr->sun_path[0] = 0;
1486 		err = sizeof(short);
1487 	} else {
1488 		err = addr->len;
1489 		memcpy(sunaddr, addr->name, addr->len);
1490 	}
1491 	sock_put(sk);
1492 out:
1493 	return err;
1494 }
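/* Caller-side note (editor's sketch): abstract names are not
 * NUL-terminated, so the returned length - not strlen() - is what
 * delimits them:
 *
 *	struct sockaddr_un a;
 *	socklen_t alen = sizeof(a);
 *
 *	getsockname(fd, (struct sockaddr *)&a, &alen);
 *	if (alen > offsetof(struct sockaddr_un, sun_path) &&
 *	    a.sun_path[0] == '\0')
 *		;	// abstract; name length is alen minus the offset
 */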
1495 
1496 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1497 {
1498 	int err = 0;
1499 
1500 	UNIXCB(skb).pid  = get_pid(scm->pid);
1501 	UNIXCB(skb).uid = scm->creds.uid;
1502 	UNIXCB(skb).gid = scm->creds.gid;
1503 	UNIXCB(skb).fp = NULL;
1504 	unix_get_secdata(scm, skb);
1505 	if (scm->fp && send_fds)
1506 		err = unix_attach_fds(scm, skb);
1507 
1508 	skb->destructor = unix_destruct_scm;
1509 	return err;
1510 }
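/* Userspace counterpart of the fd attachment above (an illustrative
 * sketch): descriptors travel as SCM_RIGHTS control messages, which
 * unix_attach_fds() pins to the skb on the sending side:
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = (void *)"x", .iov_len = 1 };
 *	struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1,
 *			     .msg_control = cbuf,
 *			     .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *c = CMSG_FIRSTHDR(&mh);
 *
 *	c->cmsg_level = SOL_SOCKET;
 *	c->cmsg_type  = SCM_RIGHTS;
 *	c->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(c), &fd_to_pass, sizeof(int));
 *	sendmsg(sock, &mh, 0);
 */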
1511 
1512 static bool unix_passcred_enabled(const struct socket *sock,
1513 				  const struct sock *other)
1514 {
1515 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1516 	       !other->sk_socket ||
1517 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1518 }
1519 
1520 /*
1521  * Some apps rely on write() giving SCM_CREDENTIALS.
1522  * We include credentials if the source or destination socket
1523  * asserted SOCK_PASSCRED.
1524  */
1525 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1526 			    const struct sock *other)
1527 {
1528 	if (UNIXCB(skb).pid)
1529 		return;
1530 	if (unix_passcred_enabled(sock, other)) {
1531 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1532 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1533 	}
1534 }
1535 
1536 static int maybe_init_creds(struct scm_cookie *scm,
1537 			    struct socket *socket,
1538 			    const struct sock *other)
1539 {
1540 	int err;
1541 	struct msghdr msg = { .msg_controllen = 0 };
1542 
1543 	err = scm_send(socket, &msg, scm, false);
1544 	if (err)
1545 		return err;
1546 
1547 	if (unix_passcred_enabled(socket, other)) {
1548 		scm->pid = get_pid(task_tgid(current));
1549 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1550 	}
1551 	return err;
1552 }
1553 
1554 static bool unix_skb_scm_eq(struct sk_buff *skb,
1555 			    struct scm_cookie *scm)
1556 {
1557 	const struct unix_skb_parms *u = &UNIXCB(skb);
1558 
1559 	return u->pid == scm->pid &&
1560 	       uid_eq(u->uid, scm->creds.uid) &&
1561 	       gid_eq(u->gid, scm->creds.gid) &&
1562 	       unix_secdata_eq(scm, skb);
1563 }
1564 
1565 /*
1566  *	Send AF_UNIX data.
1567  */
1568 
1569 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1570 			      size_t len)
1571 {
1572 	struct sock *sk = sock->sk;
1573 	struct net *net = sock_net(sk);
1574 	struct unix_sock *u = unix_sk(sk);
1575 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1576 	struct sock *other = NULL;
1577 	int namelen = 0; /* fake GCC */
1578 	int err;
1579 	unsigned int hash;
1580 	struct sk_buff *skb;
1581 	long timeo;
1582 	struct scm_cookie scm;
1583 	int data_len = 0;
1584 	int sk_locked;
1585 
1586 	wait_for_unix_gc();
1587 	err = scm_send(sock, msg, &scm, false);
1588 	if (err < 0)
1589 		return err;
1590 
1591 	err = -EOPNOTSUPP;
1592 	if (msg->msg_flags&MSG_OOB)
1593 		goto out;
1594 
1595 	if (msg->msg_namelen) {
1596 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1597 		if (err < 0)
1598 			goto out;
1599 		namelen = err;
1600 	} else {
1601 		sunaddr = NULL;
1602 		err = -ENOTCONN;
1603 		other = unix_peer_get(sk);
1604 		if (!other)
1605 			goto out;
1606 	}
1607 
1608 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1609 	    && (err = unix_autobind(sock)) != 0)
1610 		goto out;
1611 
1612 	err = -EMSGSIZE;
1613 	if (len > sk->sk_sndbuf - 32)
1614 		goto out;
1615 
1616 	if (len > SKB_MAX_ALLOC) {
1617 		data_len = min_t(size_t,
1618 				 len - SKB_MAX_ALLOC,
1619 				 MAX_SKB_FRAGS * PAGE_SIZE);
1620 		data_len = PAGE_ALIGN(data_len);
1621 
1622 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1623 	}
1624 
1625 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1626 				   msg->msg_flags & MSG_DONTWAIT, &err,
1627 				   PAGE_ALLOC_COSTLY_ORDER);
1628 	if (skb == NULL)
1629 		goto out;
1630 
1631 	err = unix_scm_to_skb(&scm, skb, true);
1632 	if (err < 0)
1633 		goto out_free;
1634 
1635 	skb_put(skb, len - data_len);
1636 	skb->data_len = data_len;
1637 	skb->len = len;
1638 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1639 	if (err)
1640 		goto out_free;
1641 
1642 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1643 
1644 restart:
1645 	if (!other) {
1646 		err = -ECONNRESET;
1647 		if (sunaddr == NULL)
1648 			goto out_free;
1649 
1650 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1651 					hash, &err);
1652 		if (other == NULL)
1653 			goto out_free;
1654 	}
1655 
1656 	if (sk_filter(other, skb) < 0) {
1657 		/* Toss the packet but do not return any error to the sender */
1658 		err = len;
1659 		goto out_free;
1660 	}
1661 
1662 	sk_locked = 0;
1663 	unix_state_lock(other);
1664 restart_locked:
1665 	err = -EPERM;
1666 	if (!unix_may_send(sk, other))
1667 		goto out_unlock;
1668 
1669 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1670 		/*
1671 		 *	Check with 1003.1g - what should a
1672 		 *	datagram error return here?
1673 		 */
1674 		unix_state_unlock(other);
1675 		sock_put(other);
1676 
1677 		if (!sk_locked)
1678 			unix_state_lock(sk);
1679 
1680 		err = 0;
1681 		if (unix_peer(sk) == other) {
1682 			unix_peer(sk) = NULL;
1683 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1684 
1685 			unix_state_unlock(sk);
1686 
1687 			unix_dgram_disconnected(sk, other);
1688 			sock_put(other);
1689 			err = -ECONNREFUSED;
1690 		} else {
1691 			unix_state_unlock(sk);
1692 		}
1693 
1694 		other = NULL;
1695 		if (err)
1696 			goto out_free;
1697 		goto restart;
1698 	}
1699 
1700 	err = -EPIPE;
1701 	if (other->sk_shutdown & RCV_SHUTDOWN)
1702 		goto out_unlock;
1703 
1704 	if (sk->sk_type != SOCK_SEQPACKET) {
1705 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1706 		if (err)
1707 			goto out_unlock;
1708 	}
1709 
1710 	/* other == sk && unix_peer(other) != sk if
1711 	 * - unix_peer(sk) == NULL, destination address bound to sk
1712 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
1713 	 */
1714 	if (other != sk &&
1715 	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
1716 		if (timeo) {
1717 			timeo = unix_wait_for_peer(other, timeo);
1718 
1719 			err = sock_intr_errno(timeo);
1720 			if (signal_pending(current))
1721 				goto out_free;
1722 
1723 			goto restart;
1724 		}
1725 
1726 		if (!sk_locked) {
1727 			unix_state_unlock(other);
1728 			unix_state_double_lock(sk, other);
1729 		}
1730 
1731 		if (unix_peer(sk) != other ||
1732 		    unix_dgram_peer_wake_me(sk, other)) {
1733 			err = -EAGAIN;
1734 			sk_locked = 1;
1735 			goto out_unlock;
1736 		}
1737 
1738 		if (!sk_locked) {
1739 			sk_locked = 1;
1740 			goto restart_locked;
1741 		}
1742 	}
1743 
1744 	if (unlikely(sk_locked))
1745 		unix_state_unlock(sk);
1746 
1747 	if (sock_flag(other, SOCK_RCVTSTAMP))
1748 		__net_timestamp(skb);
1749 	maybe_add_creds(skb, sock, other);
1750 	skb_queue_tail(&other->sk_receive_queue, skb);
1751 	unix_state_unlock(other);
1752 	other->sk_data_ready(other);
1753 	sock_put(other);
1754 	scm_destroy(&scm);
1755 	return len;
1756 
1757 out_unlock:
1758 	if (sk_locked)
1759 		unix_state_unlock(sk);
1760 	unix_state_unlock(other);
1761 out_free:
1762 	kfree_skb(skb);
1763 out:
1764 	if (other)
1765 		sock_put(other);
1766 	scm_destroy(&scm);
1767 	return err;
1768 }
1769 
1770 /* We use paged skbs for stream sockets, limiting occupancy to 32768
1771  * bytes per skb, with a minimum of a full page.
1772  */
1773 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1774 
1775 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1776 			       size_t len)
1777 {
1778 	struct sock *sk = sock->sk;
1779 	struct sock *other = NULL;
1780 	int err, size;
1781 	struct sk_buff *skb;
1782 	int sent = 0;
1783 	struct scm_cookie scm;
1784 	bool fds_sent = false;
1785 	int data_len;
1786 
1787 	wait_for_unix_gc();
1788 	err = scm_send(sock, msg, &scm, false);
1789 	if (err < 0)
1790 		return err;
1791 
1792 	err = -EOPNOTSUPP;
1793 	if (msg->msg_flags&MSG_OOB)
1794 		goto out_err;
1795 
1796 	if (msg->msg_namelen) {
1797 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1798 		goto out_err;
1799 	} else {
1800 		err = -ENOTCONN;
1801 		other = unix_peer(sk);
1802 		if (!other)
1803 			goto out_err;
1804 	}
1805 
1806 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1807 		goto pipe_err;
1808 
1809 	while (sent < len) {
1810 		size = len - sent;
1811 
1812 		/* Keep two messages in the pipe so it schedules better */
1813 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1814 
1815 		/* allow fallback to order-0 allocations */
1816 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1817 
1818 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1819 
1820 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1821 
1822 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1823 					   msg->msg_flags & MSG_DONTWAIT, &err,
1824 					   get_order(UNIX_SKB_FRAGS_SZ));
1825 		if (!skb)
1826 			goto out_err;
1827 
1828 		/* Only send the fds in the first buffer */
1829 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1830 		if (err < 0) {
1831 			kfree_skb(skb);
1832 			goto out_err;
1833 		}
1834 		fds_sent = true;
1835 
1836 		skb_put(skb, size - data_len);
1837 		skb->data_len = data_len;
1838 		skb->len = size;
1839 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1840 		if (err) {
1841 			kfree_skb(skb);
1842 			goto out_err;
1843 		}
1844 
1845 		unix_state_lock(other);
1846 
1847 		if (sock_flag(other, SOCK_DEAD) ||
1848 		    (other->sk_shutdown & RCV_SHUTDOWN))
1849 			goto pipe_err_free;
1850 
1851 		maybe_add_creds(skb, sock, other);
1852 		skb_queue_tail(&other->sk_receive_queue, skb);
1853 		unix_state_unlock(other);
1854 		other->sk_data_ready(other);
1855 		sent += size;
1856 	}
1857 
1858 	scm_destroy(&scm);
1859 
1860 	return sent;
1861 
1862 pipe_err_free:
1863 	unix_state_unlock(other);
1864 	kfree_skb(skb);
1865 pipe_err:
1866 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1867 		send_sig(SIGPIPE, current, 0);
1868 	err = -EPIPE;
1869 out_err:
1870 	scm_destroy(&scm);
1871 	return sent ? : err;
1872 }
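/* Userspace consequence of the pipe_err path above (editor's sketch):
 * writing to a peer that has shut down raises SIGPIPE unless the
 * sender opts out per call:
 *
 *	ssize_t n = send(fd, buf, len, MSG_NOSIGNAL);
 *
 *	if (n < 0 && errno == EPIPE)
 *		;	// the peer closed; no signal was delivered
 */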
1873 
1874 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1875 				    int offset, size_t size, int flags)
1876 {
1877 	int err;
1878 	bool send_sigpipe = false;
1879 	bool init_scm = true;
1880 	struct scm_cookie scm;
1881 	struct sock *other, *sk = socket->sk;
1882 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1883 
1884 	if (flags & MSG_OOB)
1885 		return -EOPNOTSUPP;
1886 
1887 	other = unix_peer(sk);
1888 	if (!other || sk->sk_state != TCP_ESTABLISHED)
1889 		return -ENOTCONN;
1890 
1891 	if (false) {
1892 alloc_skb:
1893 		unix_state_unlock(other);
1894 		mutex_unlock(&unix_sk(other)->iolock);
1895 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1896 					      &err, 0);
1897 		if (!newskb)
1898 			goto err;
1899 	}
1900 
1901 	/* we must acquire iolock as we modify already present
1902 	 * skbs in the sk_receive_queue and mess with skb->len
1903 	 */
1904 	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
1905 	if (err) {
1906 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1907 		goto err;
1908 	}
1909 
1910 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
1911 		err = -EPIPE;
1912 		send_sigpipe = true;
1913 		goto err_unlock;
1914 	}
1915 
1916 	unix_state_lock(other);
1917 
1918 	if (sock_flag(other, SOCK_DEAD) ||
1919 	    other->sk_shutdown & RCV_SHUTDOWN) {
1920 		err = -EPIPE;
1921 		send_sigpipe = true;
1922 		goto err_state_unlock;
1923 	}
1924 
1925 	if (init_scm) {
1926 		err = maybe_init_creds(&scm, socket, other);
1927 		if (err)
1928 			goto err_state_unlock;
1929 		init_scm = false;
1930 	}
1931 
1932 	skb = skb_peek_tail(&other->sk_receive_queue);
1933 	if (tail && tail == skb) {
1934 		skb = newskb;
1935 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
1936 		if (newskb) {
1937 			skb = newskb;
1938 		} else {
1939 			tail = skb;
1940 			goto alloc_skb;
1941 		}
1942 	} else if (newskb) {
1943 		/* this is the fast path; the newskb check is not
1944 		 * strictly needed, since calling consume_skb() with
1945 		 * newskb == NULL would do no harm
1946 		 */
1947 		consume_skb(newskb);
1948 		newskb = NULL;
1949 	}
1950 
1951 	if (skb_append_pagefrags(skb, page, offset, size)) {
1952 		tail = skb;
1953 		goto alloc_skb;
1954 	}
1955 
1956 	skb->len += size;
1957 	skb->data_len += size;
1958 	skb->truesize += size;
1959 	refcount_add(size, &sk->sk_wmem_alloc);
1960 
1961 	if (newskb) {
1962 		err = unix_scm_to_skb(&scm, skb, false);
1963 		if (err)
1964 			goto err_state_unlock;
1965 		spin_lock(&other->sk_receive_queue.lock);
1966 		__skb_queue_tail(&other->sk_receive_queue, newskb);
1967 		spin_unlock(&other->sk_receive_queue.lock);
1968 	}
1969 
1970 	unix_state_unlock(other);
1971 	mutex_unlock(&unix_sk(other)->iolock);
1972 
1973 	other->sk_data_ready(other);
1974 	scm_destroy(&scm);
1975 	return size;
1976 
1977 err_state_unlock:
1978 	unix_state_unlock(other);
1979 err_unlock:
1980 	mutex_unlock(&unix_sk(other)->iolock);
1981 err:
1982 	kfree_skb(newskb);
1983 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
1984 		send_sig(SIGPIPE, current, 0);
1985 	if (!init_scm)
1986 		scm_destroy(&scm);
1987 	return err;
1988 }
1989 
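/* SOCK_SEQPACKET reuses the datagram send path, but only on an
 * established connection, and any destination address is ignored.
 */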
1990 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
1991 				  size_t len)
1992 {
1993 	int err;
1994 	struct sock *sk = sock->sk;
1995 
1996 	err = sock_error(sk);
1997 	if (err)
1998 		return err;
1999 
2000 	if (sk->sk_state != TCP_ESTABLISHED)
2001 		return -ENOTCONN;
2002 
2003 	if (msg->msg_namelen)
2004 		msg->msg_namelen = 0;
2005 
2006 	return unix_dgram_sendmsg(sock, msg, len);
2007 }
2008 
2009 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2010 				  size_t size, int flags)
2011 {
2012 	struct sock *sk = sock->sk;
2013 
2014 	if (sk->sk_state != TCP_ESTABLISHED)
2015 		return -ENOTCONN;
2016 
2017 	return unix_dgram_recvmsg(sock, msg, size, flags);
2018 }
2019 
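/* Copy the sender's bound address, if it has one, into msg_name. */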
2020 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2021 {
2022 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2023 
2024 	if (addr) {
2025 		msg->msg_namelen = addr->len;
2026 		memcpy(msg->msg_name, addr->name, addr->len);
2027 	}
2028 }
2029 
2030 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2031 			      size_t size, int flags)
2032 {
2033 	struct scm_cookie scm;
2034 	struct sock *sk = sock->sk;
2035 	struct unix_sock *u = unix_sk(sk);
2036 	struct sk_buff *skb, *last;
2037 	long timeo;
2038 	int skip;
2039 	int err;
2040 
2041 	err = -EOPNOTSUPP;
2042 	if (flags & MSG_OOB)
2043 		goto out;
2044 
2045 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2046 
2047 	do {
2048 		mutex_lock(&u->iolock);
2049 
2050 		skip = sk_peek_offset(sk, flags);
2051 		skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err,
2052 					      &last);
2053 		if (skb)
2054 			break;
2055 
2056 		mutex_unlock(&u->iolock);
2057 
2058 		if (err != -EAGAIN)
2059 			break;
2060 	} while (timeo &&
2061 		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2062 
2063 	if (!skb) { /* implies iolock unlocked */
2064 		unix_state_lock(sk);
2065 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2066 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2067 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2068 			err = 0;
2069 		unix_state_unlock(sk);
2070 		goto out;
2071 	}
2072 
2073 	if (wq_has_sleeper(&u->peer_wait))
2074 		wake_up_interruptible_sync_poll(&u->peer_wait,
2075 						EPOLLOUT | EPOLLWRNORM |
2076 						EPOLLWRBAND);
2077 
2078 	if (msg->msg_name)
2079 		unix_copy_addr(msg, skb->sk);
2080 
2081 	if (size > skb->len - skip)
2082 		size = skb->len - skip;
2083 	else if (size < skb->len - skip)
2084 		msg->msg_flags |= MSG_TRUNC;
2085 
2086 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2087 	if (err)
2088 		goto out_free;
2089 
2090 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2091 		__sock_recv_timestamp(msg, sk, skb);
2092 
2093 	memset(&scm, 0, sizeof(scm));
2094 
2095 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2096 	unix_set_secdata(&scm, skb);
2097 
2098 	if (!(flags & MSG_PEEK)) {
2099 		if (UNIXCB(skb).fp)
2100 			unix_detach_fds(&scm, skb);
2101 
2102 		sk_peek_offset_bwd(sk, skb->len);
2103 	} else {
2104 		/* What to do on MSG_PEEK is debatable; we could:
2105 		 *  - not return fds at all: good, but too simple;
2106 		 *  - return fds, but not return them again on the
2107 		 *    actual read (the old strategy, apparently wrong);
2108 		 *  - clone the fds (chosen here as the most universal
2109 		 *    solution).
2110 		 *
2111 		 * POSIX 1003.1g does not actually define this clearly
2112 		 * at all; then again, POSIX 1003.1g leaves a lot of
2113 		 * things unclear!
2114 		 */
2116 
2117 		sk_peek_offset_fwd(sk, size);
2118 
2119 		if (UNIXCB(skb).fp)
2120 			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2121 	}
2122 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2123 
2124 	scm_recv(sock, msg, &scm, flags);
2125 
2126 out_free:
2127 	skb_free_datagram(sk, skb);
2128 	mutex_unlock(&u->iolock);
2129 out:
2130 	return err;
2131 }
2132 
2133 /*
2134  *	Sleep until more data has arrived, but check for races.
2135  */
2136 static long unix_stream_data_wait(struct sock *sk, long timeo,
2137 				  struct sk_buff *last, unsigned int last_len,
2138 				  bool freezable)
2139 {
2140 	struct sk_buff *tail;
2141 	DEFINE_WAIT(wait);
2142 
2143 	unix_state_lock(sk);
2144 
2145 	for (;;) {
2146 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2147 
2148 		tail = skb_peek_tail(&sk->sk_receive_queue);
2149 		if (tail != last ||
2150 		    (tail && tail->len != last_len) ||
2151 		    sk->sk_err ||
2152 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2153 		    signal_pending(current) ||
2154 		    !timeo)
2155 			break;
2156 
2157 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2158 		unix_state_unlock(sk);
2159 		if (freezable)
2160 			timeo = freezable_schedule_timeout(timeo);
2161 		else
2162 			timeo = schedule_timeout(timeo);
2163 		unix_state_lock(sk);
2164 
2165 		if (sock_flag(sk, SOCK_DEAD))
2166 			break;
2167 
2168 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2169 	}
2170 
2171 	finish_wait(sk_sleep(sk), &wait);
2172 	unix_state_unlock(sk);
2173 	return timeo;
2174 }
2175 
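/* Bytes of @skb that a stream reader has not yet consumed. */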
2176 static unsigned int unix_skb_len(const struct sk_buff *skb)
2177 {
2178 	return skb->len - UNIXCB(skb).consumed;
2179 }
2180 
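/* Per-call state for unix_stream_read_generic(); the recv_actor
 * callback distinguishes the recvmsg() and splice() receive paths.
 */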
2181 struct unix_stream_read_state {
2182 	int (*recv_actor)(struct sk_buff *, int, int,
2183 			  struct unix_stream_read_state *);
2184 	struct socket *socket;
2185 	struct msghdr *msg;
2186 	struct pipe_inode_info *pipe;
2187 	size_t size;
2188 	int flags;
2189 	unsigned int splice_flags;
2190 };
2191 
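/* Core stream receive loop shared by recvmsg() and splice(): walk
 * sk_receive_queue and feed each skb to state->recv_actor until the
 * request is satisfied or no more data can be had.
 */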
2192 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2193 				    bool freezable)
2194 {
2195 	struct scm_cookie scm;
2196 	struct socket *sock = state->socket;
2197 	struct sock *sk = sock->sk;
2198 	struct unix_sock *u = unix_sk(sk);
2199 	int copied = 0;
2200 	int flags = state->flags;
2201 	int noblock = flags & MSG_DONTWAIT;
2202 	bool check_creds = false;
2203 	int target;
2204 	int err = 0;
2205 	long timeo;
2206 	int skip;
2207 	size_t size = state->size;
2208 	unsigned int last_len;
2209 
2210 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2211 		err = -EINVAL;
2212 		goto out;
2213 	}
2214 
2215 	if (unlikely(flags & MSG_OOB)) {
2216 		err = -EOPNOTSUPP;
2217 		goto out;
2218 	}
2219 
2220 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2221 	timeo = sock_rcvtimeo(sk, noblock);
2222 
2223 	memset(&scm, 0, sizeof(scm));
2224 
2225 	/* Lock the socket so the queue cannot be reordered while we
2226 	 * sleep copying data out to the message.
2227 	 */
2228 	mutex_lock(&u->iolock);
2229 
2230 	skip = max(sk_peek_offset(sk, flags), 0);
2231 
2232 	do {
2233 		int chunk;
2234 		bool drop_skb;
2235 		struct sk_buff *skb, *last;
2236 
2237 redo:
2238 		unix_state_lock(sk);
2239 		if (sock_flag(sk, SOCK_DEAD)) {
2240 			err = -ECONNRESET;
2241 			goto unlock;
2242 		}
2243 		last = skb = skb_peek(&sk->sk_receive_queue);
2244 		last_len = last ? last->len : 0;
2245 again:
2246 		if (skb == NULL) {
2247 			if (copied >= target)
2248 				goto unlock;
2249 
2250 			/*
2251 			 *	POSIX 1003.1g mandates this order.
2252 			 */
2253 
2254 			err = sock_error(sk);
2255 			if (err)
2256 				goto unlock;
2257 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2258 				goto unlock;
2259 
2260 			unix_state_unlock(sk);
2261 			if (!timeo) {
2262 				err = -EAGAIN;
2263 				break;
2264 			}
2265 
2266 			mutex_unlock(&u->iolock);
2267 
2268 			timeo = unix_stream_data_wait(sk, timeo, last,
2269 						      last_len, freezable);
2270 
2271 			if (signal_pending(current)) {
2272 				err = sock_intr_errno(timeo);
2273 				scm_destroy(&scm);
2274 				goto out;
2275 			}
2276 
2277 			mutex_lock(&u->iolock);
2278 			goto redo;
2279 unlock:
2280 			unix_state_unlock(sk);
2281 			break;
2282 		}
2283 
2284 		while (skip >= unix_skb_len(skb)) {
2285 			skip -= unix_skb_len(skb);
2286 			last = skb;
2287 			last_len = skb->len;
2288 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2289 			if (!skb)
2290 				goto again;
2291 		}
2292 
2293 		unix_state_unlock(sk);
2294 
2295 		if (check_creds) {
2296 			/* Never glue messages from different writers */
2297 			if (!unix_skb_scm_eq(skb, &scm))
2298 				break;
2299 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2300 			/* Copy credentials */
2301 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2302 			unix_set_secdata(&scm, skb);
2303 			check_creds = true;
2304 		}
2305 
2306 		/* Copy address just once */
2307 		if (state->msg && state->msg->msg_name)
2308 			unix_copy_addr(state->msg, skb->sk);
2313 
2314 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2315 		skb_get(skb);
2316 		chunk = state->recv_actor(skb, skip, chunk, state);
2317 		drop_skb = !unix_skb_len(skb);
2318 		/* skb is only safe to use if !drop_skb */
2319 		consume_skb(skb);
2320 		if (chunk < 0) {
2321 			if (copied == 0)
2322 				copied = -EFAULT;
2323 			break;
2324 		}
2325 		copied += chunk;
2326 		size -= chunk;
2327 
2328 		if (drop_skb) {
2329 			/* The skb was consumed by a concurrent reader, so
2330 			 * it is no longer valid to us; we can be sure it
2331 			 * has already been dropped from the socket queue.
2332 			 *
2333 			 * Report a short read.
2334 			 */
2336 			err = 0;
2337 			break;
2338 		}
2339 
2340 		/* Mark read part of skb as used */
2341 		if (!(flags & MSG_PEEK)) {
2342 			UNIXCB(skb).consumed += chunk;
2343 
2344 			sk_peek_offset_bwd(sk, chunk);
2345 
2346 			if (UNIXCB(skb).fp)
2347 				unix_detach_fds(&scm, skb);
2348 
2349 			if (unix_skb_len(skb))
2350 				break;
2351 
2352 			skb_unlink(skb, &sk->sk_receive_queue);
2353 			consume_skb(skb);
2354 
2355 			if (scm.fp)
2356 				break;
2357 		} else {
2358 			/* It is questionable; see the note in
2359 			 * unix_dgram_recvmsg().
2360 			 */
2360 			if (UNIXCB(skb).fp)
2361 				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2362 
2363 			sk_peek_offset_fwd(sk, chunk);
2364 
2365 			if (UNIXCB(skb).fp)
2366 				break;
2367 
2368 			skip = 0;
2369 			last = skb;
2370 			last_len = skb->len;
2371 			unix_state_lock(sk);
2372 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2373 			if (skb)
2374 				goto again;
2375 			unix_state_unlock(sk);
2376 			break;
2377 		}
2378 	} while (size);
2379 
2380 	mutex_unlock(&u->iolock);
2381 	if (state->msg)
2382 		scm_recv(sock, state->msg, &scm, flags);
2383 	else
2384 		scm_destroy(&scm);
2385 out:
2386 	return copied ? : err;
2387 }
2388 
2389 static int unix_stream_read_actor(struct sk_buff *skb,
2390 				  int skip, int chunk,
2391 				  struct unix_stream_read_state *state)
2392 {
2393 	int ret;
2394 
2395 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2396 				    state->msg, chunk);
2397 	return ret ?: chunk;
2398 }
2399 
2400 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2401 			       size_t size, int flags)
2402 {
2403 	struct unix_stream_read_state state = {
2404 		.recv_actor = unix_stream_read_actor,
2405 		.socket = sock,
2406 		.msg = msg,
2407 		.size = size,
2408 		.flags = flags
2409 	};
2410 
2411 	return unix_stream_read_generic(&state, true);
2412 }
2413 
2414 static int unix_stream_splice_actor(struct sk_buff *skb,
2415 				    int skip, int chunk,
2416 				    struct unix_stream_read_state *state)
2417 {
2418 	return skb_splice_bits(skb, state->socket->sk,
2419 			       UNIXCB(skb).consumed + skip,
2420 			       state->pipe, chunk, state->splice_flags);
2421 }
2422 
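/* Backs splice(2) reads from a stream socket into a pipe. Userspace
 * sketch (error handling omitted):
 *
 *	splice(sockfd, NULL, pipefd[1], NULL, 4096, 0);
 */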
2423 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2424 				       struct pipe_inode_info *pipe,
2425 				       size_t size, unsigned int flags)
2426 {
2427 	struct unix_stream_read_state state = {
2428 		.recv_actor = unix_stream_splice_actor,
2429 		.socket = sock,
2430 		.pipe = pipe,
2431 		.size = size,
2432 		.splice_flags = flags,
2433 	};
2434 
2435 	if (unlikely(*ppos))
2436 		return -ESPIPE;
2437 
2438 	if (sock->file->f_flags & O_NONBLOCK ||
2439 	    flags & SPLICE_F_NONBLOCK)
2440 		state.flags = MSG_DONTWAIT;
2441 
2442 	return unix_stream_read_generic(&state, false);
2443 }
2444 
2445 static int unix_shutdown(struct socket *sock, int mode)
2446 {
2447 	struct sock *sk = sock->sk;
2448 	struct sock *other;
2449 
2450 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2451 		return -EINVAL;
2452 	/* This maps:
2453 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2454 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2455 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2456 	 */
2457 	++mode;
2458 
2459 	unix_state_lock(sk);
2460 	sk->sk_shutdown |= mode;
2461 	other = unix_peer(sk);
2462 	if (other)
2463 		sock_hold(other);
2464 	unix_state_unlock(sk);
2465 	sk->sk_state_change(sk);
2466 
2467 	if (other &&
2468 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2469 
2470 		int peer_mode = 0;
2471 
2472 		if (mode & RCV_SHUTDOWN)
2473 			peer_mode |= SEND_SHUTDOWN;
2474 		if (mode & SEND_SHUTDOWN)
2475 			peer_mode |= RCV_SHUTDOWN;
2476 		unix_state_lock(other);
2477 		other->sk_shutdown |= peer_mode;
2478 		unix_state_unlock(other);
2479 		other->sk_state_change(other);
2480 		if (peer_mode == SHUTDOWN_MASK)
2481 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2482 		else if (peer_mode & RCV_SHUTDOWN)
2483 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2484 	}
2485 	if (other)
2486 		sock_put(other);
2487 
2488 	return 0;
2489 }
2490 
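/* Bytes available for reading: all queued stream data for
 * SOCK_STREAM/SOCK_SEQPACKET, else the size of the next datagram.
 * Returns -EINVAL on a listening socket.
 */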
2491 long unix_inq_len(struct sock *sk)
2492 {
2493 	struct sk_buff *skb;
2494 	long amount = 0;
2495 
2496 	if (sk->sk_state == TCP_LISTEN)
2497 		return -EINVAL;
2498 
2499 	spin_lock(&sk->sk_receive_queue.lock);
2500 	if (sk->sk_type == SOCK_STREAM ||
2501 	    sk->sk_type == SOCK_SEQPACKET) {
2502 		skb_queue_walk(&sk->sk_receive_queue, skb)
2503 			amount += unix_skb_len(skb);
2504 	} else {
2505 		skb = skb_peek(&sk->sk_receive_queue);
2506 		if (skb)
2507 			amount = skb->len;
2508 	}
2509 	spin_unlock(&sk->sk_receive_queue.lock);
2510 
2511 	return amount;
2512 }
2513 EXPORT_SYMBOL_GPL(unix_inq_len);
2514 
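/* Bytes queued for sending, i.e. still charged to sk_wmem_alloc. */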
2515 long unix_outq_len(struct sock *sk)
2516 {
2517 	return sk_wmem_alloc_get(sk);
2518 }
2519 EXPORT_SYMBOL_GPL(unix_outq_len);
2520 
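/* SIOCUNIXFILE: hand the caller an O_CLOEXEC, O_PATH file descriptor
 * for the filesystem object this socket is bound to.
 */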
2521 static int unix_open_file(struct sock *sk)
2522 {
2523 	struct path path;
2524 	struct file *f;
2525 	int fd;
2526 
2527 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2528 		return -EPERM;
2529 
2530 	if (!smp_load_acquire(&unix_sk(sk)->addr))
2531 		return -ENOENT;
2532 
2533 	path = unix_sk(sk)->path;
2534 	if (!path.dentry)
2535 		return -ENOENT;
2536 
2537 	path_get(&path);
2538 
2539 	fd = get_unused_fd_flags(O_CLOEXEC);
2540 	if (fd < 0)
2541 		goto out;
2542 
2543 	f = dentry_open(&path, O_PATH, current_cred());
2544 	if (IS_ERR(f)) {
2545 		put_unused_fd(fd);
2546 		fd = PTR_ERR(f);
2547 		goto out;
2548 	}
2549 
2550 	fd_install(fd, f);
2551 out:
2552 	path_put(&path);
2553 
2554 	return fd;
2555 }
2556 
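/* ioctl handler. Userspace sketch (error handling omitted):
 *
 *	int pending;
 *	ioctl(fd, SIOCINQ, &pending);       // bytes readable
 *	int pfd = ioctl(fd, SIOCUNIXFILE);  // O_PATH fd, needs CAP_NET_ADMIN
 */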
2557 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2558 {
2559 	struct sock *sk = sock->sk;
2560 	long amount = 0;
2561 	int err;
2562 
2563 	switch (cmd) {
2564 	case SIOCOUTQ:
2565 		amount = unix_outq_len(sk);
2566 		err = put_user(amount, (int __user *)arg);
2567 		break;
2568 	case SIOCINQ:
2569 		amount = unix_inq_len(sk);
2570 		if (amount < 0)
2571 			err = amount;
2572 		else
2573 			err = put_user(amount, (int __user *)arg);
2574 		break;
2575 	case SIOCUNIXFILE:
2576 		err = unix_open_file(sk);
2577 		break;
2578 	default:
2579 		err = -ENOIOCTLCMD;
2580 		break;
2581 	}
2582 	return err;
2583 }
2584 
2585 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2586 {
2587 	struct sock *sk = sock->sk;
2588 	__poll_t mask;
2589 
2590 	sock_poll_wait(file, sock, wait);
2591 	mask = 0;
2592 
2593 	/* exceptional events? */
2594 	if (sk->sk_err)
2595 		mask |= EPOLLERR;
2596 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2597 		mask |= EPOLLHUP;
2598 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2599 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2600 
2601 	/* readable? */
2602 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2603 		mask |= EPOLLIN | EPOLLRDNORM;
2604 
2605 	/* Connection-based sockets need to check for termination and startup */
2606 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2607 	    sk->sk_state == TCP_CLOSE)
2608 		mask |= EPOLLHUP;
2609 
2610 	/*
2611 	 * We also report writable when the other side has shut down
2612 	 * the connection; this prevents sockets from getting stuck.
2613 	 */
2614 	if (unix_writable(sk))
2615 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2616 
2617 	return mask;
2618 }
2619 
2620 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2621 				    poll_table *wait)
2622 {
2623 	struct sock *sk = sock->sk, *other;
2624 	unsigned int writable;
2625 	__poll_t mask;
2626 
2627 	sock_poll_wait(file, sock, wait);
2628 	mask = 0;
2629 
2630 	/* exceptional events? */
2631 	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
2632 		mask |= EPOLLERR |
2633 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
2634 
2635 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2636 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2637 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2638 		mask |= EPOLLHUP;
2639 
2640 	/* readable? */
2641 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2642 		mask |= EPOLLIN | EPOLLRDNORM;
2643 
2644 	/* Connection-based sockets need to check for termination and startup */
2645 	if (sk->sk_type == SOCK_SEQPACKET) {
2646 		if (sk->sk_state == TCP_CLOSE)
2647 			mask |= EPOLLHUP;
2648 		/* connection hasn't started yet? */
2649 		if (sk->sk_state == TCP_SYN_SENT)
2650 			return mask;
2651 	}
2652 
2653 	/* No write status requested, avoid expensive OUT tests. */
2654 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
2655 		return mask;
2656 
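	/* Even a writable socket is reported unwritable while the
	 * peer's receive queue is full; unix_dgram_peer_wake_me()
	 * arms a wakeup for when space frees up.
	 */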
2657 	writable = unix_writable(sk);
2658 	if (writable) {
2659 		unix_state_lock(sk);
2660 
2661 		other = unix_peer(sk);
2662 		if (other && unix_peer(other) != sk &&
2663 		    unix_recvq_full(other) &&
2664 		    unix_dgram_peer_wake_me(sk, other))
2665 			writable = 0;
2666 
2667 		unix_state_unlock(sk);
2668 	}
2669 
2670 	if (writable)
2671 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2672 	else
2673 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2674 
2675 	return mask;
2676 }
2677 
2678 #ifdef CONFIG_PROC_FS
2679 
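/* The /proc/net/unix iterator encodes its position in a single loff_t:
 * the hash bucket index in the high bits and a 1-based offset within
 * the bucket in the low BUCKET_SPACE bits.
 */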
2680 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2681 
2682 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2683 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2684 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2685 
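/* Return the socket at the stored offset within the stored bucket,
 * counting only sockets that belong to this namespace.
 */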
2686 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2687 {
2688 	unsigned long offset = get_offset(*pos);
2689 	unsigned long bucket = get_bucket(*pos);
2690 	struct sock *sk;
2691 	unsigned long count = 0;
2692 
2693 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2694 		if (sock_net(sk) != seq_file_net(seq))
2695 			continue;
2696 		if (++count == offset)
2697 			break;
2698 	}
2699 
2700 	return sk;
2701 }
2702 
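/* Advance to the next in-namespace socket, moving to the following
 * hash bucket whenever the current chain runs out.
 */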
2703 static struct sock *unix_next_socket(struct seq_file *seq,
2704 				     struct sock *sk,
2705 				     loff_t *pos)
2706 {
2707 	unsigned long bucket;
2708 
2709 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2710 		sk = sk_next(sk);
2711 		if (!sk)
2712 			goto next_bucket;
2713 		if (sock_net(sk) == seq_file_net(seq))
2714 			return sk;
2715 	}
2716 
2717 	do {
2718 		sk = unix_from_bucket(seq, pos);
2719 		if (sk)
2720 			return sk;
2721 
2722 next_bucket:
2723 		bucket = get_bucket(*pos) + 1;
2724 		*pos = set_bucket_offset(bucket, 1);
2725 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2726 
2727 	return NULL;
2728 }
2729 
2730 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2731 	__acquires(unix_table_lock)
2732 {
2733 	spin_lock(&unix_table_lock);
2734 
2735 	if (!*pos)
2736 		return SEQ_START_TOKEN;
2737 
2738 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2739 		return NULL;
2740 
2741 	return unix_next_socket(seq, NULL, pos);
2742 }
2743 
2744 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2745 {
2746 	++*pos;
2747 	return unix_next_socket(seq, v, pos);
2748 }
2749 
2750 static void unix_seq_stop(struct seq_file *seq, void *v)
2751 	__releases(unix_table_lock)
2752 {
2753 	spin_unlock(&unix_table_lock);
2754 }
2755 
2756 static int unix_seq_show(struct seq_file *seq, void *v)
2757 {
2759 	if (v == SEQ_START_TOKEN)
2760 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2761 			 "Inode Path\n");
2762 	else {
2763 		struct sock *s = v;
2764 		struct unix_sock *u = unix_sk(s);
2765 		unix_state_lock(s);
2766 
2767 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2768 			s,
2769 			refcount_read(&s->sk_refcnt),
2770 			0,
2771 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2772 			s->sk_type,
2773 			s->sk_socket ?
2774 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2775 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2776 			sock_i_ino(s));
2777 
2778 		if (u->addr) {	/* under unix_table_lock here */
2779 			int i, len;
2780 			seq_putc(seq, ' ');
2781 
2782 			i = 0;
2783 			len = u->addr->len - sizeof(short);
2784 			if (!UNIX_ABSTRACT(s))
2785 				len--;
2786 			else {
2787 				seq_putc(seq, '@');
2788 				i++;
2789 			}
2790 			for ( ; i < len; i++)
2791 				seq_putc(seq, u->addr->name->sun_path[i] ?:
2792 					 '@');
2793 		}
2794 		unix_state_unlock(s);
2795 		seq_putc(seq, '\n');
2796 	}
2797 
2798 	return 0;
2799 }
2800 
2801 static const struct seq_operations unix_seq_ops = {
2802 	.start  = unix_seq_start,
2803 	.next   = unix_seq_next,
2804 	.stop   = unix_seq_stop,
2805 	.show   = unix_seq_show,
2806 };
2807 #endif
2808 
2809 static const struct net_proto_family unix_family_ops = {
2810 	.family = PF_UNIX,
2811 	.create = unix_create,
2812 	.owner	= THIS_MODULE,
2813 };
2814 
2816 static int __net_init unix_net_init(struct net *net)
2817 {
2818 	int error = -ENOMEM;
2819 
2820 	net->unx.sysctl_max_dgram_qlen = 10;
2821 	if (unix_sysctl_register(net))
2822 		goto out;
2823 
2824 #ifdef CONFIG_PROC_FS
2825 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
2826 			sizeof(struct seq_net_private))) {
2827 		unix_sysctl_unregister(net);
2828 		goto out;
2829 	}
2830 #endif
2831 	error = 0;
2832 out:
2833 	return error;
2834 }
2835 
2836 static void __net_exit unix_net_exit(struct net *net)
2837 {
2838 	unix_sysctl_unregister(net);
2839 	remove_proc_entry("unix", net->proc_net);
2840 }
2841 
2842 static struct pernet_operations unix_net_ops = {
2843 	.init = unix_net_init,
2844 	.exit = unix_net_exit,
2845 };
2846 
2847 static int __init af_unix_init(void)
2848 {
2849 	int rc;
2850 
2851 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2852 
2853 	rc = proto_register(&unix_proto, 1);
2854 	if (rc != 0) {
2855 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2856 		goto out;
2857 	}
2858 
2859 	sock_register(&unix_family_ops);
2860 	register_pernet_subsys(&unix_net_ops);
2861 out:
2862 	return rc;
2863 }
2864 
2865 static void __exit af_unix_exit(void)
2866 {
2867 	sock_unregister(PF_UNIX);
2868 	proto_unregister(&unix_proto);
2869 	unregister_pernet_subsys(&unix_net_ops);
2870 }
2871 
2872 /* Earlier than device_initcall() so that other drivers invoking
2873  * request_module() don't end up in a loop when modprobe tries
2874  * to use a UNIX socket. But later than subsys_initcall() because
2875  * we depend on stuff initialised there.
2876  */
2876 fs_initcall(af_unix_init);
2877 module_exit(af_unix_exit);
2878 
2879 MODULE_LICENSE("GPL");
2880 MODULE_ALIAS_NETPROTO(PF_UNIX);
2881