// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *	Linus Torvalds	:	Assorted bug cures.
 *	Niibe Yutaka	:	async I/O support.
 *	Carsten Paeth	:	PF_UNIX check, address fixes.
 *	Alan Cox	:	Limit size of allocated blocks.
 *	Alan Cox	:	Fixed the stupid socketpair bug.
 *	Alan Cox	:	BSD compatibility fine tuning.
 *	Alan Cox	:	Fixed a bug in connect when interrupted.
 *	Alan Cox	:	Sorted out a proper draft version of
 *				file descriptor passing hacked up from
 *				Mike Shaver's work.
 *	Marty Leisner	:	Fixes to fd passing.
 *	Nick Nevin	:	recvmsg bugfix.
 *	Alan Cox	:	Started a proper garbage collector.
 *	Heiko Eißfeldt	:	Missing verify_area check.
 *	Alan Cox	:	Started POSIXisms.
 *	Andreas Schwab	:	Replaced inode by dentry for proper
 *				reference counting.
 *	Kirk Petersen	:	Made this a module.
 *	Christoph Rohland :	Elegant non-blocking accept/connect algorithm.
 *				Lots of bug fixes.
 *	Alexey Kuznetsov :	Repaired (I hope) bugs introduced
 *				by the above two patches.
 *	Andrea Arcangeli :	If possible we block in connect(2)
 *				if the max backlog of the listen socket
 *				has been reached. This won't break
 *				old apps and it will avoid huge amounts
 *				of hashed socks (for unix_gc()
 *				performance reasons).
 *				Security fix that limits the max
 *				number of socks to 2*max_files and
 *				the number of skbs queueable in the
 *				dgram receiver.
 *	Artur Skawina	:	Hash function optimizations.
 *	Alexey Kuznetsov :	Full scale SMP. Lots of bugs were introduced 8)
 *	Malcolm Beattie	:	Set peercred for socketpair.
 *	Michal Ostrowski :	Module initialization cleanup.
 *	Arnaldo C. Melo	:	Removed MOD_{INC,DEC}_USE_COUNT;
 *				the core infrastructure is doing that
 *				for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug??).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS-based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
 */
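
/* Illustrative userspace sketch (not part of this file's build): binding an
 * abstract-namespace socket as described above. The name starts with a NUL
 * byte and is length-delimited, so addrlen must count the name bytes rather
 * than rely on string termination. The name "\0example" is an arbitrary
 * assumption for the sketch; error handling is omitted.
 *
 *	struct sockaddr_un sun;
 *	socklen_t len;
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_UNIX;
 *	sun.sun_path[0] = '\0';				// abstract namespace marker
 *	memcpy(sun.sun_path + 1, "example", 7);		// name bytes, no trailing NUL
 *	len = offsetof(struct sockaddr_un, sun_path) + 1 + 7;
 *	bind(fd, (struct sockaddr *)&sun, len);
 */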

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 * the hash table is protected by per-bucket spinlocks.
 * each socket's state is protected by its own spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
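/* Three-way pointer comparison for the lock_cmp_fn hooks below: evaluates
 * to -1, 0, or 1 as l is below, equal to, or above r.
 */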
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 * 1. a is TCP_LISTEN.
		 * 2. b is not a.
		 * 3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., b's state can be any possible
		 * value due to a concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as a lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (a->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric. */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
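
/* unix_abstract_hash() above offsets its result past UNIX_HASH_MOD, so
 * abstract sockets occupy the upper half of the per-netns hash table and
 * never share a bucket with unbound or pathname sockets, whose hashes stay
 * within [0, UNIX_HASH_MOD]. This is also why bsd_socket_buckets[] above
 * only needs UNIX_HASH_SIZE / 2 entries.
 */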

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 * Check unix socket name:
 *	- it must not be zero length.
 *	- if it starts with a non-zero byte, it must be NUL-terminated (an FS object)
 *	- if it starts with a zero byte, it is an abstract name.
 */

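/* For illustration, the three shapes a valid bind() address can take
 * (lengths are what userspace passes as addrlen; names are made up):
 *
 *	autobind: addrlen == offsetof(struct sockaddr_un, sun_path)
 *	pathname: sun_path = "/run/example.sock", NUL-terminated
 *	abstract: sun_path = "\0example", length-delimited, no NUL
 */
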
static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off-by-one error but it is a bit more
	 * subtle. 108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist. However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage, which has a bigger buffer
	 * than 108. Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
	 * cause a panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}
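
/* Worked example (hypothetical values): for a bind() to "/tmp/x" with
 * addr_len == offsetof(struct sockaddr_un, sun_path) + 6, i.e. the path
 * bytes without a trailing NUL, the store above writes the terminator at
 * sun_path[6], and the function returns offset + strlen("/tmp/x") + 1:
 * the same address length, normalized to include the NUL.
 */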

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */

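/* Sketch of the relay for one client C connected to a busy server S
 * (names illustrative):
 *
 *	C: sendmsg() finds S's receive queue full
 *	C: unix_dgram_peer_wake_connect(C, S) hooks C->peer_wake onto
 *	   S->peer_wait, then C sleeps on its own sk_sleep() queue
 *	S: recvmsg() dequeues a datagram and wakes S->peer_wait
 *	   -> unix_dgram_peer_wake_relay() (below) unhooks C and wakes
 *	      C's sk_sleep() queue, so C retries the write
 */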
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its queue is full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

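/* Writability heuristic: a non-listening socket is writable while no more
 * than a quarter of sk_sndbuf is consumed by in-flight skbs, since
 * (wmem_alloc << 2) <= sndbuf is wmem_alloc <= sndbuf / 4.
 */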
static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows
 * flow control based only on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal the error. Messages are lost. Do not do this
		 * when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least. */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What does the above comment talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();	/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void update_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk);
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	update_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_stream_connect,
	.socketpair = unix_socketpair,
	.accept = unix_accept,
	.getname = unix_getname,
	.poll = unix_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = unix_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_stream_sendmsg,
	.recvmsg = unix_stream_recvmsg,
	.read_skb = unix_stream_read_skb,
	.mmap = sock_no_mmap,
	.splice_read = unix_stream_splice_read,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_dgram_connect,
	.socketpair = unix_socketpair,
	.accept = sock_no_accept,
	.getname = unix_getname,
	.poll = unix_dgram_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = sock_no_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_dgram_sendmsg,
	.read_skb = unix_read_skb,
	.recvmsg = unix_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_stream_connect,
	.socketpair = unix_socketpair,
	.accept = unix_accept,
	.getname = unix_getname,
	.poll = unix_dgram_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = unix_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_seqpacket_sendmsg,
	.recvmsg = unix_seqpacket_recvmsg,
	.mmap = sock_no_mmap,
	.set_peek_off = sk_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name = "UNIX",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
	.close = unix_close,
	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name = "UNIX-STREAM",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
	.close = unix_close,
	.unhash = unix_unhash,
	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash = unix_unbound_hash(sk);
	sk->sk_allocation = GFP_KERNEL_ACCOUNT;
	sk->sk_write_space = unix_write_space;
	sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct = unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
	/*
	 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
	 * nothing uses it.
	 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
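	/* Autobound names are abstract: a leading NUL (from kzalloc() above)
	 * followed by five hex digits, hence addr->len of sun_path offset + 6.
	 * The retry loop below scans the whole 2^20 name space at most once
	 * before giving up with -ENOSPC.
	 */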
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for the last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

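/* Take both peers' state locks in ascending address order (tolerating a
 * NULL sk2 for the AF_UNSPEC disconnect path) so that concurrent
 * unix_dgram_connect() calls cannot ABBA-deadlock; this is the ordering
 * unix_state_lock_cmp_fn() checks under lockdep.
 */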
unix_state_double_lock(struct sock * sk1,struct sock * sk2)1392 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1393 {
1394 if (unlikely(sk1 == sk2) || !sk2) {
1395 unix_state_lock(sk1);
1396 return;
1397 }
1398
1399 if (sk1 > sk2)
1400 swap(sk1, sk2);
1401
1402 unix_state_lock(sk1);
1403 unix_state_lock(sk2);
1404 }
1405
unix_state_double_unlock(struct sock * sk1,struct sock * sk2)1406 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1407 {
1408 if (unlikely(sk1 == sk2) || !sk2) {
1409 unix_state_unlock(sk1);
1410 return;
1411 }
1412 unix_state_unlock(sk1);
1413 unix_state_unlock(sk2);
1414 }
1415
unix_dgram_connect(struct socket * sock,struct sockaddr * addr,int alen,int flags)1416 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1417 int alen, int flags)
1418 {
1419 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1420 struct sock *sk = sock->sk;
1421 struct sock *other;
1422 int err;
1423
1424 err = -EINVAL;
1425 if (alen < offsetofend(struct sockaddr, sa_family))
1426 goto out;
1427
1428 if (addr->sa_family != AF_UNSPEC) {
1429 err = unix_validate_addr(sunaddr, alen);
1430 if (err)
1431 goto out;
1432
1433 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1434 if (err)
1435 goto out;
1436
1437 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1438 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1439 !READ_ONCE(unix_sk(sk)->addr)) {
1440 err = unix_autobind(sk);
1441 if (err)
1442 goto out;
1443 }
1444
1445 restart:
1446 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1447 if (IS_ERR(other)) {
1448 err = PTR_ERR(other);
1449 goto out;
1450 }
1451
1452 unix_state_double_lock(sk, other);
1453
1454 /* Apparently VFS overslept socket death. Retry. */
1455 if (sock_flag(other, SOCK_DEAD)) {
1456 unix_state_double_unlock(sk, other);
1457 sock_put(other);
1458 goto restart;
1459 }
1460
1461 err = -EPERM;
1462 if (!unix_may_send(sk, other))
1463 goto out_unlock;
1464
1465 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1466 if (err)
1467 goto out_unlock;
1468
1469 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1470 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1471 } else {
1472 /*
1473 * 1003.1g breaking connected state with AF_UNSPEC
1474 */
1475 other = NULL;
1476 unix_state_double_lock(sk, other);
1477 }
1478
1479 /*
1480 * If it was connected, reconnect.
1481 */
1482 if (unix_peer(sk)) {
1483 struct sock *old_peer = unix_peer(sk);
1484
1485 unix_peer(sk) = other;
1486 if (!other)
1487 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1488 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1489
1490 unix_state_double_unlock(sk, other);
1491
1492 if (other != old_peer) {
1493 unix_dgram_disconnected(sk, old_peer);
1494
1495 unix_state_lock(old_peer);
1496 if (!unix_peer(old_peer))
1497 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1498 unix_state_unlock(old_peer);
1499 }
1500
1501 sock_put(old_peer);
1502 } else {
1503 unix_peer(sk) = other;
1504 unix_state_double_unlock(sk, other);
1505 }
1506
1507 return 0;
1508
1509 out_unlock:
1510 unix_state_double_unlock(sk, other);
1511 sock_put(other);
1512 out:
1513 return err;
1514 }
1515
unix_wait_for_peer(struct sock * other,long timeo)1516 static long unix_wait_for_peer(struct sock *other, long timeo)
1517 __releases(&unix_sk(other)->lock)
1518 {
1519 struct unix_sock *u = unix_sk(other);
1520 int sched;
1521 DEFINE_WAIT(wait);
1522
1523 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1524
1525 sched = !sock_flag(other, SOCK_DEAD) &&
1526 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1527 unix_recvq_full_lockless(other);
1528
1529 unix_state_unlock(other);
1530
1531 if (sched)
1532 timeo = schedule_timeout(timeo);
1533
1534 finish_wait(&u->peer_wait, &wait);
1535 return timeo;
1536 }
1537
unix_stream_connect(struct socket * sock,struct sockaddr * uaddr,int addr_len,int flags)1538 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1539 int addr_len, int flags)
1540 {
1541 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1542 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1543 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1544 struct net *net = sock_net(sk);
1545 struct sk_buff *skb = NULL;
1546 unsigned char state;
1547 long timeo;
1548 int err;
1549
1550 err = unix_validate_addr(sunaddr, addr_len);
1551 if (err)
1552 goto out;
1553
1554 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1555 if (err)
1556 goto out;
1557
1558 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1559 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1560 !READ_ONCE(u->addr)) {
1561 err = unix_autobind(sk);
1562 if (err)
1563 goto out;
1564 }
1565
1566 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1567
1568 /* First of all allocate resources.
1569 If we will make it after state is locked,
1570 we will have to recheck all again in any case.
1571 */
1572
1573 /* create new sock for complete connection */
1574 newsk = unix_create1(net, NULL, 0, sock->type);
1575 if (IS_ERR(newsk)) {
1576 err = PTR_ERR(newsk);
1577 newsk = NULL;
1578 goto out;
1579 }
1580
1581 err = -ENOMEM;
1582
1583 /* Allocate skb for sending to listening sock */
1584 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1585 if (skb == NULL)
1586 goto out;
1587
1588 restart:
1589 /* Find listening sock. */
1590 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1591 if (IS_ERR(other)) {
1592 err = PTR_ERR(other);
1593 other = NULL;
1594 goto out;
1595 }
1596
1597 unix_state_lock(other);
1598
1599 /* Apparently VFS overslept socket death. Retry. */
1600 if (sock_flag(other, SOCK_DEAD)) {
1601 unix_state_unlock(other);
1602 sock_put(other);
1603 goto restart;
1604 }
1605
1606 err = -ECONNREFUSED;
1607 if (other->sk_state != TCP_LISTEN)
1608 goto out_unlock;
1609 if (other->sk_shutdown & RCV_SHUTDOWN)
1610 goto out_unlock;
1611
1612 if (unix_recvq_full_lockless(other)) {
1613 err = -EAGAIN;
1614 if (!timeo)
1615 goto out_unlock;
1616
1617 timeo = unix_wait_for_peer(other, timeo);
1618
1619 err = sock_intr_errno(timeo);
1620 if (signal_pending(current))
1621 goto out;
1622 sock_put(other);
1623 goto restart;
1624 }
1625
1626 /* self connect and simultaneous connect are eliminated
1627 * by rejecting TCP_LISTEN socket to avoid deadlock.
1628 */
1629 state = READ_ONCE(sk->sk_state);
1630 if (unlikely(state != TCP_CLOSE)) {
1631 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1632 goto out_unlock;
1633 }
1634
1635 unix_state_lock(sk);
1636
1637 if (unlikely(sk->sk_state != TCP_CLOSE)) {
1638 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1639 unix_state_unlock(sk);
1640 goto out_unlock;
1641 }
1642
1643 err = security_unix_stream_connect(sk, other, newsk);
1644 if (err) {
1645 unix_state_unlock(sk);
1646 goto out_unlock;
1647 }
1648
1649 /* The way is open! Fastly set all the necessary fields... */
1650
1651 sock_hold(sk);
1652 unix_peer(newsk) = sk;
1653 newsk->sk_state = TCP_ESTABLISHED;
1654 newsk->sk_type = sk->sk_type;
1655 init_peercred(newsk);
1656 newu = unix_sk(newsk);
1657 newu->listener = other;
1658 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1659 otheru = unix_sk(other);
1660
1661 /* copy address information from listening to new sock
1662 *
1663 * The contents of *(otheru->addr) and otheru->path
1664 * are seen fully set up here, since we have found
1665 * otheru in hash under its lock. Insertion into the
1666 * hash chain we'd found it in had been done in an
1667 * earlier critical area protected by the chain's lock,
1668 * the same one where we'd set *(otheru->addr) contents,
1669 * as well as otheru->path and otheru->addr itself.
1670 *
1671 * Using smp_store_release() here to set newu->addr
1672 * is enough to make those stores, as well as stores
1673 * to newu->path visible to anyone who gets newu->addr
1674 * by smp_load_acquire(). IOW, the same warranties
1675 * as for unix_sock instances bound in unix_bind() or
1676 * in unix_autobind().
1677 */
1678 if (otheru->path.dentry) {
1679 path_get(&otheru->path);
1680 newu->path = otheru->path;
1681 }
1682 refcount_inc(&otheru->addr->refcnt);
1683 smp_store_release(&newu->addr, otheru->addr);
1684
1685 /* Set credentials */
1686 copy_peercred(sk, other);
1687
1688 sock->state = SS_CONNECTED;
1689 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1690 sock_hold(newsk);
1691
1692 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1693 unix_peer(sk) = newsk;
1694
1695 unix_state_unlock(sk);
1696
1697 /* take ten and send info to listening sock */
1698 spin_lock(&other->sk_receive_queue.lock);
1699 __skb_queue_tail(&other->sk_receive_queue, skb);
1700 spin_unlock(&other->sk_receive_queue.lock);
1701 unix_state_unlock(other);
1702 other->sk_data_ready(other);
1703 sock_put(other);
1704 return 0;
1705
1706 out_unlock:
1707 if (other)
1708 unix_state_unlock(other);
1709
1710 out:
1711 kfree_skb(skb);
1712 if (newsk)
1713 unix_release_sock(newsk, 0);
1714 if (other)
1715 sock_put(other);
1716 return err;
1717 }
1718
unix_socketpair(struct socket * socka,struct socket * sockb)1719 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1720 {
1721 struct sock *ska = socka->sk, *skb = sockb->sk;
1722
1723 /* Join our sockets back to back */
1724 sock_hold(ska);
1725 sock_hold(skb);
1726 unix_peer(ska) = skb;
1727 unix_peer(skb) = ska;
1728 init_peercred(ska);
1729 init_peercred(skb);
1730
1731 ska->sk_state = TCP_ESTABLISHED;
1732 skb->sk_state = TCP_ESTABLISHED;
1733 socka->state = SS_CONNECTED;
1734 sockb->state = SS_CONNECTED;
1735 return 0;
1736 }
1737
unix_sock_inherit_flags(const struct socket * old,struct socket * new)1738 static void unix_sock_inherit_flags(const struct socket *old,
1739 struct socket *new)
1740 {
1741 if (test_bit(SOCK_PASSCRED, &old->flags))
1742 set_bit(SOCK_PASSCRED, &new->flags);
1743 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1744 set_bit(SOCK_PASSPIDFD, &new->flags);
1745 if (test_bit(SOCK_PASSSEC, &old->flags))
1746 set_bit(SOCK_PASSSEC, &new->flags);
1747 }
1748
unix_accept(struct socket * sock,struct socket * newsock,struct proto_accept_arg * arg)1749 static int unix_accept(struct socket *sock, struct socket *newsock,
1750 struct proto_accept_arg *arg)
1751 {
1752 struct sock *sk = sock->sk;
1753 struct sk_buff *skb;
1754 struct sock *tsk;
1755
1756 arg->err = -EOPNOTSUPP;
1757 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1758 goto out;
1759
1760 arg->err = -EINVAL;
1761 if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1762 goto out;
1763
1764 /* If socket state is TCP_LISTEN it cannot change (for now...),
1765 * so that no locks are necessary.
1766 */
1767
1768 skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1769 &arg->err);
1770 if (!skb) {
1771 /* This means receive shutdown. */
1772 if (arg->err == 0)
1773 arg->err = -EINVAL;
1774 goto out;
1775 }
1776
1777 tsk = skb->sk;
1778 skb_free_datagram(sk, skb);
1779 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1780
1781 /* attach accepted sock to socket */
1782 unix_state_lock(tsk);
1783 unix_update_edges(unix_sk(tsk));
1784 newsock->state = SS_CONNECTED;
1785 unix_sock_inherit_flags(sock, newsock);
1786 sock_graft(tsk, newsock);
1787 unix_state_unlock(tsk);
1788 return 0;
1789
1790 out:
1791 return arg->err;
1792 }
1793
1794
unix_getname(struct socket * sock,struct sockaddr * uaddr,int peer)1795 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1796 {
1797 struct sock *sk = sock->sk;
1798 struct unix_address *addr;
1799 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1800 int err = 0;
1801
1802 if (peer) {
1803 sk = unix_peer_get(sk);
1804
1805 err = -ENOTCONN;
1806 if (!sk)
1807 goto out;
1808 err = 0;
1809 } else {
1810 sock_hold(sk);
1811 }
1812
1813 addr = smp_load_acquire(&unix_sk(sk)->addr);
1814 if (!addr) {
1815 sunaddr->sun_family = AF_UNIX;
1816 sunaddr->sun_path[0] = 0;
1817 err = offsetof(struct sockaddr_un, sun_path);
1818 } else {
1819 err = addr->len;
1820 memcpy(sunaddr, addr->name, addr->len);
1821
1822 if (peer)
1823 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1824 CGROUP_UNIX_GETPEERNAME);
1825 else
1826 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1827 CGROUP_UNIX_GETSOCKNAME);
1828 }
1829 sock_put(sk);
1830 out:
1831 return err;
1832 }
1833
1834 /* The "user->unix_inflight" variable is protected by the garbage
1835 * collection lock, and we just read it locklessly here. If you go
1836 * over the limit, there might be a tiny race in actually noticing
1837 * it across threads. Tough.
1838 */
too_many_unix_fds(struct task_struct * p)1839 static inline bool too_many_unix_fds(struct task_struct *p)
1840 {
1841 struct user_struct *user = current_user();
1842
1843 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1844 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1845 return false;
1846 }
1847
1848 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1849 {
1850 if (too_many_unix_fds(current))
1851 return -ETOOMANYREFS;
1852
1853 UNIXCB(skb).fp = scm->fp;
1854 scm->fp = NULL;
1855
1856 if (unix_prepare_fpl(UNIXCB(skb).fp))
1857 return -ENOMEM;
1858
1859 return 0;
1860 }
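/*
 * Illustrative userspace sketch (not part of this file): the sender-side
 * SCM_RIGHTS message that ends up in unix_attach_fds() above.  Once the
 * sending user has too many descriptors in flight, sendmsg() fails with
 * ETOOMANYREFS.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int send_fd(int sock, int fd)
 *	{
 *		union {
 *			char buf[CMSG_SPACE(sizeof(int))];
 *			struct cmsghdr align;
 *		} u;
 *		char data = 'x';
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 *		return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
 *	}
 */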
1861
1862 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1863 {
1864 scm->fp = UNIXCB(skb).fp;
1865 UNIXCB(skb).fp = NULL;
1866
1867 unix_destroy_fpl(scm->fp);
1868 }
1869
1870 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1871 {
1872 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1873 }
1874
1875 static void unix_destruct_scm(struct sk_buff *skb)
1876 {
1877 struct scm_cookie scm;
1878
1879 memset(&scm, 0, sizeof(scm));
1880 scm.pid = UNIXCB(skb).pid;
1881 if (UNIXCB(skb).fp)
1882 unix_detach_fds(&scm, skb);
1883
1884 /* Alas, it calls VFS */
1885 /* So fscking what? fput() has been SMP-safe since last summer */
1886 scm_destroy(&scm);
1887 sock_wfree(skb);
1888 }
1889
1890 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1891 {
1892 int err = 0;
1893
1894 UNIXCB(skb).pid = get_pid(scm->pid);
1895 UNIXCB(skb).uid = scm->creds.uid;
1896 UNIXCB(skb).gid = scm->creds.gid;
1897 UNIXCB(skb).fp = NULL;
1898 unix_get_secdata(scm, skb);
1899 if (scm->fp && send_fds)
1900 err = unix_attach_fds(scm, skb);
1901
1902 skb->destructor = unix_destruct_scm;
1903 return err;
1904 }
1905
1906 static bool unix_passcred_enabled(const struct socket *sock,
1907 const struct sock *other)
1908 {
1909 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1910 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1911 !other->sk_socket ||
1912 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1913 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1914 }
1915
1916 /*
1917 * Some apps rely on write() giving SCM_CREDENTIALS.
1918 * We include credentials if the source or destination socket
1919 * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1920 */
1921 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1922 const struct sock *other)
1923 {
1924 if (UNIXCB(skb).pid)
1925 return;
1926 if (unix_passcred_enabled(sock, other)) {
1927 UNIXCB(skb).pid = get_pid(task_tgid(current));
1928 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1929 }
1930 }
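/*
 * Illustrative userspace sketch (not part of this file): enabling
 * SO_PASSCRED on either end is one way to trigger the branch above, so
 * plain write()s from the peer arrive with an SCM_CREDENTIALS cmsg.
 *
 *	int one = 1;
 *
 *	setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	// recvmsg() may now return a cmsg of type SCM_CREDENTIALS
 *	// carrying the sender's pid/uid/gid (struct ucred).
 */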
1931
1932 static bool unix_skb_scm_eq(struct sk_buff *skb,
1933 struct scm_cookie *scm)
1934 {
1935 return UNIXCB(skb).pid == scm->pid &&
1936 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1937 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1938 unix_secdata_eq(scm, skb);
1939 }
1940
1941 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1942 {
1943 struct scm_fp_list *fp = UNIXCB(skb).fp;
1944 struct unix_sock *u = unix_sk(sk);
1945
1946 if (unlikely(fp && fp->count)) {
1947 atomic_add(fp->count, &u->scm_stat.nr_fds);
1948 unix_add_edges(fp, u);
1949 }
1950 }
1951
1952 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1953 {
1954 struct scm_fp_list *fp = UNIXCB(skb).fp;
1955 struct unix_sock *u = unix_sk(sk);
1956
1957 if (unlikely(fp && fp->count)) {
1958 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1959 unix_del_edges(fp);
1960 }
1961 }
1962
1963 /*
1964 * Send AF_UNIX data.
1965 */
1966
1967 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1968 size_t len)
1969 {
1970 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1971 struct sock *sk = sock->sk, *other = NULL;
1972 struct unix_sock *u = unix_sk(sk);
1973 struct scm_cookie scm;
1974 struct sk_buff *skb;
1975 int data_len = 0;
1976 int sk_locked;
1977 long timeo;
1978 int err;
1979
1980 err = scm_send(sock, msg, &scm, false);
1981 if (err < 0)
1982 return err;
1983
1984 wait_for_unix_gc(scm.fp);
1985
1986 err = -EOPNOTSUPP;
1987 if (msg->msg_flags&MSG_OOB)
1988 goto out;
1989
1990 if (msg->msg_namelen) {
1991 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1992 if (err)
1993 goto out;
1994
1995 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1996 msg->msg_name,
1997 &msg->msg_namelen,
1998 NULL);
1999 if (err)
2000 goto out;
2001 } else {
2002 sunaddr = NULL;
2003 err = -ENOTCONN;
2004 other = unix_peer_get(sk);
2005 if (!other)
2006 goto out;
2007 }
2008
2009 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2010 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
2011 !READ_ONCE(u->addr)) {
2012 err = unix_autobind(sk);
2013 if (err)
2014 goto out;
2015 }
2016
2017 err = -EMSGSIZE;
2018 if (len > READ_ONCE(sk->sk_sndbuf) - 32)
2019 goto out;
2020
2021 if (len > SKB_MAX_ALLOC) {
2022 data_len = min_t(size_t,
2023 len - SKB_MAX_ALLOC,
2024 MAX_SKB_FRAGS * PAGE_SIZE);
2025 data_len = PAGE_ALIGN(data_len);
2026
2027 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2028 }
2029
2030 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2031 msg->msg_flags & MSG_DONTWAIT, &err,
2032 PAGE_ALLOC_COSTLY_ORDER);
2033 if (skb == NULL)
2034 goto out;
2035
2036 err = unix_scm_to_skb(&scm, skb, true);
2037 if (err < 0)
2038 goto out_free;
2039
2040 skb_put(skb, len - data_len);
2041 skb->data_len = data_len;
2042 skb->len = len;
2043 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2044 if (err)
2045 goto out_free;
2046
2047 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2048
2049 restart:
2050 if (!other) {
2051 err = -ECONNRESET;
2052 if (sunaddr == NULL)
2053 goto out_free;
2054
2055 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2056 sk->sk_type);
2057 if (IS_ERR(other)) {
2058 err = PTR_ERR(other);
2059 other = NULL;
2060 goto out_free;
2061 }
2062 }
2063
2064 if (sk_filter(other, skb) < 0) {
2065 /* Toss the packet but do not return any error to the sender */
2066 err = len;
2067 goto out_free;
2068 }
2069
2070 sk_locked = 0;
2071 unix_state_lock(other);
2072 restart_locked:
2073 err = -EPERM;
2074 if (!unix_may_send(sk, other))
2075 goto out_unlock;
2076
2077 if (unlikely(sock_flag(other, SOCK_DEAD))) {
2078 /*
2079 * Check with 1003.1g - what should a
2080 * datagram error return here?
2081 */
2082 unix_state_unlock(other);
2083 sock_put(other);
2084
2085 if (!sk_locked)
2086 unix_state_lock(sk);
2087
2088 err = 0;
2089 if (sk->sk_type == SOCK_SEQPACKET) {
2090 /* We are here only when racing with unix_release_sock(),
2091 * which is clearing @other. Unlike SOCK_DGRAM, never
2092 * change the state to TCP_CLOSE.
2093 */
2094 unix_state_unlock(sk);
2095 err = -EPIPE;
2096 } else if (unix_peer(sk) == other) {
2097 unix_peer(sk) = NULL;
2098 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2099
2100 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2101 unix_state_unlock(sk);
2102
2103 unix_dgram_disconnected(sk, other);
2104 sock_put(other);
2105 err = -ECONNREFUSED;
2106 } else {
2107 unix_state_unlock(sk);
2108 }
2109
2110 other = NULL;
2111 if (err)
2112 goto out_free;
2113 goto restart;
2114 }
2115
2116 err = -EPIPE;
2117 if (other->sk_shutdown & RCV_SHUTDOWN)
2118 goto out_unlock;
2119
2120 if (sk->sk_type != SOCK_SEQPACKET) {
2121 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2122 if (err)
2123 goto out_unlock;
2124 }
2125
2126 /* other == sk && unix_peer(other) != sk if
2127 * - unix_peer(sk) == NULL, destination address bound to sk
2128 * - unix_peer(sk) == sk by time of get but disconnected before lock
2129 */
2130 if (other != sk &&
2131 unlikely(unix_peer(other) != sk &&
2132 unix_recvq_full_lockless(other))) {
2133 if (timeo) {
2134 timeo = unix_wait_for_peer(other, timeo);
2135
2136 err = sock_intr_errno(timeo);
2137 if (signal_pending(current))
2138 goto out_free;
2139
2140 goto restart;
2141 }
2142
2143 if (!sk_locked) {
2144 unix_state_unlock(other);
2145 unix_state_double_lock(sk, other);
2146 }
2147
2148 if (unix_peer(sk) != other ||
2149 unix_dgram_peer_wake_me(sk, other)) {
2150 err = -EAGAIN;
2151 sk_locked = 1;
2152 goto out_unlock;
2153 }
2154
2155 if (!sk_locked) {
2156 sk_locked = 1;
2157 goto restart_locked;
2158 }
2159 }
2160
2161 if (unlikely(sk_locked))
2162 unix_state_unlock(sk);
2163
2164 if (sock_flag(other, SOCK_RCVTSTAMP))
2165 __net_timestamp(skb);
2166 maybe_add_creds(skb, sock, other);
2167 scm_stat_add(other, skb);
2168 skb_queue_tail(&other->sk_receive_queue, skb);
2169 unix_state_unlock(other);
2170 other->sk_data_ready(other);
2171 sock_put(other);
2172 scm_destroy(&scm);
2173 return len;
2174
2175 out_unlock:
2176 if (sk_locked)
2177 unix_state_unlock(sk);
2178 unix_state_unlock(other);
2179 out_free:
2180 kfree_skb(skb);
2181 out:
2182 if (other)
2183 sock_put(other);
2184 scm_destroy(&scm);
2185 return err;
2186 }
2187
2188 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2189 * bytes with a minimum of a full page.
2190 */
2191 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2192
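/*
 * Worked example (illustrative, assuming 4 KiB pages): get_order(32768)
 * is 3, so UNIX_SKB_FRAGS_SZ is 4096 << 3 == 32768.  With page sizes
 * larger than 32 KiB, get_order() returns 0 and the macro rounds the
 * limit up to one full page.
 */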
2193 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2194 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2195 struct scm_cookie *scm, bool fds_sent)
2196 {
2197 struct unix_sock *ousk = unix_sk(other);
2198 struct sk_buff *skb;
2199 int err = 0;
2200
2201 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2202
2203 if (!skb)
2204 return err;
2205
2206 err = unix_scm_to_skb(scm, skb, !fds_sent);
2207 if (err < 0) {
2208 kfree_skb(skb);
2209 return err;
2210 }
2211 skb_put(skb, 1);
2212 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2213
2214 if (err) {
2215 kfree_skb(skb);
2216 return err;
2217 }
2218
2219 unix_state_lock(other);
2220
2221 if (sock_flag(other, SOCK_DEAD) ||
2222 (other->sk_shutdown & RCV_SHUTDOWN)) {
2223 unix_state_unlock(other);
2224 kfree_skb(skb);
2225 return -EPIPE;
2226 }
2227
2228 maybe_add_creds(skb, sock, other);
2229 skb_get(skb);
2230
2231 scm_stat_add(other, skb);
2232
2233 spin_lock(&other->sk_receive_queue.lock);
2234 if (ousk->oob_skb)
2235 consume_skb(ousk->oob_skb);
2236 WRITE_ONCE(ousk->oob_skb, skb);
2237 __skb_queue_tail(&other->sk_receive_queue, skb);
2238 spin_unlock(&other->sk_receive_queue.lock);
2239
2240 sk_send_sigurg(other);
2241 unix_state_unlock(other);
2242 other->sk_data_ready(other);
2243
2244 return err;
2245 }
2246 #endif
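/*
 * Illustrative userspace sketch (not part of this file, and only
 * meaningful with CONFIG_AF_UNIX_OOB): as with TCP urgent data, the last
 * byte of an MSG_OOB send becomes the out-of-band byte queued above.
 *
 *	send(sock, "abc", 3, MSG_OOB);	// "ab" inline, 'c' out-of-band
 *
 *	char c;
 *	recv(peer, &c, 1, MSG_OOB);	// fetches 'c' ahead of the stream
 */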
2247
2248 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2249 size_t len)
2250 {
2251 struct sock *sk = sock->sk;
2252 struct sock *other = NULL;
2253 int err, size;
2254 struct sk_buff *skb;
2255 int sent = 0;
2256 struct scm_cookie scm;
2257 bool fds_sent = false;
2258 int data_len;
2259
2260 err = scm_send(sock, msg, &scm, false);
2261 if (err < 0)
2262 return err;
2263
2264 wait_for_unix_gc(scm.fp);
2265
2266 err = -EOPNOTSUPP;
2267 if (msg->msg_flags & MSG_OOB) {
2268 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2269 if (len)
2270 len--;
2271 else
2272 #endif
2273 goto out_err;
2274 }
2275
2276 if (msg->msg_namelen) {
2277 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2278 goto out_err;
2279 } else {
2280 err = -ENOTCONN;
2281 other = unix_peer(sk);
2282 if (!other)
2283 goto out_err;
2284 }
2285
2286 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2287 goto pipe_err;
2288
2289 while (sent < len) {
2290 size = len - sent;
2291
2292 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2293 skb = sock_alloc_send_pskb(sk, 0, 0,
2294 msg->msg_flags & MSG_DONTWAIT,
2295 &err, 0);
2296 } else {
2297 /* Keep two messages in the pipe so it schedules better */
2298 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2299
2300 /* allow fallback to order-0 allocations */
2301 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2302
2303 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2304
2305 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2306
2307 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2308 msg->msg_flags & MSG_DONTWAIT, &err,
2309 get_order(UNIX_SKB_FRAGS_SZ));
2310 }
2311 if (!skb)
2312 goto out_err;
2313
2314 /* Only send the fds in the first buffer */
2315 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2316 if (err < 0) {
2317 kfree_skb(skb);
2318 goto out_err;
2319 }
2320 fds_sent = true;
2321
2322 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2323 err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2324 sk->sk_allocation);
2325 if (err < 0) {
2326 kfree_skb(skb);
2327 goto out_err;
2328 }
2329 size = err;
2330 refcount_add(size, &sk->sk_wmem_alloc);
2331 } else {
2332 skb_put(skb, size - data_len);
2333 skb->data_len = data_len;
2334 skb->len = size;
2335 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2336 if (err) {
2337 kfree_skb(skb);
2338 goto out_err;
2339 }
2340 }
2341
2342 unix_state_lock(other);
2343
2344 if (sock_flag(other, SOCK_DEAD) ||
2345 (other->sk_shutdown & RCV_SHUTDOWN))
2346 goto pipe_err_free;
2347
2348 maybe_add_creds(skb, sock, other);
2349 scm_stat_add(other, skb);
2350 skb_queue_tail(&other->sk_receive_queue, skb);
2351 unix_state_unlock(other);
2352 other->sk_data_ready(other);
2353 sent += size;
2354 }
2355
2356 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2357 if (msg->msg_flags & MSG_OOB) {
2358 err = queue_oob(sock, msg, other, &scm, fds_sent);
2359 if (err)
2360 goto out_err;
2361 sent++;
2362 }
2363 #endif
2364
2365 scm_destroy(&scm);
2366
2367 return sent;
2368
2369 pipe_err_free:
2370 unix_state_unlock(other);
2371 kfree_skb(skb);
2372 pipe_err:
2373 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2374 send_sig(SIGPIPE, current, 0);
2375 err = -EPIPE;
2376 out_err:
2377 scm_destroy(&scm);
2378 return sent ? : err;
2379 }
2380
2381 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2382 size_t len)
2383 {
2384 int err;
2385 struct sock *sk = sock->sk;
2386
2387 err = sock_error(sk);
2388 if (err)
2389 return err;
2390
2391 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2392 return -ENOTCONN;
2393
2394 if (msg->msg_namelen)
2395 msg->msg_namelen = 0;
2396
2397 return unix_dgram_sendmsg(sock, msg, len);
2398 }
2399
2400 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2401 size_t size, int flags)
2402 {
2403 struct sock *sk = sock->sk;
2404
2405 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2406 return -ENOTCONN;
2407
2408 return unix_dgram_recvmsg(sock, msg, size, flags);
2409 }
2410
2411 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2412 {
2413 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2414
2415 if (addr) {
2416 msg->msg_namelen = addr->len;
2417 memcpy(msg->msg_name, addr->name, addr->len);
2418 }
2419 }
2420
2421 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2422 int flags)
2423 {
2424 struct scm_cookie scm;
2425 struct socket *sock = sk->sk_socket;
2426 struct unix_sock *u = unix_sk(sk);
2427 struct sk_buff *skb, *last;
2428 long timeo;
2429 int skip;
2430 int err;
2431
2432 err = -EOPNOTSUPP;
2433 if (flags&MSG_OOB)
2434 goto out;
2435
2436 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2437
2438 do {
2439 mutex_lock(&u->iolock);
2440
2441 skip = sk_peek_offset(sk, flags);
2442 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2443 &skip, &err, &last);
2444 if (skb) {
2445 if (!(flags & MSG_PEEK))
2446 scm_stat_del(sk, skb);
2447 break;
2448 }
2449
2450 mutex_unlock(&u->iolock);
2451
2452 if (err != -EAGAIN)
2453 break;
2454 } while (timeo &&
2455 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2456 &err, &timeo, last));
2457
2458 if (!skb) { /* implies iolock unlocked */
2459 unix_state_lock(sk);
2460 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2461 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2462 (sk->sk_shutdown & RCV_SHUTDOWN))
2463 err = 0;
2464 unix_state_unlock(sk);
2465 goto out;
2466 }
2467
2468 if (wq_has_sleeper(&u->peer_wait))
2469 wake_up_interruptible_sync_poll(&u->peer_wait,
2470 EPOLLOUT | EPOLLWRNORM |
2471 EPOLLWRBAND);
2472
2473 if (msg->msg_name) {
2474 unix_copy_addr(msg, skb->sk);
2475
2476 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2477 msg->msg_name,
2478 &msg->msg_namelen);
2479 }
2480
2481 if (size > skb->len - skip)
2482 size = skb->len - skip;
2483 else if (size < skb->len - skip)
2484 msg->msg_flags |= MSG_TRUNC;
2485
2486 err = skb_copy_datagram_msg(skb, skip, msg, size);
2487 if (err)
2488 goto out_free;
2489
2490 if (sock_flag(sk, SOCK_RCVTSTAMP))
2491 __sock_recv_timestamp(msg, sk, skb);
2492
2493 memset(&scm, 0, sizeof(scm));
2494
2495 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2496 unix_set_secdata(&scm, skb);
2497
2498 if (!(flags & MSG_PEEK)) {
2499 if (UNIXCB(skb).fp)
2500 unix_detach_fds(&scm, skb);
2501
2502 sk_peek_offset_bwd(sk, skb->len);
2503 } else {
2504 /* It is questionable: on PEEK we could:
2505 - not return fds - good, but too simple 8)
2506 - return fds, and not return them on read (old strategy,
2507 apparently wrong)
2508 - clone fds (chosen for now; it is the most universal
2509 solution)
2510 
2511 POSIX 1003.1g does not actually define this clearly
2512 at all. POSIX 1003.1g doesn't define a lot of things
2513 clearly, however!
2514 
2515 */
2516
2517 sk_peek_offset_fwd(sk, size);
2518
2519 if (UNIXCB(skb).fp)
2520 unix_peek_fds(&scm, skb);
2521 }
2522 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2523
2524 scm_recv_unix(sock, msg, &scm, flags);
2525
2526 out_free:
2527 skb_free_datagram(sk, skb);
2528 mutex_unlock(&u->iolock);
2529 out:
2530 return err;
2531 }
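/*
 * Illustrative userspace sketch (not part of this file): one consequence
 * of the MSG_TRUNC handling above is that a queued datagram's size can be
 * inspected without consuming it.
 *
 *	// Returns the full datagram length even though nothing is copied.
 *	ssize_t len = recv(sock, NULL, 0, MSG_PEEK | MSG_TRUNC);
 */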
2532
2533 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2534 int flags)
2535 {
2536 struct sock *sk = sock->sk;
2537
2538 #ifdef CONFIG_BPF_SYSCALL
2539 const struct proto *prot = READ_ONCE(sk->sk_prot);
2540
2541 if (prot != &unix_dgram_proto)
2542 return prot->recvmsg(sk, msg, size, flags, NULL);
2543 #endif
2544 return __unix_dgram_recvmsg(sk, msg, size, flags);
2545 }
2546
2547 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2548 {
2549 struct unix_sock *u = unix_sk(sk);
2550 struct sk_buff *skb;
2551 int err;
2552
2553 mutex_lock(&u->iolock);
2554 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2555 mutex_unlock(&u->iolock);
2556 if (!skb)
2557 return err;
2558
2559 return recv_actor(sk, skb);
2560 }
2561
2562 /*
2563 * Sleep until more data has arrived, but check for races.
2564 */
2565 static long unix_stream_data_wait(struct sock *sk, long timeo,
2566 struct sk_buff *last, unsigned int last_len,
2567 bool freezable)
2568 {
2569 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2570 struct sk_buff *tail;
2571 DEFINE_WAIT(wait);
2572
2573 unix_state_lock(sk);
2574
2575 for (;;) {
2576 prepare_to_wait(sk_sleep(sk), &wait, state);
2577
2578 tail = skb_peek_tail(&sk->sk_receive_queue);
2579 if (tail != last ||
2580 (tail && tail->len != last_len) ||
2581 sk->sk_err ||
2582 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2583 signal_pending(current) ||
2584 !timeo)
2585 break;
2586
2587 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2588 unix_state_unlock(sk);
2589 timeo = schedule_timeout(timeo);
2590 unix_state_lock(sk);
2591
2592 if (sock_flag(sk, SOCK_DEAD))
2593 break;
2594
2595 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2596 }
2597
2598 finish_wait(sk_sleep(sk), &wait);
2599 unix_state_unlock(sk);
2600 return timeo;
2601 }
2602
2603 static unsigned int unix_skb_len(const struct sk_buff *skb)
2604 {
2605 return skb->len - UNIXCB(skb).consumed;
2606 }
2607
2608 struct unix_stream_read_state {
2609 int (*recv_actor)(struct sk_buff *, int, int,
2610 struct unix_stream_read_state *);
2611 struct socket *socket;
2612 struct msghdr *msg;
2613 struct pipe_inode_info *pipe;
2614 size_t size;
2615 int flags;
2616 unsigned int splice_flags;
2617 };
2618
2619 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2620 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2621 {
2622 struct socket *sock = state->socket;
2623 struct sock *sk = sock->sk;
2624 struct unix_sock *u = unix_sk(sk);
2625 int chunk = 1;
2626 struct sk_buff *oob_skb;
2627
2628 mutex_lock(&u->iolock);
2629 unix_state_lock(sk);
2630 spin_lock(&sk->sk_receive_queue.lock);
2631
2632 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2633 spin_unlock(&sk->sk_receive_queue.lock);
2634 unix_state_unlock(sk);
2635 mutex_unlock(&u->iolock);
2636 return -EINVAL;
2637 }
2638
2639 oob_skb = u->oob_skb;
2640
2641 if (!(state->flags & MSG_PEEK))
2642 WRITE_ONCE(u->oob_skb, NULL);
2643 else
2644 skb_get(oob_skb);
2645
2646 spin_unlock(&sk->sk_receive_queue.lock);
2647 unix_state_unlock(sk);
2648
2649 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2650
2651 if (!(state->flags & MSG_PEEK))
2652 UNIXCB(oob_skb).consumed += 1;
2653
2654 consume_skb(oob_skb);
2655
2656 mutex_unlock(&u->iolock);
2657
2658 if (chunk < 0)
2659 return -EFAULT;
2660
2661 state->msg->msg_flags |= MSG_OOB;
2662 return 1;
2663 }
2664
2665 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2666 int flags, int copied)
2667 {
2668 struct unix_sock *u = unix_sk(sk);
2669
2670 if (!unix_skb_len(skb)) {
2671 struct sk_buff *unlinked_skb = NULL;
2672
2673 spin_lock(&sk->sk_receive_queue.lock);
2674
2675 if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2676 skb = NULL;
2677 } else if (flags & MSG_PEEK) {
2678 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2679 } else {
2680 unlinked_skb = skb;
2681 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2682 __skb_unlink(unlinked_skb, &sk->sk_receive_queue);
2683 }
2684
2685 spin_unlock(&sk->sk_receive_queue.lock);
2686
2687 consume_skb(unlinked_skb);
2688 } else {
2689 struct sk_buff *unlinked_skb = NULL;
2690
2691 spin_lock(&sk->sk_receive_queue.lock);
2692
2693 if (skb == u->oob_skb) {
2694 if (copied) {
2695 skb = NULL;
2696 } else if (!(flags & MSG_PEEK)) {
2697 if (sock_flag(sk, SOCK_URGINLINE)) {
2698 WRITE_ONCE(u->oob_skb, NULL);
2699 consume_skb(skb);
2700 } else {
2701 __skb_unlink(skb, &sk->sk_receive_queue);
2702 WRITE_ONCE(u->oob_skb, NULL);
2703 unlinked_skb = skb;
2704 skb = skb_peek(&sk->sk_receive_queue);
2705 }
2706 } else if (!sock_flag(sk, SOCK_URGINLINE)) {
2707 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2708 }
2709 }
2710
2711 spin_unlock(&sk->sk_receive_queue.lock);
2712
2713 if (unlinked_skb) {
2714 WARN_ON_ONCE(skb_unref(unlinked_skb));
2715 kfree_skb(unlinked_skb);
2716 }
2717 }
2718 return skb;
2719 }
2720 #endif
2721
2722 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2723 {
2724 struct unix_sock *u = unix_sk(sk);
2725 struct sk_buff *skb;
2726 int err;
2727
2728 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2729 return -ENOTCONN;
2730
2731 mutex_lock(&u->iolock);
2732 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2733 mutex_unlock(&u->iolock);
2734 if (!skb)
2735 return err;
2736
2737 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2738 if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2739 bool drop = false;
2740
2741 unix_state_lock(sk);
2742
2743 if (sock_flag(sk, SOCK_DEAD)) {
2744 unix_state_unlock(sk);
2745 kfree_skb(skb);
2746 return -ECONNRESET;
2747 }
2748
2749 spin_lock(&sk->sk_receive_queue.lock);
2750 if (likely(skb == u->oob_skb)) {
2751 WRITE_ONCE(u->oob_skb, NULL);
2752 drop = true;
2753 }
2754 spin_unlock(&sk->sk_receive_queue.lock);
2755
2756 unix_state_unlock(sk);
2757
2758 if (drop) {
2759 WARN_ON_ONCE(skb_unref(skb));
2760 kfree_skb(skb);
2761 return -EAGAIN;
2762 }
2763 }
2764 #endif
2765
2766 return recv_actor(sk, skb);
2767 }
2768
2769 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2770 bool freezable)
2771 {
2772 struct scm_cookie scm;
2773 struct socket *sock = state->socket;
2774 struct sock *sk = sock->sk;
2775 struct unix_sock *u = unix_sk(sk);
2776 int copied = 0;
2777 int flags = state->flags;
2778 int noblock = flags & MSG_DONTWAIT;
2779 bool check_creds = false;
2780 int target;
2781 int err = 0;
2782 long timeo;
2783 int skip;
2784 size_t size = state->size;
2785 unsigned int last_len;
2786
2787 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2788 err = -EINVAL;
2789 goto out;
2790 }
2791
2792 if (unlikely(flags & MSG_OOB)) {
2793 err = -EOPNOTSUPP;
2794 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2795 err = unix_stream_recv_urg(state);
2796 #endif
2797 goto out;
2798 }
2799
2800 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2801 timeo = sock_rcvtimeo(sk, noblock);
2802
2803 memset(&scm, 0, sizeof(scm));
2804
2805 /* Lock the socket to prevent queue disordering
2806 * while we sleep in memcpy_tomsg()
2807 */
2808 mutex_lock(&u->iolock);
2809
2810 skip = max(sk_peek_offset(sk, flags), 0);
2811
2812 do {
2813 struct sk_buff *skb, *last;
2814 int chunk;
2815
2816 redo:
2817 unix_state_lock(sk);
2818 if (sock_flag(sk, SOCK_DEAD)) {
2819 err = -ECONNRESET;
2820 goto unlock;
2821 }
2822 last = skb = skb_peek(&sk->sk_receive_queue);
2823 last_len = last ? last->len : 0;
2824
2825 again:
2826 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2827 if (skb) {
2828 skb = manage_oob(skb, sk, flags, copied);
2829 if (!skb && copied) {
2830 unix_state_unlock(sk);
2831 break;
2832 }
2833 }
2834 #endif
2835 if (skb == NULL) {
2836 if (copied >= target)
2837 goto unlock;
2838
2839 /*
2840 * POSIX 1003.1g mandates this order.
2841 */
2842
2843 err = sock_error(sk);
2844 if (err)
2845 goto unlock;
2846 if (sk->sk_shutdown & RCV_SHUTDOWN)
2847 goto unlock;
2848
2849 unix_state_unlock(sk);
2850 if (!timeo) {
2851 err = -EAGAIN;
2852 break;
2853 }
2854
2855 mutex_unlock(&u->iolock);
2856
2857 timeo = unix_stream_data_wait(sk, timeo, last,
2858 last_len, freezable);
2859
2860 if (signal_pending(current)) {
2861 err = sock_intr_errno(timeo);
2862 scm_destroy(&scm);
2863 goto out;
2864 }
2865
2866 mutex_lock(&u->iolock);
2867 goto redo;
2868 unlock:
2869 unix_state_unlock(sk);
2870 break;
2871 }
2872
2873 while (skip >= unix_skb_len(skb)) {
2874 skip -= unix_skb_len(skb);
2875 last = skb;
2876 last_len = skb->len;
2877 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2878 if (!skb)
2879 goto again;
2880 }
2881
2882 unix_state_unlock(sk);
2883
2884 if (check_creds) {
2885 /* Never glue messages from different writers */
2886 if (!unix_skb_scm_eq(skb, &scm))
2887 break;
2888 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2889 test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2890 /* Copy credentials */
2891 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2892 unix_set_secdata(&scm, skb);
2893 check_creds = true;
2894 }
2895
2896 /* Copy address just once */
2897 if (state->msg && state->msg->msg_name) {
2898 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2899 state->msg->msg_name);
2900 unix_copy_addr(state->msg, skb->sk);
2901
2902 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2903 state->msg->msg_name,
2904 &state->msg->msg_namelen);
2905
2906 sunaddr = NULL;
2907 }
2908
2909 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2910 chunk = state->recv_actor(skb, skip, chunk, state);
2911 if (chunk < 0) {
2912 if (copied == 0)
2913 copied = -EFAULT;
2914 break;
2915 }
2916 copied += chunk;
2917 size -= chunk;
2918
2919 /* Mark read part of skb as used */
2920 if (!(flags & MSG_PEEK)) {
2921 UNIXCB(skb).consumed += chunk;
2922
2923 sk_peek_offset_bwd(sk, chunk);
2924
2925 if (UNIXCB(skb).fp) {
2926 scm_stat_del(sk, skb);
2927 unix_detach_fds(&scm, skb);
2928 }
2929
2930 if (unix_skb_len(skb))
2931 break;
2932
2933 skb_unlink(skb, &sk->sk_receive_queue);
2934 consume_skb(skb);
2935
2936 if (scm.fp)
2937 break;
2938 } else {
2939 /* It is questionable, see note in unix_dgram_recvmsg.
2940 */
2941 if (UNIXCB(skb).fp)
2942 unix_peek_fds(&scm, skb);
2943
2944 sk_peek_offset_fwd(sk, chunk);
2945
2946 if (UNIXCB(skb).fp)
2947 break;
2948
2949 skip = 0;
2950 last = skb;
2951 last_len = skb->len;
2952 unix_state_lock(sk);
2953 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2954 if (skb)
2955 goto again;
2956 unix_state_unlock(sk);
2957 break;
2958 }
2959 } while (size);
2960
2961 mutex_unlock(&u->iolock);
2962 if (state->msg)
2963 scm_recv_unix(sock, state->msg, &scm, flags);
2964 else
2965 scm_destroy(&scm);
2966 out:
2967 return copied ? : err;
2968 }
2969
2970 static int unix_stream_read_actor(struct sk_buff *skb,
2971 int skip, int chunk,
2972 struct unix_stream_read_state *state)
2973 {
2974 int ret;
2975
2976 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2977 state->msg, chunk);
2978 return ret ?: chunk;
2979 }
2980
2981 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2982 size_t size, int flags)
2983 {
2984 struct unix_stream_read_state state = {
2985 .recv_actor = unix_stream_read_actor,
2986 .socket = sk->sk_socket,
2987 .msg = msg,
2988 .size = size,
2989 .flags = flags
2990 };
2991
2992 return unix_stream_read_generic(&state, true);
2993 }
2994
2995 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2996 size_t size, int flags)
2997 {
2998 struct unix_stream_read_state state = {
2999 .recv_actor = unix_stream_read_actor,
3000 .socket = sock,
3001 .msg = msg,
3002 .size = size,
3003 .flags = flags
3004 };
3005
3006 #ifdef CONFIG_BPF_SYSCALL
3007 struct sock *sk = sock->sk;
3008 const struct proto *prot = READ_ONCE(sk->sk_prot);
3009
3010 if (prot != &unix_stream_proto)
3011 return prot->recvmsg(sk, msg, size, flags, NULL);
3012 #endif
3013 return unix_stream_read_generic(&state, true);
3014 }
3015
3016 static int unix_stream_splice_actor(struct sk_buff *skb,
3017 int skip, int chunk,
3018 struct unix_stream_read_state *state)
3019 {
3020 return skb_splice_bits(skb, state->socket->sk,
3021 UNIXCB(skb).consumed + skip,
3022 state->pipe, chunk, state->splice_flags);
3023 }
3024
3025 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
3026 struct pipe_inode_info *pipe,
3027 size_t size, unsigned int flags)
3028 {
3029 struct unix_stream_read_state state = {
3030 .recv_actor = unix_stream_splice_actor,
3031 .socket = sock,
3032 .pipe = pipe,
3033 .size = size,
3034 .splice_flags = flags,
3035 };
3036
3037 if (unlikely(*ppos))
3038 return -ESPIPE;
3039
3040 if (sock->file->f_flags & O_NONBLOCK ||
3041 flags & SPLICE_F_NONBLOCK)
3042 state.flags = MSG_DONTWAIT;
3043
3044 return unix_stream_read_generic(&state, false);
3045 }
3046
3047 static int unix_shutdown(struct socket *sock, int mode)
3048 {
3049 struct sock *sk = sock->sk;
3050 struct sock *other;
3051
3052 if (mode < SHUT_RD || mode > SHUT_RDWR)
3053 return -EINVAL;
3054 /* This maps:
3055 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
3056 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
3057 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3058 */
3059 ++mode;
3060
3061 unix_state_lock(sk);
3062 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3063 other = unix_peer(sk);
3064 if (other)
3065 sock_hold(other);
3066 unix_state_unlock(sk);
3067 sk->sk_state_change(sk);
3068
3069 if (other &&
3070 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3071
3072 int peer_mode = 0;
3073 const struct proto *prot = READ_ONCE(other->sk_prot);
3074
3075 if (prot->unhash)
3076 prot->unhash(other);
3077 if (mode&RCV_SHUTDOWN)
3078 peer_mode |= SEND_SHUTDOWN;
3079 if (mode&SEND_SHUTDOWN)
3080 peer_mode |= RCV_SHUTDOWN;
3081 unix_state_lock(other);
3082 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3083 unix_state_unlock(other);
3084 other->sk_state_change(other);
3085 if (peer_mode == SHUTDOWN_MASK)
3086 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3087 else if (peer_mode & RCV_SHUTDOWN)
3088 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3089 }
3090 if (other)
3091 sock_put(other);
3092
3093 return 0;
3094 }
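/*
 * Illustrative userspace sketch (not part of this file): SHUT_WR on one
 * end becomes RCV_SHUTDOWN on the peer via the mapping above, so the peer
 * reads EOF once any queued data is drained.
 *
 *	shutdown(sock, SHUT_WR);
 *
 *	char buf[64];
 *	ssize_t n = read(peer, buf, sizeof(buf));	// eventually returns 0 (EOF)
 */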
3095
3096 long unix_inq_len(struct sock *sk)
3097 {
3098 struct sk_buff *skb;
3099 long amount = 0;
3100
3101 if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3102 return -EINVAL;
3103
3104 spin_lock(&sk->sk_receive_queue.lock);
3105 if (sk->sk_type == SOCK_STREAM ||
3106 sk->sk_type == SOCK_SEQPACKET) {
3107 skb_queue_walk(&sk->sk_receive_queue, skb)
3108 amount += unix_skb_len(skb);
3109 } else {
3110 skb = skb_peek(&sk->sk_receive_queue);
3111 if (skb)
3112 amount = skb->len;
3113 }
3114 spin_unlock(&sk->sk_receive_queue.lock);
3115
3116 return amount;
3117 }
3118 EXPORT_SYMBOL_GPL(unix_inq_len);
3119
3120 long unix_outq_len(struct sock *sk)
3121 {
3122 return sk_wmem_alloc_get(sk);
3123 }
3124 EXPORT_SYMBOL_GPL(unix_outq_len);
3125
3126 static int unix_open_file(struct sock *sk)
3127 {
3128 struct path path;
3129 struct file *f;
3130 int fd;
3131
3132 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3133 return -EPERM;
3134
3135 if (!smp_load_acquire(&unix_sk(sk)->addr))
3136 return -ENOENT;
3137
3138 path = unix_sk(sk)->path;
3139 if (!path.dentry)
3140 return -ENOENT;
3141
3142 path_get(&path);
3143
3144 fd = get_unused_fd_flags(O_CLOEXEC);
3145 if (fd < 0)
3146 goto out;
3147
3148 f = dentry_open(&path, O_PATH, current_cred());
3149 if (IS_ERR(f)) {
3150 put_unused_fd(fd);
3151 fd = PTR_ERR(f);
3152 goto out;
3153 }
3154
3155 fd_install(fd, f);
3156 out:
3157 path_put(&path);
3158
3159 return fd;
3160 }
3161
3162 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3163 {
3164 struct sock *sk = sock->sk;
3165 long amount = 0;
3166 int err;
3167
3168 switch (cmd) {
3169 case SIOCOUTQ:
3170 amount = unix_outq_len(sk);
3171 err = put_user(amount, (int __user *)arg);
3172 break;
3173 case SIOCINQ:
3174 amount = unix_inq_len(sk);
3175 if (amount < 0)
3176 err = amount;
3177 else
3178 err = put_user(amount, (int __user *)arg);
3179 break;
3180 case SIOCUNIXFILE:
3181 err = unix_open_file(sk);
3182 break;
3183 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3184 case SIOCATMARK:
3185 {
3186 struct unix_sock *u = unix_sk(sk);
3187 struct sk_buff *skb;
3188 int answ = 0;
3189
3190 mutex_lock(&u->iolock);
3191
3192 skb = skb_peek(&sk->sk_receive_queue);
3193 if (skb) {
3194 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3195
3196 if (skb == oob_skb ||
3197 (!oob_skb && !unix_skb_len(skb)))
3198 answ = 1;
3199 }
3200
3201 mutex_unlock(&u->iolock);
3202
3203 err = put_user(answ, (int __user *)arg);
3204 }
3205 break;
3206 #endif
3207 default:
3208 err = -ENOIOCTLCMD;
3209 break;
3210 }
3211 return err;
3212 }
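/*
 * Illustrative userspace sketch (not part of this file): SIOCINQ and
 * SIOCOUTQ above report queued byte counts; sockatmark(3) wraps the
 * SIOCATMARK case.
 *
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int unread, unsent;
 *
 *	if (!ioctl(sock, SIOCINQ, &unread) && !ioctl(sock, SIOCOUTQ, &unsent))
 *		printf("inq=%d outq=%d\n", unread, unsent);
 */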
3213
3214 #ifdef CONFIG_COMPAT
3215 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3216 {
3217 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3218 }
3219 #endif
3220
3221 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3222 {
3223 struct sock *sk = sock->sk;
3224 unsigned char state;
3225 __poll_t mask;
3226 u8 shutdown;
3227
3228 sock_poll_wait(file, sock, wait);
3229 mask = 0;
3230 shutdown = READ_ONCE(sk->sk_shutdown);
3231 state = READ_ONCE(sk->sk_state);
3232
3233 /* exceptional events? */
3234 if (READ_ONCE(sk->sk_err))
3235 mask |= EPOLLERR;
3236 if (shutdown == SHUTDOWN_MASK)
3237 mask |= EPOLLHUP;
3238 if (shutdown & RCV_SHUTDOWN)
3239 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3240
3241 /* readable? */
3242 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3243 mask |= EPOLLIN | EPOLLRDNORM;
3244 if (sk_is_readable(sk))
3245 mask |= EPOLLIN | EPOLLRDNORM;
3246 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3247 if (READ_ONCE(unix_sk(sk)->oob_skb))
3248 mask |= EPOLLPRI;
3249 #endif
3250
3251 /* Connection-based sockets need to check for termination and startup */
3252 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3253 state == TCP_CLOSE)
3254 mask |= EPOLLHUP;
3255
3256 /*
3257 * We also mark the socket writable when the other side has shut down
3258 * the connection. This prevents stuck sockets.
3259 */
3260 if (unix_writable(sk, state))
3261 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3262
3263 return mask;
3264 }
3265
3266 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3267 poll_table *wait)
3268 {
3269 struct sock *sk = sock->sk, *other;
3270 unsigned int writable;
3271 unsigned char state;
3272 __poll_t mask;
3273 u8 shutdown;
3274
3275 sock_poll_wait(file, sock, wait);
3276 mask = 0;
3277 shutdown = READ_ONCE(sk->sk_shutdown);
3278 state = READ_ONCE(sk->sk_state);
3279
3280 /* exceptional events? */
3281 if (READ_ONCE(sk->sk_err) ||
3282 !skb_queue_empty_lockless(&sk->sk_error_queue))
3283 mask |= EPOLLERR |
3284 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3285
3286 if (shutdown & RCV_SHUTDOWN)
3287 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3288 if (shutdown == SHUTDOWN_MASK)
3289 mask |= EPOLLHUP;
3290
3291 /* readable? */
3292 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3293 mask |= EPOLLIN | EPOLLRDNORM;
3294 if (sk_is_readable(sk))
3295 mask |= EPOLLIN | EPOLLRDNORM;
3296
3297 /* Connection-based sockets need to check for termination and startup */
3298 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3299 mask |= EPOLLHUP;
3300
3301 /* No write status requested, avoid expensive OUT tests. */
3302 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3303 return mask;
3304
3305 writable = unix_writable(sk, state);
3306 if (writable) {
3307 unix_state_lock(sk);
3308
3309 other = unix_peer(sk);
3310 if (other && unix_peer(other) != sk &&
3311 unix_recvq_full_lockless(other) &&
3312 unix_dgram_peer_wake_me(sk, other))
3313 writable = 0;
3314
3315 unix_state_unlock(sk);
3316 }
3317
3318 if (writable)
3319 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3320 else
3321 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3322
3323 return mask;
3324 }
3325
3326 #ifdef CONFIG_PROC_FS
3327
3328 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3329
3330 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3331 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3332 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
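/*
 * Worked example (illustrative): *pos packs the bucket index into the
 * high bits and a 1-based in-bucket offset into the low BUCKET_SPACE
 * bits, so the two halves can be recovered independently.
 *
 *	loff_t pos = set_bucket_offset(3, 7);
 *
 *	get_bucket(pos);	// 3
 *	get_offset(pos);	// 7
 */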
3333
3334 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3335 {
3336 unsigned long offset = get_offset(*pos);
3337 unsigned long bucket = get_bucket(*pos);
3338 unsigned long count = 0;
3339 struct sock *sk;
3340
3341 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3342 sk; sk = sk_next(sk)) {
3343 if (++count == offset)
3344 break;
3345 }
3346
3347 return sk;
3348 }
3349
3350 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3351 {
3352 unsigned long bucket = get_bucket(*pos);
3353 struct net *net = seq_file_net(seq);
3354 struct sock *sk;
3355
3356 while (bucket < UNIX_HASH_SIZE) {
3357 spin_lock(&net->unx.table.locks[bucket]);
3358
3359 sk = unix_from_bucket(seq, pos);
3360 if (sk)
3361 return sk;
3362
3363 spin_unlock(&net->unx.table.locks[bucket]);
3364
3365 *pos = set_bucket_offset(++bucket, 1);
3366 }
3367
3368 return NULL;
3369 }
3370
3371 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3372 loff_t *pos)
3373 {
3374 unsigned long bucket = get_bucket(*pos);
3375
3376 sk = sk_next(sk);
3377 if (sk)
3378 return sk;
3379
3380
3381 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3382
3383 *pos = set_bucket_offset(++bucket, 1);
3384
3385 return unix_get_first(seq, pos);
3386 }
3387
3388 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3389 {
3390 if (!*pos)
3391 return SEQ_START_TOKEN;
3392
3393 return unix_get_first(seq, pos);
3394 }
3395
3396 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3397 {
3398 ++*pos;
3399
3400 if (v == SEQ_START_TOKEN)
3401 return unix_get_first(seq, pos);
3402
3403 return unix_get_next(seq, v, pos);
3404 }
3405
3406 static void unix_seq_stop(struct seq_file *seq, void *v)
3407 {
3408 struct sock *sk = v;
3409
3410 if (sk)
3411 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3412 }
3413
3414 static int unix_seq_show(struct seq_file *seq, void *v)
3415 {
3416
3417 if (v == SEQ_START_TOKEN)
3418 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3419 "Inode Path\n");
3420 else {
3421 struct sock *s = v;
3422 struct unix_sock *u = unix_sk(s);
3423 unix_state_lock(s);
3424
3425 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3426 s,
3427 refcount_read(&s->sk_refcnt),
3428 0,
3429 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3430 s->sk_type,
3431 s->sk_socket ?
3432 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3433 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3434 sock_i_ino(s));
3435
3436 if (u->addr) {	/* under a hash table lock here */
3437 int i, len;
3438 seq_putc(seq, ' ');
3439
3440 i = 0;
3441 len = u->addr->len -
3442 offsetof(struct sockaddr_un, sun_path);
3443 if (u->addr->name->sun_path[0]) {
3444 len--;
3445 } else {
3446 seq_putc(seq, '@');
3447 i++;
3448 }
3449 for ( ; i < len; i++)
3450 seq_putc(seq, u->addr->name->sun_path[i] ?:
3451 '@');
3452 }
3453 unix_state_unlock(s);
3454 seq_putc(seq, '\n');
3455 }
3456
3457 return 0;
3458 }
3459
3460 static const struct seq_operations unix_seq_ops = {
3461 .start = unix_seq_start,
3462 .next = unix_seq_next,
3463 .stop = unix_seq_stop,
3464 .show = unix_seq_show,
3465 };
3466
3467 #ifdef CONFIG_BPF_SYSCALL
3468 struct bpf_unix_iter_state {
3469 struct seq_net_private p;
3470 unsigned int cur_sk;
3471 unsigned int end_sk;
3472 unsigned int max_sk;
3473 struct sock **batch;
3474 bool st_bucket_done;
3475 };
3476
3477 struct bpf_iter__unix {
3478 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3479 __bpf_md_ptr(struct unix_sock *, unix_sk);
3480 uid_t uid __aligned(8);
3481 };
3482
3483 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3484 struct unix_sock *unix_sk, uid_t uid)
3485 {
3486 struct bpf_iter__unix ctx;
3487
3488 meta->seq_num--; /* skip SEQ_START_TOKEN */
3489 ctx.meta = meta;
3490 ctx.unix_sk = unix_sk;
3491 ctx.uid = uid;
3492 return bpf_iter_run_prog(prog, &ctx);
3493 }
3494
3495 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3496
3497 {
3498 struct bpf_unix_iter_state *iter = seq->private;
3499 unsigned int expected = 1;
3500 struct sock *sk;
3501
3502 sock_hold(start_sk);
3503 iter->batch[iter->end_sk++] = start_sk;
3504
3505 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3506 if (iter->end_sk < iter->max_sk) {
3507 sock_hold(sk);
3508 iter->batch[iter->end_sk++] = sk;
3509 }
3510
3511 expected++;
3512 }
3513
3514 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3515
3516 return expected;
3517 }
3518
3519 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3520 {
3521 while (iter->cur_sk < iter->end_sk)
3522 sock_put(iter->batch[iter->cur_sk++]);
3523 }
3524
3525 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3526 unsigned int new_batch_sz)
3527 {
3528 struct sock **new_batch;
3529
3530 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3531 GFP_USER | __GFP_NOWARN);
3532 if (!new_batch)
3533 return -ENOMEM;
3534
3535 bpf_iter_unix_put_batch(iter);
3536 kvfree(iter->batch);
3537 iter->batch = new_batch;
3538 iter->max_sk = new_batch_sz;
3539
3540 return 0;
3541 }
3542
3543 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3544 loff_t *pos)
3545 {
3546 struct bpf_unix_iter_state *iter = seq->private;
3547 unsigned int expected;
3548 bool resized = false;
3549 struct sock *sk;
3550
3551 if (iter->st_bucket_done)
3552 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3553
3554 again:
3555 /* Get a new batch */
3556 iter->cur_sk = 0;
3557 iter->end_sk = 0;
3558
3559 sk = unix_get_first(seq, pos);
3560 if (!sk)
3561 return NULL; /* Done */
3562
3563 expected = bpf_iter_unix_hold_batch(seq, sk);
3564
3565 if (iter->end_sk == expected) {
3566 iter->st_bucket_done = true;
3567 return sk;
3568 }
3569
3570 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3571 resized = true;
3572 goto again;
3573 }
3574
3575 return sk;
3576 }
3577
3578 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3579 {
3580 if (!*pos)
3581 return SEQ_START_TOKEN;
3582
3583 /* bpf iter does not support lseek, so it always
3584 * continues from where it was stop()-ped.
3585 */
3586 return bpf_iter_unix_batch(seq, pos);
3587 }
3588
3589 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3590 {
3591 struct bpf_unix_iter_state *iter = seq->private;
3592 struct sock *sk;
3593
3594 /* Whenever seq_next() is called, the iter->cur_sk is
3595 * done with seq_show(), so advance to the next sk in
3596 * the batch.
3597 */
3598 if (iter->cur_sk < iter->end_sk)
3599 sock_put(iter->batch[iter->cur_sk++]);
3600
3601 ++*pos;
3602
3603 if (iter->cur_sk < iter->end_sk)
3604 sk = iter->batch[iter->cur_sk];
3605 else
3606 sk = bpf_iter_unix_batch(seq, pos);
3607
3608 return sk;
3609 }
3610
3611 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3612 {
3613 struct bpf_iter_meta meta;
3614 struct bpf_prog *prog;
3615 struct sock *sk = v;
3616 uid_t uid;
3617 bool slow;
3618 int ret;
3619
3620 if (v == SEQ_START_TOKEN)
3621 return 0;
3622
3623 slow = lock_sock_fast(sk);
3624
3625 if (unlikely(sk_unhashed(sk))) {
3626 ret = SEQ_SKIP;
3627 goto unlock;
3628 }
3629
3630 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3631 meta.seq = seq;
3632 prog = bpf_iter_get_info(&meta, false);
3633 ret = unix_prog_seq_show(prog, &meta, v, uid);
3634 unlock:
3635 unlock_sock_fast(sk, slow);
3636 return ret;
3637 }
3638
3639 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3640 {
3641 struct bpf_unix_iter_state *iter = seq->private;
3642 struct bpf_iter_meta meta;
3643 struct bpf_prog *prog;
3644
3645 if (!v) {
3646 meta.seq = seq;
3647 prog = bpf_iter_get_info(&meta, true);
3648 if (prog)
3649 (void)unix_prog_seq_show(prog, &meta, v, 0);
3650 }
3651
3652 if (iter->cur_sk < iter->end_sk)
3653 bpf_iter_unix_put_batch(iter);
3654 }
3655
3656 static const struct seq_operations bpf_iter_unix_seq_ops = {
3657 .start = bpf_iter_unix_seq_start,
3658 .next = bpf_iter_unix_seq_next,
3659 .stop = bpf_iter_unix_seq_stop,
3660 .show = bpf_iter_unix_seq_show,
3661 };
3662 #endif
3663 #endif
3664
3665 static const struct net_proto_family unix_family_ops = {
3666 .family = PF_UNIX,
3667 .create = unix_create,
3668 .owner = THIS_MODULE,
3669 };
3670
3671
3672 static int __net_init unix_net_init(struct net *net)
3673 {
3674 int i;
3675
3676 net->unx.sysctl_max_dgram_qlen = 10;
3677 if (unix_sysctl_register(net))
3678 goto out;
3679
3680 #ifdef CONFIG_PROC_FS
3681 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3682 sizeof(struct seq_net_private)))
3683 goto err_sysctl;
3684 #endif
3685
3686 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3687 sizeof(spinlock_t), GFP_KERNEL);
3688 if (!net->unx.table.locks)
3689 goto err_proc;
3690
3691 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3692 sizeof(struct hlist_head),
3693 GFP_KERNEL);
3694 if (!net->unx.table.buckets)
3695 goto free_locks;
3696
3697 for (i = 0; i < UNIX_HASH_SIZE; i++) {
3698 spin_lock_init(&net->unx.table.locks[i]);
3699 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3700 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3701 }
3702
3703 return 0;
3704
3705 free_locks:
3706 kvfree(net->unx.table.locks);
3707 err_proc:
3708 #ifdef CONFIG_PROC_FS
3709 remove_proc_entry("unix", net->proc_net);
3710 err_sysctl:
3711 #endif
3712 unix_sysctl_unregister(net);
3713 out:
3714 return -ENOMEM;
3715 }
3716
3717 static void __net_exit unix_net_exit(struct net *net)
3718 {
3719 kvfree(net->unx.table.buckets);
3720 kvfree(net->unx.table.locks);
3721 unix_sysctl_unregister(net);
3722 remove_proc_entry("unix", net->proc_net);
3723 }
3724
3725 static struct pernet_operations unix_net_ops = {
3726 .init = unix_net_init,
3727 .exit = unix_net_exit,
3728 };
3729
3730 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3731 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3732 struct unix_sock *unix_sk, uid_t uid)
3733
3734 #define INIT_BATCH_SZ 16
3735
3736 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3737 {
3738 struct bpf_unix_iter_state *iter = priv_data;
3739 int err;
3740
3741 err = bpf_iter_init_seq_net(priv_data, aux);
3742 if (err)
3743 return err;
3744
3745 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3746 if (err) {
3747 bpf_iter_fini_seq_net(priv_data);
3748 return err;
3749 }
3750
3751 return 0;
3752 }
3753
3754 static void bpf_iter_fini_unix(void *priv_data)
3755 {
3756 struct bpf_unix_iter_state *iter = priv_data;
3757
3758 bpf_iter_fini_seq_net(priv_data);
3759 kvfree(iter->batch);
3760 }
3761
3762 static const struct bpf_iter_seq_info unix_seq_info = {
3763 .seq_ops = &bpf_iter_unix_seq_ops,
3764 .init_seq_private = bpf_iter_init_unix,
3765 .fini_seq_private = bpf_iter_fini_unix,
3766 .seq_priv_size = sizeof(struct bpf_unix_iter_state),
3767 };
3768
3769 static const struct bpf_func_proto *
3770 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3771 const struct bpf_prog *prog)
3772 {
3773 switch (func_id) {
3774 case BPF_FUNC_setsockopt:
3775 return &bpf_sk_setsockopt_proto;
3776 case BPF_FUNC_getsockopt:
3777 return &bpf_sk_getsockopt_proto;
3778 default:
3779 return NULL;
3780 }
3781 }
3782
3783 static struct bpf_iter_reg unix_reg_info = {
3784 .target = "unix",
3785 .ctx_arg_info_size = 1,
3786 .ctx_arg_info = {
3787 { offsetof(struct bpf_iter__unix, unix_sk),
3788 PTR_TO_BTF_ID_OR_NULL },
3789 },
3790 .get_func_proto = bpf_iter_unix_get_func_proto,
3791 .seq_info = &unix_seq_info,
3792 };
3793
3794 static void __init bpf_iter_register(void)
3795 {
3796 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3797 if (bpf_iter_reg_target(&unix_reg_info))
3798 pr_warn("Warning: could not register bpf iterator unix\n");
3799 }
3800 #endif
3801
3802 static int __init af_unix_init(void)
3803 {
3804 int i, rc = -1;
3805
3806 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3807
3808 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3809 spin_lock_init(&bsd_socket_locks[i]);
3810 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3811 }
3812
3813 rc = proto_register(&unix_dgram_proto, 1);
3814 if (rc != 0) {
3815 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3816 goto out;
3817 }
3818
3819 rc = proto_register(&unix_stream_proto, 1);
3820 if (rc != 0) {
3821 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3822 proto_unregister(&unix_dgram_proto);
3823 goto out;
3824 }
3825
3826 sock_register(&unix_family_ops);
3827 register_pernet_subsys(&unix_net_ops);
3828 unix_bpf_build_proto();
3829
3830 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3831 bpf_iter_register();
3832 #endif
3833
3834 out:
3835 return rc;
3836 }
3837
3838 /* Later than subsys_initcall() because we depend on stuff initialised there */
3839 fs_initcall(af_unix_init);
3840