// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"

#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
                               unsigned int eventfd_async)
{
        struct io_ev_fd *ev_fd;
        __s32 __user *fds = arg;
        int fd;

        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
                                          lockdep_is_held(&ctx->uring_lock));
        if (ev_fd)
                return -EBUSY;

        if (copy_from_user(&fd, fds, sizeof(*fds)))
                return -EFAULT;

        ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
        if (!ev_fd)
                return -ENOMEM;

        ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
        if (IS_ERR(ev_fd->cq_ev_fd)) {
                int ret = PTR_ERR(ev_fd->cq_ev_fd);

                kfree(ev_fd);
                return ret;
        }

        spin_lock(&ctx->completion_lock);
        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
        spin_unlock(&ctx->completion_lock);

        ev_fd->eventfd_async = eventfd_async;
        ctx->has_evfd = true;
        rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
        atomic_set(&ev_fd->refs, 1);
        atomic_set(&ev_fd->ops, 0);
        return 0;
}
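
/*
 * Illustrative userspace sketch, not part of this file: registering an
 * eventfd for CQE notifications. IORING_REGISTER_EVENTFD takes a pointer to
 * a single __s32 file descriptor with nr_args == 1, matching the
 * copy_from_user() of one fd above and the nr_args check in
 * __io_uring_register() below. 'ring_fd' is assumed to come from
 * io_uring_setup(); headers <unistd.h>, <sys/syscall.h>, <sys/eventfd.h>
 * and <linux/io_uring.h> are assumed.
 *
 *        int efd = eventfd(0, EFD_CLOEXEC);
 *        int ret = syscall(__NR_io_uring_register, ring_fd,
 *                          IORING_REGISTER_EVENTFD, &efd, 1);
 *        // ret == 0 on success; -EBUSY if an eventfd is already registered
 */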

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
        struct io_ev_fd *ev_fd;

        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
                                          lockdep_is_held(&ctx->uring_lock));
        if (ev_fd) {
                ctx->has_evfd = false;
                rcu_assign_pointer(ctx->io_ev_fd, NULL);
                if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
                        call_rcu(&ev_fd->rcu, io_eventfd_ops);
                return 0;
        }

        return -ENXIO;
}

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
                           unsigned nr_args)
{
        struct io_uring_probe *p;
        size_t size;
        int i, ret;

        size = struct_size(p, ops, nr_args);
        if (size == SIZE_MAX)
                return -EOVERFLOW;
        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        ret = -EFAULT;
        if (copy_from_user(p, arg, size))
                goto out;
        ret = -EINVAL;
        if (memchr_inv(p, 0, size))
                goto out;

        p->last_op = IORING_OP_LAST - 1;
        if (nr_args > IORING_OP_LAST)
                nr_args = IORING_OP_LAST;

        for (i = 0; i < nr_args; i++) {
                p->ops[i].op = i;
                if (!io_issue_defs[i].not_supported)
                        p->ops[i].flags = IO_URING_OP_SUPPORTED;
        }
        p->ops_len = i;

        ret = 0;
        if (copy_to_user(arg, p, size))
                ret = -EFAULT;
out:
        kfree(p);
        return ret;
}
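
/*
 * Illustrative userspace sketch, not part of this file: probing which
 * opcodes this kernel supports. The probe buffer must be zeroed before the
 * call (memchr_inv() above rejects non-zero input) and nr_args is capped at
 * 256 by __io_uring_register(). 'ring_fd' is assumed to come from
 * io_uring_setup().
 *
 *        size_t len = sizeof(struct io_uring_probe) +
 *                     256 * sizeof(struct io_uring_probe_op);
 *        struct io_uring_probe *p = calloc(1, len);
 *
 *        if (!syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *                     p, 256)) {
 *                for (int i = 0; i < p->ops_len; i++)
 *                        if (p->ops[i].flags & IO_URING_OP_SUPPORTED)
 *                                printf("opcode %u supported\n", p->ops[i].op);
 *        }
 *        free(p);
 */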

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
        const struct cred *creds;

        creds = xa_erase(&ctx->personalities, id);
        if (creds) {
                put_cred(creds);
                return 0;
        }

        return -EINVAL;
}


static int io_register_personality(struct io_ring_ctx *ctx)
{
        const struct cred *creds;
        u32 id;
        int ret;

        creds = get_current_cred();

        ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
                              XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
        if (ret < 0) {
                put_cred(creds);
                return ret;
        }
        return id;
}
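
/*
 * Illustrative userspace sketch, not part of this file: the id returned by
 * IORING_REGISTER_PERSONALITY (arg == NULL, nr_args == 0, as enforced in
 * __io_uring_register()) is later placed in sqe->personality so individual
 * requests run with the registered credentials rather than the submitter's
 * current ones. 'ring_fd' and 'sqe' are assumed to exist in the caller.
 *
 *        int id = syscall(__NR_io_uring_register, ring_fd,
 *                         IORING_REGISTER_PERSONALITY, NULL, 0);
 *        ...
 *        sqe->personality = id;   // per-request credential selection
 */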

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
                                           void __user *arg, unsigned int nr_args)
{
        struct io_uring_restriction *res;
        size_t size;
        int i, ret;

        /* Restrictions allowed only if rings started disabled */
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        /* We allow only a single restrictions registration */
        if (ctx->restrictions.registered)
                return -EBUSY;

        if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
                return -EINVAL;

        size = array_size(nr_args, sizeof(*res));
        if (size == SIZE_MAX)
                return -EOVERFLOW;

        res = memdup_user(arg, size);
        if (IS_ERR(res))
                return PTR_ERR(res);

        ret = 0;

        for (i = 0; i < nr_args; i++) {
                switch (res[i].opcode) {
                case IORING_RESTRICTION_REGISTER_OP:
                        if (res[i].register_op >= IORING_REGISTER_LAST) {
                                ret = -EINVAL;
                                goto out;
                        }

                        __set_bit(res[i].register_op,
                                  ctx->restrictions.register_op);
                        break;
                case IORING_RESTRICTION_SQE_OP:
                        if (res[i].sqe_op >= IORING_OP_LAST) {
                                ret = -EINVAL;
                                goto out;
                        }

                        __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
                        ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
                        ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
                        break;
                default:
                        ret = -EINVAL;
                        goto out;
                }
        }

out:
        /* Reset all restrictions if an error happened */
        if (ret != 0)
                memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
        else
                ctx->restrictions.registered = true;

        kfree(res);
        return ret;
}
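
/*
 * Illustrative userspace sketch, not part of this file: building a
 * restriction array. Each entry selects one of the opcode cases handled
 * above; here submission is limited to IORING_OP_READV and further register
 * calls to IORING_REGISTER_ENABLE_RINGS. Only valid while the ring was
 * created with IORING_SETUP_R_DISABLED. 'ring_fd' is assumed to come from
 * io_uring_setup().
 *
 *        struct io_uring_restriction res[2] = {
 *                { .opcode = IORING_RESTRICTION_SQE_OP,
 *                  .sqe_op = IORING_OP_READV },
 *                { .opcode = IORING_RESTRICTION_REGISTER_OP,
 *                  .register_op = IORING_REGISTER_ENABLE_RINGS },
 *        };
 *        int ret = syscall(__NR_io_uring_register, ring_fd,
 *                          IORING_REGISTER_RESTRICTIONS, res, 2);
 */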

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
                WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
                /*
                 * Lazy activation attempts would fail if it was polled before
                 * submitter_task is set.
                 */
                if (wq_has_sleeper(&ctx->poll_wq))
                        io_activate_pollwq(ctx);
        }

        if (ctx->restrictions.registered)
                ctx->restricted = 1;

        ctx->flags &= ~IORING_SETUP_R_DISABLED;
        if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        return 0;
}
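
/*
 * Illustrative userspace sketch, not part of this file: a ring created with
 * IORING_SETUP_R_DISABLED stays inert until enabled. Restrictions, if any,
 * must be registered before this call; arg must be NULL and nr_args zero, as
 * enforced in __io_uring_register(). 'ring_fd' is assumed to come from
 * io_uring_setup().
 *
 *        int ret = syscall(__NR_io_uring_register, ring_fd,
 *                          IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */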

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
                                         cpumask_var_t new_mask)
{
        int ret;

        if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
                ret = io_wq_cpu_affinity(current->io_uring, new_mask);
        } else {
                mutex_unlock(&ctx->uring_lock);
                ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
                mutex_lock(&ctx->uring_lock);
        }

        return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
                                       void __user *arg, unsigned len)
{
        cpumask_var_t new_mask;
        int ret;

        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
                return -ENOMEM;

        cpumask_clear(new_mask);
        if (len > cpumask_size())
                len = cpumask_size();

#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                ret = compat_get_bitmap(cpumask_bits(new_mask),
                                        (const compat_ulong_t __user *)arg,
                                        len * 8 /* CHAR_BIT */);
        else
#endif
                ret = copy_from_user(new_mask, arg, len);

        if (ret) {
                free_cpumask_var(new_mask);
                return -EFAULT;
        }

        ret = __io_register_iowq_aff(ctx, new_mask);
        free_cpumask_var(new_mask);
        return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
        return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
                                               void __user *arg)
        __must_hold(&ctx->uring_lock)
{
        struct io_tctx_node *node;
        struct io_uring_task *tctx = NULL;
        struct io_sq_data *sqd = NULL;
        __u32 new_count[2];
        int i, ret;

        if (copy_from_user(new_count, arg, sizeof(new_count)))
                return -EFAULT;
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i] > INT_MAX)
                        return -EINVAL;

        if (ctx->flags & IORING_SETUP_SQPOLL) {
                sqd = ctx->sq_data;
                if (sqd) {
                        /*
                         * Observe the correct sqd->lock -> ctx->uring_lock
                         * ordering. Fine to drop uring_lock here, we hold
                         * a ref to the ctx.
                         */
                        refcount_inc(&sqd->refs);
                        mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&sqd->lock);
                        mutex_lock(&ctx->uring_lock);
                        if (sqd->thread)
                                tctx = sqd->thread->io_uring;
                }
        } else {
                tctx = current->io_uring;
        }

        BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i])
                        ctx->iowq_limits[i] = new_count[i];
        ctx->iowq_limits_set = true;

        if (tctx && tctx->io_wq) {
                ret = io_wq_max_workers(tctx->io_wq, new_count);
                if (ret)
                        goto err;
        } else {
                memset(new_count, 0, sizeof(new_count));
        }

        if (sqd) {
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }

        if (copy_to_user(arg, new_count, sizeof(new_count)))
                return -EFAULT;

        /* that's it for SQPOLL, only the SQPOLL task creates requests */
        if (sqd)
                return 0;

        /* now propagate the restriction to all registered users */
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                tctx = node->task->io_uring;
                if (WARN_ON_ONCE(!tctx->io_wq))
                        continue;

                for (i = 0; i < ARRAY_SIZE(new_count); i++)
                        new_count[i] = ctx->iowq_limits[i];
                /* ignore errors, it always returns zero anyway */
                (void)io_wq_max_workers(tctx->io_wq, new_count);
        }
        return 0;
err:
        if (sqd) {
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }
        return ret;
}
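
/*
 * Illustrative userspace sketch, not part of this file: capping io-wq
 * workers. new_count[0] limits bounded workers, new_count[1] unbounded ones;
 * a zero entry leaves that limit untouched, and the previous values are
 * copied back to the array on return, so { 0, 0 } simply queries the current
 * limits. 'ring_fd' is assumed to come from io_uring_setup().
 *
 *        __u32 new_count[2] = { 4, 16 };
 *        int ret = syscall(__NR_io_uring_register, ring_fd,
 *                          IORING_REGISTER_IOWQ_MAX_WORKERS, new_count, 2);
 *        // on success, new_count now holds the limits that were in effect
 *        // before the call
 */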

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                               void __user *arg, unsigned nr_args)
        __releases(ctx->uring_lock)
        __acquires(ctx->uring_lock)
{
        int ret;

        /*
         * We don't quiesce the refs for register anymore and so it can't be
         * dying as we're holding a file ref here.
         */
        if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
                return -ENXIO;

        if (ctx->submitter_task && ctx->submitter_task != current)
                return -EEXIST;

        if (ctx->restricted) {
                opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
                if (!test_bit(opcode, ctx->restrictions.register_op))
                        return -EACCES;
        }

        switch (opcode) {
        case IORING_REGISTER_BUFFERS:
                ret = -EFAULT;
                if (!arg)
                        break;
                ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_BUFFERS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_buffers_unregister(ctx);
                break;
        case IORING_REGISTER_FILES:
                ret = -EFAULT;
                if (!arg)
                        break;
                ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_FILES:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_files_unregister(ctx);
                break;
        case IORING_REGISTER_FILES_UPDATE:
                ret = io_register_files_update(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_EVENTFD:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg, 0);
                break;
        case IORING_REGISTER_EVENTFD_ASYNC:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg, 1);
                break;
        case IORING_UNREGISTER_EVENTFD:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_eventfd_unregister(ctx);
                break;
        case IORING_REGISTER_PROBE:
                ret = -EINVAL;
                if (!arg || nr_args > 256)
                        break;
                ret = io_probe(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_personality(ctx);
                break;
        case IORING_UNREGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg)
                        break;
                ret = io_unregister_personality(ctx, nr_args);
                break;
        case IORING_REGISTER_ENABLE_RINGS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_enable_rings(ctx);
                break;
        case IORING_REGISTER_RESTRICTIONS:
                ret = io_register_restrictions(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_FILES2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_FILES_UPDATE2:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_BUFFERS2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_BUFFERS_UPDATE:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (!arg || !nr_args)
                        break;
                ret = io_register_iowq_aff(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_unregister_iowq_aff(ctx);
                break;
        case IORING_REGISTER_IOWQ_MAX_WORKERS:
                ret = -EINVAL;
                if (!arg || nr_args != 2)
                        break;
                ret = io_register_iowq_max_workers(ctx, arg);
                break;
        case IORING_REGISTER_RING_FDS:
                ret = io_ringfd_register(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_RING_FDS:
                ret = io_ringfd_unregister(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PBUF_RING:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_pbuf_ring(ctx, arg);
                break;
        case IORING_UNREGISTER_PBUF_RING:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_unregister_pbuf_ring(ctx, arg);
                break;
        case IORING_REGISTER_SYNC_CANCEL:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_sync_cancel(ctx, arg);
                break;
        case IORING_REGISTER_FILE_ALLOC_RANGE:
                ret = -EINVAL;
                if (!arg || nr_args)
                        break;
                ret = io_register_file_alloc_range(ctx, arg);
                break;
        case IORING_REGISTER_PBUF_STATUS:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_pbuf_status(ctx, arg);
                break;
        case IORING_REGISTER_NAPI:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_napi(ctx, arg);
                break;
        case IORING_UNREGISTER_NAPI:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_unregister_napi(ctx, arg);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
                void __user *, arg, unsigned int, nr_args)
{
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
        struct file *file;
        bool use_registered_ring;

        use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
        opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

        if (opcode >= IORING_REGISTER_LAST)
                return -EINVAL;

        if (use_registered_ring) {
                /*
                 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
                 * need only dereference our task private array to find it.
                 */
                struct io_uring_task *tctx = current->io_uring;

                if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
                        return -EINVAL;
                fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
                file = tctx->registered_rings[fd];
                if (unlikely(!file))
                        return -EBADF;
        } else {
                file = fget(fd);
                if (unlikely(!file))
                        return -EBADF;
                ret = -EOPNOTSUPP;
                if (!io_is_uring_fops(file))
                        goto out_fput;
        }

        ctx = file->private_data;

        mutex_lock(&ctx->uring_lock);
        ret = __io_uring_register(ctx, opcode, arg, nr_args);
        mutex_unlock(&ctx->uring_lock);
        trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
        if (!use_registered_ring)
                fput(file);
        return ret;
}
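
/*
 * Illustrative userspace sketch, not part of this file: once a ring fd has
 * been registered with IORING_REGISTER_RING_FDS, later register calls can
 * skip fget()/fput() by passing the registered index as 'fd' and OR-ing
 * IORING_REGISTER_USE_REGISTERED_RING into the opcode, as decoded at the top
 * of the syscall above. 'registered_index' and 'probe' are assumed to have
 * been set up by the caller.
 *
 *        int ret = syscall(__NR_io_uring_register, registered_index,
 *                          IORING_REGISTER_PROBE |
 *                          IORING_REGISTER_USE_REGISTERED_RING,
 *                          probe, 256);
 */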