// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

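/*
 * Register an eventfd that is signalled when completions are posted to the
 * CQ ring. The eventfd_async flag is stashed in the io_ev_fd so that
 * notifications can be limited to completions generated from async context.
 * Expects ctx->uring_lock to be held.
 */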
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

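/*
 * Drop a previously registered eventfd. The io_ev_fd is freed via RCU unless
 * an operation is still in flight, in which case the last user frees it.
 * Returns -ENXIO if no eventfd was registered.
 */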
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

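/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports by
 * filling in the struct io_uring_probe supplied by userspace.
 */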
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

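/*
 * Drop the credentials registered under @id, if any.
 */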
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

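/*
 * Register the current task's credentials and return an id that submissions
 * can later reference via sqe->personality.
 */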
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

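/*
 * IORING_REGISTER_RESTRICTIONS: install the set of allowed register opcodes,
 * SQE opcodes and SQE flags. Only permitted while the ring is still disabled
 * (IORING_SETUP_R_DISABLED), and only once per ring.
 */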
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

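/*
 * IORING_REGISTER_ENABLE_RINGS: enable a ring that was created with
 * IORING_SETUP_R_DISABLED, applying any registered restrictions and waking
 * the SQPOLL thread if it is waiting.
 */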
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

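/*
 * Apply @new_mask to the io-wq of either the current task or, for SQPOLL
 * rings, the SQPOLL thread. ctx->uring_lock is dropped around the SQPOLL
 * call.
 */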
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

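/*
 * IORING_REGISTER_IOWQ_AFF: copy a CPU mask from userspace (handling compat
 * bitmap layouts) and restrict io-wq workers to those CPUs.
 */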
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

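/* IORING_UNREGISTER_IOWQ_AFF: reset io-wq workers to the default affinity */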
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

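/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: cap the number of bounded/unbounded io-wq
 * workers. The previous limits are copied back to userspace, and for
 * non-SQPOLL rings the new limits are propagated to every attached task.
 */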
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}

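/*
 * Dispatch a single io_uring_register() opcode. Called with ctx->uring_lock
 * held; a few handlers temporarily drop it.
 */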
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

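/*
 * io_uring_register() syscall entry: resolve the ring file (either a normal
 * fd or, with IORING_REGISTER_USE_REGISTERED_RING, an index into the task's
 * registered ring array), then run the requested opcode under uring_lock.
 */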
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}