// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}
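
#if 0	/* Illustrative userspace sketch, not part of the kernel build. */
/*
 * A minimal example of exercising the IORING_REGISTER_EVENTFD opcode handled
 * above from userspace, assuming a ring fd obtained from io_uring_setup(2).
 * The helper name register_eventfd() and the raw syscall(2) usage are
 * illustrative only; applications normally go through liburing's
 * io_uring_register_eventfd() instead.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int register_eventfd(int ring_fd, int efd)
{
	/* arg points at a single __s32 eventfd descriptor; nr_args must be 1 */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_EVENTFD, &efd, 1);
}
#endif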