#ifndef IO_URING_TYPES_H
#define IO_URING_TYPES_H

#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>

struct io_wq_work_node {
	struct io_wq_work_node *next;
};

struct io_wq_work_list {
	struct io_wq_work_node *first;
	struct io_wq_work_node *last;
};

struct io_wq_work {
	struct io_wq_work_node list;
	unsigned flags;
	/* place it here instead of io_kiocb as it fills padding and saves 4B */
	int cancel_seq;
};

struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};

struct io_file_table {
	struct io_fixed_file *files;
	unsigned long *bitmap;
	unsigned int alloc_hint;
};

struct io_hash_bucket {
	spinlock_t lock;
	struct hlist_head list;
} ____cacheline_aligned_in_smp;

struct io_hash_table {
	struct io_hash_bucket *hbs;
	unsigned hash_bits;
};

/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int cached_refs;
	const struct io_ring_ctx *last;
	struct io_wq *io_wq;
	struct file *registered_rings[IO_RINGFD_REG_MAX];

	struct xarray xa;
	struct wait_queue_head wait;
	atomic_t in_idle;
	atomic_t inflight_tracked;
	struct percpu_counter inflight;

	struct { /* task_work */
		struct llist_head task_list;
		struct callback_head task_work;
	} ____cacheline_aligned_in_smp;
};

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls the head of the sq ring and the tail of the
	 * cq ring; the application controls the tail of the sq ring and
	 * the head of the cq ring.
	 */
	struct io_uring sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32 sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32 sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to an
	 * invalid index being stored in the array.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value has been read by the application,
	 * this counter includes all submissions that were dropped before
	 * reaching the new SQ head (and possibly more).
	 */
	u32 sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	atomic_t sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32 cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order, this counter is not
	 * ordered with any other data.
	 */
	u32 cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
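
/*
 * Illustrative sketch only -- io_rings_cqe_at() is a hypothetical helper,
 * not part of the kernel. The head/tail counters above are free-running,
 * so a consumer must mask them with cq_ring_mask before indexing cqes[].
 */
static inline struct io_uring_cqe *io_rings_cqe_at(struct io_rings *rings,
						   u32 idx)
{
	/* the mask equals cq_ring_entries - 1; entries are a power of 2 */
	return &rings->cqes[idx & rings->cq_ring_mask];
}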

/* opcode/flag restrictions, registered via io_uring_register(2); see the
 * checking sketch below
 */
struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

struct io_submit_link {
	struct io_kiocb *head;
	struct io_kiocb *last;
};

struct io_submit_state {
	/* inline/task_work completion list, under ->uring_lock */
	struct io_wq_work_node free_list;
	/* batch completion logic */
	struct io_wq_work_list compl_reqs;
	struct io_submit_link link;

	bool plug_started;
	bool need_plug;
	unsigned short submit_nr;
	struct blk_plug plug;
};

struct io_ev_fd {
	struct eventfd_ctx *cq_ev_fd;
	unsigned int eventfd_async: 1;
	struct rcu_head rcu;
	atomic_t refs;
	atomic_t ops;
};

struct io_alloc_cache {
	struct hlist_head list;
	unsigned int nr_cached;
};
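
/*
 * Illustrative sketch only -- io_restriction_allows() is a hypothetical
 * helper, not the kernel's internal one. It shows how the fields of
 * struct io_restriction above are meant to gate each SQE once
 * restrictions have been registered.
 */
static inline bool io_restriction_allows(const struct io_restriction *r,
					 u8 opcode, u8 sqe_flags)
{
	if (!r->registered)
		return true;
	if (!test_bit(opcode, r->sqe_op))
		return false;
	/* every required flag must be present... */
	if ((sqe_flags & r->sqe_flags_required) != r->sqe_flags_required)
		return false;
	/* ...and nothing outside the allowed/required sets may be set */
	return !(sqe_flags & ~(r->sqe_flags_allowed | r->sqe_flags_required));
}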

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		struct percpu_ref refs;

		struct io_rings *rings;
		unsigned int flags;
		enum task_work_notify_mode notify_method;
		unsigned int compat: 1;
		unsigned int drain_next: 1;
		unsigned int restricted: 1;
		unsigned int off_timeout_used: 1;
		unsigned int drain_active: 1;
		unsigned int drain_disabled: 1;
		unsigned int has_evfd: 1;
		unsigned int syscall_iopoll: 1;
	} ____cacheline_aligned_in_smp;

	/* submission data */
	struct {
		struct mutex uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the
		 * entries array. A lookup sketch follows this struct.
		 */
		u32 *sq_array;
		struct io_uring_sqe *sq_sqes;
		unsigned cached_sq_head;
		unsigned sq_entries;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		struct io_rsrc_node *rsrc_node;
		int rsrc_cached_refs;
		atomic_t cancel_seq;
		struct io_file_table file_table;
		unsigned nr_user_files;
		unsigned nr_user_bufs;
		struct io_mapped_ubuf **user_bufs;

		struct io_submit_state submit_state;

		struct io_buffer_list *io_bl;
		struct xarray io_bl_xa;
		struct list_head io_buffers_cache;

		struct io_hash_table cancel_table_locked;
		struct list_head cq_overflow_list;
		struct io_alloc_cache apoll_cache;
		struct io_alloc_cache netmsg_cache;
	} ____cacheline_aligned_in_smp;

	/* IRQ completion list, under ->completion_lock */
	struct io_wq_work_list locked_free_list;
	unsigned int locked_free_nr;

	const struct cred *sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data *sq_data;	/* if using sq thread polling */

	struct wait_queue_head sqo_sq_wait;
	struct list_head sqd_list;

	unsigned long check_cq;

	unsigned int file_alloc_start;
	unsigned int file_alloc_end;

	struct xarray personalities;
	u32 pers_next;

	struct {
		/*
		 * We cache a range of free CQEs we can use, once exhausted it
		 * should go through a slower range setup, see __io_get_cqe()
		 */
		struct io_uring_cqe *cqe_cached;
		struct io_uring_cqe *cqe_sentinel;

		unsigned cached_cq_tail;
		unsigned cq_entries;
		struct io_ev_fd __rcu *io_ev_fd;
		struct wait_queue_head cq_wait;
		unsigned cq_extra;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t completion_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct io_wq_work_list iopoll_list;
		struct io_hash_table cancel_table;
		bool poll_multi_queue;

		struct llist_head work_llist;

		struct list_head io_buffers_comp;
	} ____cacheline_aligned_in_smp;

	/* timeouts */
	struct {
		spinlock_t timeout_lock;
		atomic_t cq_timeouts;
		struct list_head timeout_list;
		struct list_head ltimeout_list;
		unsigned cq_last_tm_flush;
	} ____cacheline_aligned_in_smp;

	/* Keep this last, we don't need it for the fast path */

	struct io_restriction restrictions;
	struct task_struct *submitter_task;

	/* slow path rsrc auxiliary data, used by update/register */
	struct io_rsrc_node *rsrc_backup_node;
	struct io_mapped_ubuf *dummy_ubuf;
	struct io_rsrc_data *file_data;
	struct io_rsrc_data *buf_data;

	struct delayed_work rsrc_put_work;
	struct llist_head rsrc_put_llist;
	struct list_head rsrc_ref_list;
	spinlock_t rsrc_ref_lock;

	struct list_head io_buffers_pages;

#if defined(CONFIG_UNIX)
	struct socket *ring_sock;
#endif
	/* hashed buffered write serialization */
	struct io_wq_hash *hash_map;

	/* Only used for accounting purposes */
	struct user_struct *user;
	struct mm_struct *mm_account;

	/* ctx exit and cancelation */
	struct llist_head fallback_llist;
	struct delayed_work fallback_work;
	struct work_struct exit_work;
	struct list_head tctx_list;
	struct completion ref_comp;

	/* io-wq management, e.g. thread count */
	u32 iowq_limits[2];
	bool iowq_limits_set;

	struct list_head defer_list;
	unsigned sq_thread_idle;
	/* protected by ->completion_lock */
	unsigned evfd_last_cq_tail;
};
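
/*
 * Illustrative sketch only -- io_sqe_at() is a hypothetical helper, not
 * the kernel's actual submission path. It walks the sq_array indirection
 * documented in struct io_ring_ctx above: the free-running head is masked
 * down to a valid slot first, and an out-of-range index found in the slot
 * is what bumps rings->sq_dropped.
 */
static inline const struct io_uring_sqe *io_sqe_at(struct io_ring_ctx *ctx,
						   unsigned int head)
{
	u32 idx = READ_ONCE(ctx->sq_array[head & (ctx->sq_entries - 1)]);

	if (unlikely(idx >= ctx->sq_entries))
		return NULL;
	return &ctx->sq_sqes[idx];
}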

enum {
	REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
	REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_BIT = 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_BUFFER_RING_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_CREDS_BIT,
	REQ_F_REFCOUNT_BIT,
	REQ_F_ARM_LTIMEOUT_BIT,
	REQ_F_ASYNC_DATA_BIT,
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_PARTIAL_IO_BIT,
	REQ_F_CQE32_INIT_BIT,
	REQ_F_APOLL_MULTISHOT_BIT,
	REQ_F_CLEAR_POLLIN_BIT,
	REQ_F_HASH_LOCKED_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,

	/* not a real bit, just to check we're not overflowing the space
	 * (see the check below the flag definitions)
	 */
	__REQ_F_LAST_BIT,
};

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK = BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
	/* IOSQE_CQE_SKIP_SUCCESS */
	REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),

	/* fail rest of links */
	REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* buffer selected from ring, needs commit */
	REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
	/* supports async reads/writes */
	REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
	/* regular file */
	REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
	/* skip refcounting if not set */
	REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
	/* there is a linked timeout that has to be armed */
	REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
	/* ->async_data allocated */
	REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
	/* don't post CQEs while failing linked requests */
	REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
	/* single poll may be active */
	REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
	/* double poll may be active */
	REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
	/* request has already done partial IO */
	REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
	/* fast poll multishot mode */
	REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
	/* ->extra1 and ->extra2 are initialised */
	REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
	/* recvmsg special flag, clear EPOLLIN */
	REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
	/* hashed into ->cancel_table_locked, protected by ->uring_lock */
	REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
};
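
/*
 * Illustrative check only -- the kernel performs an equivalent
 * BUILD_BUG_ON() at init time. __REQ_F_LAST_BIT exists purely so the
 * build can verify that the flags above still fit into the 32-bit
 * io_kiocb->flags field.
 */
static_assert(__REQ_F_LAST_BIT <= 8 * sizeof(unsigned int));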

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

struct io_task_work {
	struct llist_node node;
	io_req_tw_func_t func;
};

struct io_cqe {
	__u64 user_data;
	__s32 res;
	/* fd initially, then cflags for completion */
	union {
		__u32 flags;
		int fd;
	};
};

/*
 * Each request type overlays its private data structure on top of this one.
 * They must not exceed this one in size. A usage sketch follows struct
 * io_kiocb below.
 */
struct io_cmd_data {
	struct file *file;
	/* each command gets 56 bytes of data */
	__u8 data[56];
};

static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
{
	BUILD_BUG_ON(cmd_sz > sizeof(struct io_cmd_data));
}
#define io_kiocb_to_cmd(req, cmd_type) ( \
	io_kiocb_cmd_sz_check(sizeof(cmd_type)), \
	((cmd_type *)&(req)->cmd) \
)
#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *)(ptr))

struct io_kiocb {
	union {
		/*
		 * NOTE! Each of the io_kiocb union members has the file pointer
		 * as the first entry in their struct definition. So you can
		 * access the file pointer through any of the sub-structs,
		 * or directly as just 'file' in this struct.
		 */
		struct file *file;
		struct io_cmd_data cmd;
	};

	u8 opcode;
	/* polled IO has completed */
	u8 iopoll_completed;
	/*
	 * Can be either a fixed buffer index, or used with provided buffers.
	 * For the latter, before issue it points to the buffer group ID,
	 * and after selection it points to the buffer ID itself.
	 */
	u16 buf_index;
	unsigned int flags;

	struct io_cqe cqe;

	struct io_ring_ctx *ctx;
	struct task_struct *task;

	struct io_rsrc_node *rsrc_node;

	union {
		/* store used ubuf, so we can prevent reloading */
		struct io_mapped_ubuf *imu;

		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
		struct io_buffer *kbuf;

		/*
		 * stores buffer ID for ring provided buffers, valid IFF
		 * REQ_F_BUFFER_RING is set.
		 */
		struct io_buffer_list *buf_list;
	};

	union {
		/* used by request caches, completion batching and iopoll */
		struct io_wq_work_node comp_list;
		/* cache ->apoll->events */
		__poll_t apoll_events;
	};
	atomic_t refs;
	atomic_t poll_refs;
	struct io_task_work io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	union {
		struct hlist_node hash_node;
		struct {
			u64 extra1;
			u64 extra2;
		};
	};
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll *apoll;
	/* opcode allocated if it needs to store data for async defer */
	void *async_data;
	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK is set */
	struct io_kiocb *link;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred *creds;
	struct io_wq_work work;
};
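
/*
 * Illustrative sketch only -- "io_demo_cmd" is a hypothetical command
 * type, not a real io_uring opcode. It shows the io_kiocb_to_cmd() /
 * cmd_to_io_kiocb() round trip promised above: the per-opcode struct
 * overlays req->cmd, keeps the file pointer as its first member (see the
 * NOTE in io_kiocb), and io_kiocb_cmd_sz_check() breaks the build if it
 * outgrows io_cmd_data.
 */
struct io_demo_cmd {
	struct file *file;	/* must stay the first member */
	__u32 len;
};

static inline struct io_kiocb *io_demo_cmd_round_trip(struct io_kiocb *req)
{
	struct io_demo_cmd *demo = io_kiocb_to_cmd(req, struct io_demo_cmd);

	/* the cmd overlay and the request are the same allocation */
	return cmd_to_io_kiocb(demo);
}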

struct io_overflow_cqe {
	struct list_head list;
	struct io_uring_cqe cqe;
};

#endif