/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>

struct blk_mq_tags;
struct blk_flush_queue;

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t		lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head	dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long		state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work	run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t		cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int			next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many queue runs are left in the
	 * batch before switching to the next CPU.
	 */
	int			next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long		flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void			*sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue	*queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue	*fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void			*driver_data;

	/**
	 * @ctx_map: Bitmap with one bit per software queue. If a bit is set,
	 * there is a pending request in that software queue.
	 */
	struct sbitmap		ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx	*dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide whether the hardware queue is busy, based on an exponentially
	 * weighted moving average.
	 */
	unsigned int		dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short		type;
	/** @nr_ctx: Number of software queues. */
	unsigned short		nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx	**ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t		dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests on when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t	dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t		wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag from this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags	*tags;
	/**
	 * @sched_tags: Tags owned by the I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags	*sched_tags;

	/** @queued: Number of queued requests. */
	unsigned long		queued;
	/** @run: Number of dispatched requests. */
	unsigned long		run;
#define BLK_MQ_MAX_DISPATCH_ORDER	7
	/**
	 * @dispatched: Number of dispatch runs, bucketed by the number of
	 * requests dispatched in each run.
	 */
	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int		numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int		queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t		nr_active;

	/**
	 * @cpuhp_dead: List entry for the CPU hotplug 'dead' notifier, used to
	 * requeue requests from a software queue whose CPU has gone offline.
	 */
	struct hlist_node	cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject		kobj;

	/** @poll_considered: Count of how many times blk_poll() was called. */
	unsigned long		poll_considered;
	/** @poll_invoked: Count of how many requests blk_poll() polled. */
	unsigned long		poll_invoked;
	/** @poll_success: Count of how many polled requests were completed. */
	unsigned long		poll_success;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry		*debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry		*sched_debugfs_dir;
#endif

	/** @hctx_list: List of all hardware queues. */
	struct list_head	hctx_list;

	/**
	 * @srcu: Sleepable RCU. Used as a lock when the hardware queue is of
	 * the blocking type (BLK_MQ_F_BLOCKING). Must be the last member - see
	 * also blk_mq_hw_ctx_size().
	 */
	struct srcu_struct	srcu[0];
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};
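
/*
 * Example (illustrative sketch, not part of this header): a driver that
 * supports separate default and poll queues could fill its queue maps from
 * its ->map_queues() callback roughly as below, assuming set->nr_maps covers
 * HCTX_TYPE_POLL. The "my_*" names are hypothetical.
 *
 *	static int my_map_queues(struct blk_mq_tag_set *set)
 *	{
 *		struct blk_mq_queue_map *def = &set->map[HCTX_TYPE_DEFAULT];
 *		struct blk_mq_queue_map *poll = &set->map[HCTX_TYPE_POLL];
 *		int ret;
 *
 *		def->nr_queues = my_nr_irq_queues;
 *		def->queue_offset = 0;
 *		ret = blk_mq_map_queues(def);
 *		if (ret)
 *			return ret;
 *
 *		poll->nr_queues = my_nr_poll_queues;
 *		poll->queue_offset = def->nr_queues;
 *		return blk_mq_map_queues(poll);
 *	}
 */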

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
	unsigned int		nr_maps;
	const struct blk_mq_ops	*ops;
	unsigned int		nr_hw_queues;
	unsigned int		queue_depth;
	unsigned int		reserved_tags;
	unsigned int		cmd_size;
	int			numa_node;
	unsigned int		timeout;
	unsigned int		flags;
	void			*driver_data;

	struct blk_mq_tags	**tags;

	struct mutex		tag_list_lock;
	struct list_head	tag_list;
};
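
/*
 * Example (illustrative sketch, not part of this header): minimal tag set and
 * queue setup for a hypothetical single-queue driver. "mydev", "mydev_mq_ops"
 * and "struct mydev_cmd" are made-up names; blk_mq_init_sq_queue() below
 * covers this common case as a shortcut.
 *
 *	static int mydev_setup_queue(struct mydev *dev)
 *	{
 *		struct request_queue *q;
 *		int ret;
 *
 *		memset(&dev->tag_set, 0, sizeof(dev->tag_set));
 *		dev->tag_set.ops = &mydev_mq_ops;
 *		dev->tag_set.nr_hw_queues = 1;
 *		dev->tag_set.queue_depth = 128;
 *		dev->tag_set.numa_node = NUMA_NO_NODE;
 *		dev->tag_set.cmd_size = sizeof(struct mydev_cmd);
 *		dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 *		dev->tag_set.driver_data = dev;
 *
 *		ret = blk_mq_alloc_tag_set(&dev->tag_set);
 *		if (ret)
 *			return ret;
 *
 *		q = blk_mq_init_queue(&dev->tag_set);
 *		if (IS_ERR(q)) {
 *			blk_mq_free_tag_set(&dev->tag_set);
 *			return PTR_ERR(q);
 *		}
 *		dev->queue = q;
 *		return 0;
 *	}
 */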

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: True if this is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
		const struct blk_mq_queue_data *);
typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *,
		unsigned int, unsigned int);
typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *,
		unsigned int);

typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
		bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
typedef int (poll_fn)(struct blk_mq_hw_ctx *);
typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
typedef bool (busy_fn)(struct request_queue *);
typedef void (complete_fn)(struct request *);
typedef void (cleanup_rq_fn)(struct request *);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	queue_rq_fn		*queue_rq;

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. If errors stop
	 * us from issuing further requests, this hook serves the purpose of
	 * kicking the hardware (which the last request otherwise would have
	 * done).
	 */
	commit_rqs_fn		*commit_rqs;

	/**
	 * @get_budget: Reserve budget before queueing a request. Once
	 * .queue_rq() has run, it is the driver's responsibility to release
	 * the reserved budget. The failure case of .get_budget() must also be
	 * handled to avoid I/O deadlocks.
	 */
	get_budget_fn		*get_budget;
	/**
	 * @put_budget: Release the reserved budget.
	 */
	put_budget_fn		*put_budget;

	/**
	 * @timeout: Called on request timeout.
	 */
	timeout_fn		*timeout;

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	poll_fn			*poll;

	/**
	 * @complete: Mark the request as complete.
	 */
	complete_fn		*complete;

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	init_hctx_fn		*init_hctx;
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	exit_hctx_fn		*exit_hctx;

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * A tag greater than or equal to queue_depth is used for setting up
	 * the flush request.
	 */
	init_request_fn		*init_request;
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	exit_request_fn		*exit_request;

	/**
	 * @initialize_rq_fn: Called from inside blk_get_request().
	 */
	void (*initialize_rq_fn)(struct request *rq);

	/**
	 * @cleanup_rq: Called before freeing a request that has not completed
	 * yet, usually to free driver-private data.
	 */
	cleanup_rq_fn		*cleanup_rq;

	/**
	 * @busy: If set, returns whether or not this queue is currently busy.
	 */
	busy_fn			*busy;

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	map_queues_fn		*map_queues;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
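
/*
 * Example (illustrative sketch): a minimal ->queue_rq() implementation and
 * matching ops table for the hypothetical "mydev" driver from the tag set
 * example above. mydev_submit() and mydev_complete_rq() are made-up helpers,
 * and the queue's queuedata is assumed to point to the driver's device.
 *
 *	static blk_status_t mydev_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					   const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *		struct mydev_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *		struct mydev *dev = hctx->queue->queuedata;
 *
 *		blk_mq_start_request(rq);
 *		if (!mydev_submit(dev, cmd))
 *			return BLK_STS_RESOURCE;
 *		return BLK_STS_OK;
 *	}
 *
 *	static const struct blk_mq_ops mydev_mq_ops = {
 *		.queue_rq	= mydev_queue_rq,
 *		.complete	= mydev_complete_rq,
 *	};
 */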

enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_SHARED	= 1 << 1,
	BLK_MQ_F_BLOCKING	= 1 << 5,
	BLK_MQ_F_NO_SCHED	= 1 << 6,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,

	BLK_MQ_MAX_DEPTH	= 10240,

	BLK_MQ_CPU_WORK_BATCH	= 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q,
						  bool elevator_init);
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
					   const struct blk_mq_ops *ops,
					   unsigned int queue_depth,
					   unsigned int set_flags);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* allocate internal/sched tag */
	BLK_MQ_REQ_INTERNAL	= (__force blk_mq_req_flags_t)(1 << 2),
	/* set RQF_PREEMPT */
	BLK_MQ_REQ_PREEMPT	= (__force blk_mq_req_flags_t)(1 << 3),
};

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
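
/*
 * Example (illustrative sketch): allocating a driver-internal request from
 * the reserved pool without sleeping, e.g. for an abort or admin-style
 * command. This assumes the tag set was created with a non-zero
 * reserved_tags; "mydev" is the hypothetical driver from the examples above.
 *
 *	static int mydev_send_internal_cmd(struct mydev *dev)
 *	{
 *		struct request *rq;
 *
 *		rq = blk_mq_alloc_request(dev->queue, REQ_OP_DRV_OUT,
 *				BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
 *		if (IS_ERR(rq))
 *			return PTR_ERR(rq);
 *
 *		blk_execute_rq(dev->queue, NULL, rq, 0);
 *
 *		blk_mq_free_request(rq);
 *		return 0;
 *	}
 */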

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
bool blk_mq_complete_request(struct request *rq);
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
			   struct bio *bio, unsigned int nr_segs);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be converted
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be converted
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add one request
 * size to get the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
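
/*
 * Example (illustrative sketch): with cmd_size set to sizeof(struct
 * mydev_cmd) in the tag set, the per-request driver data sits directly behind
 * the request, so a completion path that only has the command can recover the
 * request with blk_mq_rq_from_pdu(). "mydev_cmd" is a made-up type.
 *
 *	static void mydev_handle_completion(struct mydev_cmd *cmd)
 *	{
 *		struct request *rq = blk_mq_rq_from_pdu(cmd);
 *
 *		blk_mq_complete_request(rq);
 *	}
 */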

#define queue_for_each_hw_ctx(q, hctx, i)				\
	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
		struct request *rq)
{
	if (rq->tag != -1)
		return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);

	return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
			BLK_QC_T_INTERNAL;
}

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

#endif