/*
 * xref: /illumos-gate/usr/src/uts/common/io/nvme/nvme_var.h
 * (revision 50d757e7)
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2019 Unix Software Ltd.
 * Copyright 2024 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

21 #ifndef _NVME_VAR_H
22 #define	_NVME_VAR_H
23 
24 #include <sys/ddi.h>
25 #include <sys/sunddi.h>
26 #include <sys/blkdev.h>
27 #include <sys/taskq_impl.h>
28 #include <sys/list.h>
29 #include <sys/ddi_ufm.h>
30 #include <nvme_common.h>
31 
32 /*
33  * NVMe driver state
34  */
35 
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39 
/*
 * Progress flags, one bit each, kept in the n_progress member of nvme_t.
 *
 * NOTE(review): presumably each bit records that the matching
 * initialization step completed, so that teardown only undoes what was
 * actually set up -- confirm against the attach/detach code.
 */
typedef enum {
	NVME_PCI_CONFIG			= 1 << 0,
	NVME_FMA_INIT			= 1 << 1,
	NVME_REGS_MAPPED		= 1 << 2,
	NVME_ADMIN_QUEUE		= 1 << 3,
	NVME_CTRL_LIMITS		= 1 << 4,
	NVME_INTERRUPTS			= 1 << 5,
	NVME_UFM_INIT			= 1 << 6,
	NVME_MUTEX_INIT			= 1 << 7,
	NVME_MGMT_INIT			= 1 << 8
} nvme_progress_t;

/*
 * Per-namespace progress flags, kept in the ns_progress member of
 * nvme_namespace_t.
 *
 * NOTE(review): NVME_NS_LOCK presumably means the namespace's ns_lock has
 * been initialized -- confirm against the namespace setup code.
 */
typedef enum {
	NVME_NS_LOCK	= 1 << 0
} nvme_ns_progress_t;

/*
 * Controller quirk flags, one bit each, kept in the n_quirks member of
 * nvme_t.
 */
typedef enum {
	/*
	 * The controller fails to properly process commands on the admin queue
	 * if the first one has CID 0. Subsequent use of CID 0 doesn't present
	 * a problem.
	 */
	NVME_QUIRK_START_CID		= 1 << 0,
} nvme_quirk_t;

/*
 * Minimum and default values for driver settings.
 *
 * NOTE(review): these presumably bound or seed the n_admin_queue_len,
 * n_io_squeue_len / n_io_cqueue_len, n_async_event_limit and
 * n_min_block_size members of nvme_t -- confirm against the code that
 * reads the driver properties.
 */
#define	NVME_MIN_ADMIN_QUEUE_LEN	16
#define	NVME_MIN_IO_QUEUE_LEN		16
#define	NVME_DEFAULT_ADMIN_QUEUE_LEN	256
#define	NVME_DEFAULT_IO_QUEUE_LEN	1024
#define	NVME_DEFAULT_ASYNC_EVENT_LIMIT	10
#define	NVME_MIN_ASYNC_EVENT_LIMIT	1
#define	NVME_DEFAULT_MIN_BLOCK_SIZE	512


/*
 * Forward declarations of the driver's structure types. The structures
 * defined later in this header refer to one another by pointer, so the
 * typedef names have to exist before the first definition.
 */
typedef struct nvme nvme_t;
typedef struct nvme_namespace nvme_namespace_t;
typedef struct nvme_minor nvme_minor_t;
typedef struct nvme_lock nvme_lock_t;
typedef struct nvme_minor_lock_info nvme_minor_lock_info_t;
typedef struct nvme_dma nvme_dma_t;
typedef struct nvme_cmd nvme_cmd_t;
typedef struct nvme_cq nvme_cq_t;
typedef struct nvme_qpair nvme_qpair_t;
typedef struct nvme_task_arg nvme_task_arg_t;

/*
 * These states represent the minor's perspective. That is, of a minor's
 * namespace and controller lock, where is it? The state is kept per lock in
 * the nli_state member of nvme_minor_lock_info_t.
 */
typedef enum {
	NVME_LOCK_STATE_UNLOCKED	= 0,
	NVME_LOCK_STATE_BLOCKED,
	NVME_LOCK_STATE_ACQUIRED
} nvme_minor_lock_state_t;

95 struct nvme_minor_lock_info {
96 	list_node_t nli_node;
97 	nvme_lock_t *nli_lock;
98 	nvme_minor_lock_state_t nli_state;
99 	nvme_lock_level_t nli_curlevel;
100 	/*
101 	 * While the minor points back to itself and the nvme_t should always
102 	 * point to the current controller, the namespace should only point to
103 	 * one if this is a particular namespace lock. The former two are
104 	 * initialized at minor initialization time.
105 	 */
106 	nvme_minor_t *nli_minor;
107 	nvme_t *nli_nvme;
108 	nvme_namespace_t *nli_ns;
109 	/*
110 	 * This is the common ioctl information that should be filled in when
111 	 * we're being woken up for any reason other than an interrupted signal.
112 	 * This should only be set while blocking.
113 	 */
114 	nvme_ioctl_common_t *nli_ioc;
115 	/*
116 	 * The following are provided for debugging purposes. In particular,
117 	 * information like the kthread_t and related that performed this should
118 	 * be considered suspect as it represents who took the operation, not
119 	 * who performed the operation (unless we're actively blocking).
120 	 */
121 	hrtime_t nli_last_change;
122 	uintptr_t nli_acq_kthread;
123 	pid_t nli_acq_pid;
124 };
125 
126 struct nvme_minor {
127 	/*
128 	 * The following three fields are set when this is created.
129 	 */
130 	id_t nm_minor;
131 	nvme_t *nm_ctrl;
132 	nvme_namespace_t *nm_ns;
133 	/*
134 	 * This link is used to index this minor on the global list of active
135 	 * open-related minors. This is only manipulated under the
136 	 * nvme_open_minors_mutex.
137 	 */
138 	avl_node_t nm_avl;
139 	/*
140 	 * Information related to locking. Note, there is no pointer to a locked
141 	 * controller as the only one can be the one specified here. This data
142 	 * is protected by the controller's n_minor_mutex.
143 	 */
144 	kcondvar_t nm_cv;
145 	nvme_minor_lock_info_t nm_ctrl_lock;
146 	nvme_minor_lock_info_t nm_ns_lock;
147 };
148 
149 struct nvme_lock {
150 	nvme_minor_lock_info_t *nl_writer;
151 	list_t nl_readers;
152 	list_t nl_pend_readers;
153 	list_t nl_pend_writers;
154 	/*
155 	 * The following are stats to indicate how often certain locking
156 	 * activities have occurred for debugging purposes.
157 	 */
158 	uint32_t nl_nwrite_locks;
159 	uint32_t nl_nread_locks;
160 	uint32_t nl_npend_writes;
161 	uint32_t nl_npend_reads;
162 	uint32_t nl_nnonblock;
163 	uint32_t nl_nsignals;
164 	uint32_t nl_nsig_unlock;
165 	uint32_t nl_nsig_blocks;
166 	uint32_t nl_nsig_acq;
167 };
168 
169 struct nvme_dma {
170 	ddi_dma_handle_t nd_dmah;
171 	ddi_acc_handle_t nd_acch;
172 	ddi_dma_cookie_t nd_cookie;
173 	uint_t nd_ncookie;
174 	caddr_t nd_memp;
175 	size_t nd_len;
176 	boolean_t nd_cached;
177 };
178 
179 struct nvme_cmd {
180 	struct list_node nc_list;
181 
182 	nvme_sqe_t nc_sqe;
183 	nvme_cqe_t nc_cqe;
184 
185 	void (*nc_callback)(void *);
186 	bd_xfer_t *nc_xfer;
187 	boolean_t nc_completed;
188 	boolean_t nc_dontpanic;
189 	uint16_t nc_sqid;
190 
191 	nvme_dma_t *nc_dma;
192 	nvme_dma_t *nc_prp; /* DMA for PRP lists */
193 
194 	kmutex_t nc_mutex;
195 	kcondvar_t nc_cv;
196 
197 	taskq_ent_t nc_tqent;
198 	nvme_t *nc_nvme;
199 };
200 
201 struct nvme_cq {
202 	size_t ncq_nentry;
203 	uint16_t ncq_id;
204 
205 	nvme_dma_t *ncq_dma;
206 	nvme_cqe_t *ncq_cq;
207 	uint_t ncq_head;
208 	uint_t ncq_tail;
209 	uintptr_t ncq_hdbl;
210 	int ncq_phase;
211 
212 	taskq_t *ncq_cmd_taskq;
213 
214 	kmutex_t ncq_mutex;
215 };
216 
217 struct nvme_qpair {
218 	size_t nq_nentry;
219 
220 	/* submission fields */
221 	nvme_dma_t *nq_sqdma;
222 	nvme_sqe_t *nq_sq;
223 	uint_t nq_sqhead;
224 	uint_t nq_sqtail;
225 	uintptr_t nq_sqtdbl;
226 
227 	/* completion */
228 	nvme_cq_t *nq_cq;
229 
230 	/* shared structures for completion and submission */
231 	nvme_cmd_t **nq_cmd;	/* active command array */
232 	uint16_t nq_next_cmd;	/* next potential empty queue slot */
233 	uint_t nq_active_cmds;	/* number of active cmds */
234 
235 	kmutex_t nq_mutex;	/* protects shared state */
236 	ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */
237 };
238 
239 typedef struct nvme_mgmt_lock {
240 	kmutex_t nml_lock;
241 	kcondvar_t nml_cv;
242 	uintptr_t nml_bd_own;
243 } nvme_mgmt_lock_t;
244 
245 struct nvme {
246 	dev_info_t *n_dip;
247 	nvme_progress_t n_progress;
248 	nvme_quirk_t n_quirks;
249 
250 	caddr_t n_regs;
251 	ddi_acc_handle_t n_regh;
252 
253 	kmem_cache_t *n_cmd_cache;
254 	kmem_cache_t *n_prp_cache;
255 
256 	size_t n_inth_sz;
257 	ddi_intr_handle_t *n_inth;
258 	int n_intr_cnt;
259 	uint_t n_intr_pri;
260 	int n_intr_cap;
261 	int n_intr_type;
262 	int n_intr_types;
263 
264 	ddi_acc_handle_t n_pcicfg_handle;
265 	uint16_t n_vendor_id;
266 	uint16_t n_device_id;
267 	uint16_t n_subsystem_vendor_id;
268 	uint16_t n_subsystem_device_id;
269 	uint8_t n_revision_id;
270 
271 	char *n_product;
272 	char *n_vendor;
273 
274 	nvme_version_t n_version;
275 	boolean_t n_dead;
276 	nvme_ioctl_errno_t n_dead_status;
277 	taskq_ent_t n_dead_tqent;
278 	boolean_t n_strict_version;
279 	boolean_t n_ignore_unknown_vendor_status;
280 	uint32_t n_admin_queue_len;
281 	uint32_t n_io_squeue_len;
282 	uint32_t n_io_cqueue_len;
283 	uint16_t n_async_event_limit;
284 	uint_t n_min_block_size;
285 	uint16_t n_abort_command_limit;
286 	uint64_t n_max_data_transfer_size;
287 	boolean_t n_write_cache_present;
288 	boolean_t n_write_cache_enabled;
289 	int n_error_log_len;
290 	boolean_t n_async_event_supported;
291 	int n_submission_queues;
292 	int n_completion_queues;
293 
294 	int n_nssr_supported;
295 	int n_doorbell_stride;
296 	int n_timeout;
297 	int n_arbitration_mechanisms;
298 	int n_cont_queues_reqd;
299 	int n_max_queue_entries;
300 	int n_pageshift;
301 	int n_pagesize;
302 
303 	uint32_t n_namespace_count;
304 	uint_t n_namespaces_attachable;
305 	uint_t n_ioq_count;
306 	uint_t n_cq_count;
307 
308 	/*
309 	 * This is cached identify controller and common namespace data that
310 	 * exists in the system. This generally can be used in the kernel;
311 	 * however, we have to be careful about what we use here because these
312 	 * values are not refreshed after attach. Therefore these are good for
313 	 * answering the question what does the controller support or what is in
314 	 * the common namespace information, but not otherwise. That means you
315 	 * shouldn't use this to try to answer how much capacity is still in the
316 	 * controller because this information is just cached.
317 	 */
318 	nvme_identify_ctrl_t *n_idctl;
319 	nvme_identify_nsid_t *n_idcomns;
320 
321 	/* Pointer to the admin queue, which is always queue 0 in n_ioq. */
322 	nvme_qpair_t *n_adminq;
323 	/*
324 	 * All command queues, including the admin queue.
325 	 * Its length is: n_ioq_count + 1.
326 	 */
327 	nvme_qpair_t **n_ioq;
328 	nvme_cq_t **n_cq;
329 
330 	nvme_namespace_t *n_ns;
331 
332 	ddi_dma_attr_t n_queue_dma_attr;
333 	ddi_dma_attr_t n_prp_dma_attr;
334 	ddi_dma_attr_t n_sgl_dma_attr;
335 	ddi_device_acc_attr_t n_reg_acc_attr;
336 	ddi_iblock_cookie_t n_fm_ibc;
337 	int n_fm_cap;
338 
339 	ksema_t n_abort_sema;
340 
341 	/* protects namespace management operations */
342 	nvme_mgmt_lock_t n_mgmt;
343 
344 	/*
345 	 * This lock protects the minor node locking state across the controller
346 	 * and all related namespaces.
347 	 */
348 	kmutex_t n_minor_mutex;
349 	nvme_lock_t n_lock;
350 
351 	/* errors detected by driver */
352 	uint32_t n_dma_bind_err;
353 	uint32_t n_abort_failed;
354 	uint32_t n_cmd_timeout;
355 	uint32_t n_cmd_aborted;
356 	uint32_t n_wrong_logpage;
357 	uint32_t n_unknown_logpage;
358 	uint32_t n_too_many_cookies;
359 	uint32_t n_unknown_cid;
360 
361 	/* errors detected by hardware */
362 	uint32_t n_data_xfr_err;
363 	uint32_t n_internal_err;
364 	uint32_t n_abort_rq_err;
365 	uint32_t n_abort_sq_del;
366 	uint32_t n_nvm_cap_exc;
367 	uint32_t n_nvm_ns_notrdy;
368 	uint32_t n_nvm_ns_formatting;
369 	uint32_t n_inv_cq_err;
370 	uint32_t n_inv_qid_err;
371 	uint32_t n_max_qsz_exc;
372 	uint32_t n_inv_int_vect;
373 	uint32_t n_inv_log_page;
374 	uint32_t n_inv_format;
375 	uint32_t n_inv_q_del;
376 	uint32_t n_cnfl_attr;
377 	uint32_t n_inv_prot;
378 	uint32_t n_readonly;
379 
380 	/* errors reported by asynchronous events */
381 	uint32_t n_diagfail_event;
382 	uint32_t n_persistent_event;
383 	uint32_t n_transient_event;
384 	uint32_t n_fw_load_event;
385 	uint32_t n_reliability_event;
386 	uint32_t n_temperature_event;
387 	uint32_t n_spare_event;
388 	uint32_t n_vendor_event;
389 	uint32_t n_notice_event;
390 	uint32_t n_unknown_event;
391 
392 	/* hot removal NDI event handling */
393 	ddi_eventcookie_t n_rm_cookie;
394 	ddi_callback_id_t n_ev_rm_cb_id;
395 
396 	/* DDI UFM handle */
397 	ddi_ufm_handle_t *n_ufmh;
398 	/* Cached Firmware Slot Information log page */
399 	nvme_fwslot_log_t *n_fwslot;
400 	/* Lock protecting the cached firmware slot info */
401 	kmutex_t n_fwslot_mutex;
402 };
403 
404 struct nvme_namespace {
405 	nvme_t *ns_nvme;
406 	nvme_ns_progress_t ns_progress;
407 	uint8_t ns_eui64[8];
408 	uint8_t	ns_nguid[16];
409 	char	ns_name[11];
410 
411 	bd_handle_t ns_bd_hdl;
412 
413 	uint32_t ns_id;
414 	size_t ns_block_count;
415 	size_t ns_block_size;
416 	size_t ns_best_block_size;
417 
418 	boolean_t ns_allocated;
419 	boolean_t ns_active;
420 	boolean_t ns_ignore;
421 	boolean_t ns_attached;
422 
423 	nvme_identify_nsid_t *ns_idns;
424 
425 	/*
426 	 * Namespace lock, see the theory statement for more information.
427 	 */
428 	nvme_lock_t ns_lock;
429 
430 	/*
431 	 * If a namespace has neither NGUID nor EUI64, we create a devid in
432 	 * nvme_prepare_devid().
433 	 */
434 	char *ns_devid;
435 };
436 
437 struct nvme_task_arg {
438 	nvme_t *nt_nvme;
439 	nvme_cmd_t *nt_cmd;
440 };
441 
/*
 * The kind of exclusive access an ioctl requires. This is the type of the
 * nck_excl member of nvme_ioctl_check_t.
 */
typedef enum {
	/*
	 * This indicates that there is no exclusive access required for this
	 * operation. However, this operation will fail if someone attempts to
	 * perform this operation and someone else holds a write lock.
	 */
	NVME_IOCTL_EXCL_NONE	= 0,
	/*
	 * This indicates that a write lock is required to perform the
	 * operation.
	 */
	NVME_IOCTL_EXCL_WRITE,
	/*
	 * This indicates that the exclusive check should be skipped. The only
	 * case this should be used in is the lock and unlock ioctls as they
	 * should be able to proceed even when the controller is being used
	 * exclusively.
	 */
	NVME_IOCTL_EXCL_SKIP
} nvme_ioctl_excl_t;

463 /*
464  * This structure represents the set of checks that we apply to ioctl's using
465  * the nvme_ioctl_common_t structure as part of validation.
466  */
467 typedef struct nvme_ioctl_check {
468 	/*
469 	 * This indicates whether or not the command in question allows a
470 	 * namespace to be specified at all. If this is false, a namespace minor
471 	 * cannot be used and a controller minor must leave the nsid set to
472 	 * zero.
473 	 */
474 	boolean_t nck_ns_ok;
475 	/*
476 	 * This indicates that a minor node corresponding to a namespace is
477 	 * allowed to issue this.
478 	 */
479 	boolean_t nck_ns_minor_ok;
480 	/*
481 	 * This indicates that the controller should be skipped from all of the
482 	 * following processing behavior. That is, it's allowed to specify
483 	 * whatever it wants in the nsid field, regardless if it is valid or
484 	 * not. This is required for some of the Identify Command options that
485 	 * list endpoints. This should generally not be used and the driver
486 	 * should still validate the nuance here.
487 	 */
488 	boolean_t nck_skip_ctrl;
489 	/*
490 	 * This indicates that if we're on the controller's minor and we don't
491 	 * have an explicit namespace ID (i.e. 0), should the namespace be
492 	 * rewritten to be the broadcast namespace.
493 	 */
494 	boolean_t nck_ctrl_rewrite;
495 	/*
496 	 * This indicates whether or not the broadcast NSID is acceptable for
497 	 * the controller node.
498 	 */
499 	boolean_t nck_bcast_ok;
500 
501 	/*
502 	 * This indicates to the lock checking code what kind of exclusive
503 	 * access is required. This check occurs after any namespace rewriting
504 	 * has occurred. When looking at exclusivity, a broadcast namespace or
505 	 * namespace 0 indicate that the controller is the target, otherwise the
506 	 * target namespace will be checked for a write lock.
507 	 */
508 	nvme_ioctl_excl_t nck_excl;
509 } nvme_ioctl_check_t;
510 
511 /*
512  * Constants
513  */
514 extern uint_t nvme_vendor_specific_admin_cmd_max_timeout;
515 extern uint32_t nvme_vendor_specific_admin_cmd_size;
516 
517 /*
518  * Common functions.
519  */
520 extern nvme_namespace_t *nvme_nsid2ns(nvme_t *, uint32_t);
521 extern boolean_t nvme_ioctl_error(nvme_ioctl_common_t *, nvme_ioctl_errno_t,
522     uint32_t, uint32_t);
523 extern boolean_t nvme_ctrl_atleast(nvme_t *, const nvme_version_t *);
524 extern void nvme_ioctl_success(nvme_ioctl_common_t *);
525 
526 /*
527  * Validation related functions and kernel tunable limits.
528  */
529 extern boolean_t nvme_validate_logpage(nvme_t *, nvme_ioctl_get_logpage_t *);
530 extern boolean_t nvme_validate_identify(nvme_t *, nvme_ioctl_identify_t *,
531     boolean_t);
532 extern boolean_t nvme_validate_get_feature(nvme_t *,
533     nvme_ioctl_get_feature_t *);
534 extern boolean_t nvme_validate_vuc(nvme_t *, nvme_ioctl_passthru_t *);
535 extern boolean_t nvme_validate_format(nvme_t *, nvme_ioctl_format_t *);
536 extern boolean_t nvme_validate_fw_load(nvme_t *, nvme_ioctl_fw_load_t *);
537 extern boolean_t nvme_validate_fw_commit(nvme_t *, nvme_ioctl_fw_commit_t *);
538 
539 /*
540  * Locking functions
541  */
542 extern void nvme_rwlock(nvme_minor_t *, nvme_ioctl_lock_t *);
543 extern void nvme_rwunlock(nvme_minor_lock_info_t *, nvme_lock_t *);
544 extern void nvme_rwlock_ctrl_dead(void *);
545 extern void nvme_lock_init(nvme_lock_t *);
546 extern void nvme_lock_fini(nvme_lock_t *);
547 
548 #ifdef __cplusplus
549 }
550 #endif
551 
552 #endif /* _NVME_VAR_H */
553