xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision 55fea89d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of: auto, enable, disable
51  *
52  */
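
/*
 * Example invocation (illustrative only; the slot number and backing store
 * path below are arbitrary):
 *
 *  -s 4,nvme,/dev/zvol/rdsk/tank/guest-disk,maxq=8,qsz=1024,ioslots=16,ser=BHYVE001
 */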
53 
54 /* TODO:
55     - create async events for SMART and log page changes
56     - interrupt coalescing
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
68 
69 #include <assert.h>
70 #include <pthread.h>
71 #include <pthread_np.h>
72 #include <semaphore.h>
73 #include <stdbool.h>
74 #include <stddef.h>
75 #include <stdint.h>
76 #include <stdio.h>
77 #include <stdlib.h>
78 #include <string.h>
79 
80 #include <machine/atomic.h>
81 #include <machine/vmm.h>
82 #include <vmmapi.h>
83 
84 #include <dev/nvme/nvme.h>
85 
86 #include "bhyverun.h"
87 #include "block_if.h"
88 #include "config.h"
89 #include "debug.h"
90 #include "pci_emul.h"
91 
92 
93 static int nvme_debug = 0;
94 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
95 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
96 
97 /* defaults; can be overridden */
98 #define	NVME_MSIX_BAR		4
99 
100 #define	NVME_IOSLOTS		8
101 
102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
103 #define NVME_MMIO_SPACE_MIN	(1 << 14)
104 
105 #define	NVME_QUEUES		16
106 #define	NVME_MAX_QENTRIES	2048
107 /* Memory Page size Minimum reported in CAP register */
108 #define	NVME_MPSMIN		0
109 /* MPSMIN converted to bytes */
110 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
111 
112 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
113 #define	NVME_MDTS		9
114 /* Note the + 1 allows for the initial descriptor to not be page aligned */
115 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
116 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
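
/*
 * With NVME_MDTS of 9 and an MPSMIN page size of 4 KiB, the largest single
 * transfer advertised to the host is (1 << 9) * 4096 bytes = 2 MiB.
 */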
117 
118 /* This is a synthetic status code to indicate there is no status */
119 #define NVME_NO_STATUS		0xffff
120 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
121 
122 /* Reported temperature in Kelvin (296 K, i.e. roughly room temperature) */
123 #define NVME_TEMPERATURE 296
124 
125 /* helpers */
126 
127 /* Convert a zero-based value into a one-based value */
128 #define ONE_BASED(zero)		((zero) + 1)
129 /* Convert a one-based value into a zero-based value */
130 #define ZERO_BASED(one)		((one)  - 1)
131 
132 /* Encode number of SQ's and CQ's for Set/Get Features */
133 #define NVME_FEATURE_NUM_QUEUES(sc) \
134 	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
135 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
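
/*
 * For example, 8 submission queues and 8 completion queues encode as
 * 0x00070007 since both counts are reported as zero-based values.
 */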
136 
137 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
138 
139 enum nvme_controller_register_offsets {
140 	NVME_CR_CAP_LOW = 0x00,
141 	NVME_CR_CAP_HI  = 0x04,
142 	NVME_CR_VS      = 0x08,
143 	NVME_CR_INTMS   = 0x0c,
144 	NVME_CR_INTMC   = 0x10,
145 	NVME_CR_CC      = 0x14,
146 	NVME_CR_CSTS    = 0x1c,
147 	NVME_CR_NSSR    = 0x20,
148 	NVME_CR_AQA     = 0x24,
149 	NVME_CR_ASQ_LOW = 0x28,
150 	NVME_CR_ASQ_HI  = 0x2c,
151 	NVME_CR_ACQ_LOW = 0x30,
152 	NVME_CR_ACQ_HI  = 0x34,
153 };
154 
155 enum nvme_cmd_cdw11 {
156 	NVME_CMD_CDW11_PC  = 0x0001,
157 	NVME_CMD_CDW11_IEN = 0x0002,
158 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
159 };
160 
161 enum nvme_copy_dir {
162 	NVME_COPY_TO_PRP,
163 	NVME_COPY_FROM_PRP,
164 };
165 
166 #define	NVME_CQ_INTEN	0x01
167 #define	NVME_CQ_INTCOAL	0x02
168 
169 struct nvme_completion_queue {
170 	struct nvme_completion *qbase;
171 	pthread_mutex_t	mtx;
172 	uint32_t	size;
173 	uint16_t	tail; /* nvme progress */
174 	uint16_t	head; /* guest progress */
175 	uint16_t	intr_vec;
176 	uint32_t	intr_en;
177 };
178 
179 struct nvme_submission_queue {
180 	struct nvme_command *qbase;
181 	pthread_mutex_t	mtx;
182 	uint32_t	size;
183 	uint16_t	head; /* nvme progress */
184 	uint16_t	tail; /* guest progress */
185 	uint16_t	cqid; /* completion queue id */
186 	int		qpriority;
187 };
188 
189 enum nvme_storage_type {
190 	NVME_STOR_BLOCKIF = 0,
191 	NVME_STOR_RAM = 1,
192 };
193 
194 struct pci_nvme_blockstore {
195 	enum nvme_storage_type type;
196 	void		*ctx;
197 	uint64_t	size;
198 	uint32_t	sectsz;
199 	uint32_t	sectsz_bits;
200 	uint64_t	eui64;
201 	uint32_t	deallocate:1;
202 };
203 
204 /*
205  * Calculate the number of additional page descriptors for guest IO requests
206  * based on the advertised Max Data Transfer (MDTS) and given the number of
207  * default iovec's in a struct blockif_req.
208  */
209 #define MDTS_PAD_SIZE \
210 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
211 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
212 	  0 )
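
/*
 * For example, with NVME_MDTS of 9 a request may need up to 513 iovec
 * entries (NVME_MAX_IOVEC); entries beyond BLOCKIF_IOV_MAX are carried by
 * the iovpadding[] array in struct pci_nvme_ioreq below.
 */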
213 
214 struct pci_nvme_ioreq {
215 	struct pci_nvme_softc *sc;
216 	STAILQ_ENTRY(pci_nvme_ioreq) link;
217 	struct nvme_submission_queue *nvme_sq;
218 	uint16_t	sqid;
219 
220 	/* command information */
221 	uint16_t	opc;
222 	uint16_t	cid;
223 	uint32_t	nsid;
224 
225 	uint64_t	prev_gpaddr;
226 	size_t		prev_size;
227 	size_t		bytes;
228 
229 	struct blockif_req io_req;
230 
231 	struct iovec	iovpadding[MDTS_PAD_SIZE];
232 };
233 
234 enum nvme_dsm_type {
235 	/* Dataset Management bit in ONCS reflects backing storage capability */
236 	NVME_DATASET_MANAGEMENT_AUTO,
237 	/* Unconditionally set Dataset Management bit in ONCS */
238 	NVME_DATASET_MANAGEMENT_ENABLE,
239 	/* Unconditionally clear Dataset Management bit in ONCS */
240 	NVME_DATASET_MANAGEMENT_DISABLE,
241 };
242 
243 struct pci_nvme_softc;
244 struct nvme_feature_obj;
245 
246 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
247     struct nvme_feature_obj *,
248     struct nvme_command *,
249     struct nvme_completion *);
250 
251 struct nvme_feature_obj {
252 	uint32_t	cdw11;
253 	nvme_feature_cb	set;
254 	nvme_feature_cb	get;
255 	bool namespace_specific;
256 };
257 
258 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
259 
260 typedef enum {
261 	PCI_NVME_AE_TYPE_ERROR = 0,
262 	PCI_NVME_AE_TYPE_SMART,
263 	PCI_NVME_AE_TYPE_NOTICE,
264 	PCI_NVME_AE_TYPE_IO_CMD = 6,
265 	PCI_NVME_AE_TYPE_VENDOR = 7,
266 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
267 } pci_nvme_async_type;
268 
269 /* Asynchronous Event Requests */
270 struct pci_nvme_aer {
271 	STAILQ_ENTRY(pci_nvme_aer) link;
272 	uint16_t	cid;	/* Command ID of the submitted AER */
273 };
274 
275 /** Asynchronous Event Information - Notice */
276 typedef enum {
277 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
278 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
279 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
280 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
281 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
282 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
283 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
284 	PCI_NVME_AEI_NOTICE_MAX,
285 } pci_nvme_async_event_info_notice;
286 
287 #define PCI_NVME_AEI_NOTICE_SHIFT		8
288 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))
289 
290 /* Asynchronous Event Notifications */
291 struct pci_nvme_aen {
292 	pci_nvme_async_type atype;
293 	uint32_t	event_data;
294 	bool		posted;
295 };
296 
297 /*
298  * By default, enable all Asynchronous Event Notifications:
299  *     SMART / Health Critical Warnings
300  *     Namespace Attribute Notices
301  */
302 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
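/*
 * The low five bits (0x1f) enable SMART / Health Critical Warning
 * notifications and 0x100 enables Namespace Attribute Notices (notice
 * event 0 shifted by PCI_NVME_AEI_NOTICE_SHIFT).
 */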
303 
304 typedef enum {
305 	NVME_CNTRLTYPE_IO = 1,
306 	NVME_CNTRLTYPE_DISCOVERY = 2,
307 	NVME_CNTRLTYPE_ADMIN = 3,
308 } pci_nvme_cntrl_type;
309 
310 struct pci_nvme_softc {
311 	struct pci_devinst *nsc_pi;
312 
313 	pthread_mutex_t	mtx;
314 
315 	struct nvme_registers regs;
316 
317 	struct nvme_namespace_data  nsdata;
318 	struct nvme_controller_data ctrldata;
319 	struct nvme_error_information_entry err_log;
320 	struct nvme_health_information_page health_log;
321 	struct nvme_firmware_page fw_log;
322 	struct nvme_ns_list ns_log;
323 
324 	struct pci_nvme_blockstore nvstore;
325 
326 	uint16_t	max_qentries;	/* max entries per queue */
327 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
328 	uint32_t	num_cqueues;
329 	uint32_t	num_squeues;
330 	bool		num_q_is_set; /* Has host set Number of Queues */
331 
332 	struct pci_nvme_ioreq *ioreqs;
333 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
334 	uint32_t	pending_ios;
335 	uint32_t	ioslots;
336 	sem_t		iosemlock;
337 
338 	/*
339 	 * Memory mapped Submission and Completion queues
340 	 * Each array includes both Admin and IO queues
341 	 */
342 	struct nvme_completion_queue *compl_queues;
343 	struct nvme_submission_queue *submit_queues;
344 
345 	struct nvme_feature_obj feat[NVME_FID_MAX];
346 
347 	enum nvme_dsm_type dataset_management;
348 
349 	/* Accounting for SMART data */
350 	__uint128_t	read_data_units;
351 	__uint128_t	write_data_units;
352 	__uint128_t	read_commands;
353 	__uint128_t	write_commands;
354 	uint32_t	read_dunits_remainder;
355 	uint32_t	write_dunits_remainder;
356 
357 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
358 	pthread_mutex_t	aer_mtx;
359 	uint32_t	aer_count;
360 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
361 	pthread_t	aen_tid;
362 	pthread_mutex_t	aen_mtx;
363 	pthread_cond_t	aen_cond;
364 };
365 
366 
367 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
368     struct nvme_completion_queue *cq,
369     uint32_t cdw0,
370     uint16_t cid,
371     uint16_t sqid,
372     uint16_t status);
373 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
374 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
375 static void pci_nvme_io_done(struct blockif_req *, int);
376 
377 /* Controller Configuration utils */
378 #define	NVME_CC_GET_EN(cc) \
379 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
380 #define	NVME_CC_GET_CSS(cc) \
381 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
382 #define	NVME_CC_GET_SHN(cc) \
383 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
384 #define	NVME_CC_GET_IOSQES(cc) \
385 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
386 #define	NVME_CC_GET_IOCQES(cc) \
387 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
388 
389 #define	NVME_CC_WRITE_MASK \
390 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
391 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
392 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
393 
394 #define	NVME_CC_NEN_WRITE_MASK \
395 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
396 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
397 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
398 
399 /* Controller Status utils */
400 #define	NVME_CSTS_GET_RDY(sts) \
401 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
402 
403 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
404 #define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)
405 
406 /* Completion Queue status word utils */
407 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
408 #define	NVME_STATUS_MASK \
409 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
410 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
411 
412 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
413 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
414 
415 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
416     struct nvme_feature_obj *,
417     struct nvme_command *,
418     struct nvme_completion *);
419 static void nvme_feature_temperature(struct pci_nvme_softc *,
420     struct nvme_feature_obj *,
421     struct nvme_command *,
422     struct nvme_completion *);
423 static void nvme_feature_num_queues(struct pci_nvme_softc *,
424     struct nvme_feature_obj *,
425     struct nvme_command *,
426     struct nvme_completion *);
427 static void nvme_feature_iv_config(struct pci_nvme_softc *,
428     struct nvme_feature_obj *,
429     struct nvme_command *,
430     struct nvme_completion *);
431 static void nvme_feature_async_event(struct pci_nvme_softc *,
432     struct nvme_feature_obj *,
433     struct nvme_command *,
434     struct nvme_completion *);
435 
436 static void *aen_thr(void *arg);
437 
438 static __inline void
439 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
440 {
441 	size_t len;
442 
443 	len = strnlen(src, dst_size);
444 	memset(dst, pad, dst_size);
445 	memcpy(dst, src, len);
446 }
447 
448 static __inline void
449 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
450 {
451 
452 	*status &= ~NVME_STATUS_MASK;
453 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
454 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
455 }
456 
457 static __inline void
458 pci_nvme_status_genc(uint16_t *status, uint16_t code)
459 {
460 
461 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
462 }
463 
464 /*
465  * Initialize the requested number of IO Submission and Completion Queues.
466  * Admin queues are allocated implicitly.
467  */
468 static void
469 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
470 {
471 	uint32_t i;
472 
473 	/*
474 	 * Allocate and initialize the Submission Queues
475 	 */
476 	if (nsq > NVME_QUEUES) {
477 		WPRINTF("%s: clamping number of SQ from %u to %u",
478 					__func__, nsq, NVME_QUEUES);
479 		nsq = NVME_QUEUES;
480 	}
481 
482 	sc->num_squeues = nsq;
483 
484 	sc->submit_queues = calloc(sc->num_squeues + 1,
485 				sizeof(struct nvme_submission_queue));
486 	if (sc->submit_queues == NULL) {
487 		WPRINTF("%s: SQ allocation failed", __func__);
488 		sc->num_squeues = 0;
489 	} else {
490 		struct nvme_submission_queue *sq = sc->submit_queues;
491 
492 		for (i = 0; i < sc->num_squeues + 1; i++)
493 			pthread_mutex_init(&sq[i].mtx, NULL);
494 	}
495 
496 	/*
497 	 * Allocate and initialize the Completion Queues
498 	 */
499 	if (ncq > NVME_QUEUES) {
500 		WPRINTF("%s: clamping number of CQ from %u to %u",
501 					__func__, ncq, NVME_QUEUES);
502 		ncq = NVME_QUEUES;
503 	}
504 
505 	sc->num_cqueues = ncq;
506 
507 	sc->compl_queues = calloc(sc->num_cqueues + 1,
508 				sizeof(struct nvme_completion_queue));
509 	if (sc->compl_queues == NULL) {
510 		WPRINTF("%s: CQ allocation failed", __func__);
511 		sc->num_cqueues = 0;
512 	} else {
513 		struct nvme_completion_queue *cq = sc->compl_queues;
514 
515 		for (i = 0; i < sc->num_cqueues + 1; i++)
516 			pthread_mutex_init(&cq[i].mtx, NULL);
517 	}
518 }
519 
520 static void
521 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
522 {
523 	struct nvme_controller_data *cd = &sc->ctrldata;
524 
525 	cd->vid = 0xFB5D;
526 	cd->ssvid = 0x0000;
527 
528 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
529 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
530 
531 	/* Recommended Arbitration Burst: up to 2^rab submission commands at a time */
532 	cd->rab   = 4;
533 
534 	/* FreeBSD OUI */
535 	cd->ieee[0] = 0x58;
536 	cd->ieee[1] = 0x9c;
537 	cd->ieee[2] = 0xfc;
538 
539 	cd->mic = 0;
540 
541 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
542 
543 	cd->ver = NVME_REV(1,4);
544 
545 	cd->cntrltype = NVME_CNTRLTYPE_IO;
546 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
547 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
548 	cd->acl = 2;
549 	cd->aerl = 4;
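	/* AERL is zero-based, so the host may keep up to 5 AERs outstanding */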
550 
551 	/* Advertise 1, Read-only firmware slot */
552 	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
553 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
554 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
555 	cd->elpe = 0;	/* max error log page entries */
556 	/*
557 	 * Report a single power state (zero-based value)
558 	 * power_state[] values are left as zero to indicate "Not reported"
559 	 */
560 	cd->npss = 0;
561 
562 	/* Warning Composite Temperature Threshold */
563 	cd->wctemp = 0x0157;
564 	cd->cctemp = 0x0157;
565 
566 	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
567 	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
568 			NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
569 
570 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
571 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
572 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
573 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
574 	cd->nn = 1;	/* number of namespaces */
575 
576 	cd->oncs = 0;
577 	switch (sc->dataset_management) {
578 	case NVME_DATASET_MANAGEMENT_AUTO:
579 		if (sc->nvstore.deallocate)
580 			cd->oncs |= NVME_ONCS_DSM;
581 		break;
582 	case NVME_DATASET_MANAGEMENT_ENABLE:
583 		cd->oncs |= NVME_ONCS_DSM;
584 		break;
585 	default:
586 		break;
587 	}
588 
589 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
590 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
591 
592 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
593 }
594 
595 /*
596  * Calculate the CRC-16 of the given buffer
597  * See copyright attribution at top of file
598  */
599 static uint16_t
600 crc16(uint16_t crc, const void *buffer, unsigned int len)
601 {
602 	const unsigned char *cp = buffer;
603 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
604 	static uint16_t const crc16_table[256] = {
605 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
606 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
607 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
608 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
609 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
610 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
611 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
612 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
613 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
614 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
615 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
616 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
617 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
618 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
619 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
620 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
621 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
622 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
623 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
624 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
625 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
626 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
627 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
628 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
629 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
630 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
631 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
632 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
633 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
634 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
635 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
636 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
637 	};
638 
639 	while (len--)
640 		crc = (((crc >> 8) & 0xffU) ^
641 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
642 	return crc;
643 }
644 
645 static void
646 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
647     struct nvme_namespace_data *nd)
648 {
649 
650 	/* Get capacity and block size information from backing store */
651 	nd->nsze = nvstore->size / nvstore->sectsz;
652 	nd->ncap = nd->nsze;
653 	nd->nuse = nd->nsze;
654 }
655 
656 static void
657 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
658     struct nvme_namespace_data *nd, uint32_t nsid,
659     struct pci_nvme_blockstore *nvstore)
660 {
661 
662 	pci_nvme_init_nsdata_size(nvstore, nd);
663 
664 	if (nvstore->type == NVME_STOR_BLOCKIF)
665 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
666 
667 	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
668 	nd->flbas = 0;
669 
670 	/* Create an EUI-64 if user did not provide one */
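	/*
	 * The generated EUI-64 places the FreeBSD OUI and a CRC-16 of a
	 * per-device string in the upper bytes and the NSID in the low
	 * 16 bits.
	 */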
671 	if (nvstore->eui64 == 0) {
672 		char *data = NULL;
673 		uint64_t eui64 = nvstore->eui64;
674 
675 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
676 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
677 		    sc->nsc_pi->pi_func);
678 
679 		if (data != NULL) {
680 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
681 			free(data);
682 		}
683 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
684 	}
685 	be64enc(nd->eui64, nvstore->eui64);
686 
687 	/* LBA data-sz = 2^lbads */
688 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
689 }
690 
691 static void
692 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
693 {
694 	__uint128_t power_cycles = 1;
695 
696 	memset(&sc->err_log, 0, sizeof(sc->err_log));
697 	memset(&sc->health_log, 0, sizeof(sc->health_log));
698 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
699 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
700 
701 	/* Set read/write remainder to round up according to spec */
702 	sc->read_dunits_remainder = 999;
703 	sc->write_dunits_remainder = 999;
704 
705 	/* Set nominal Health values checked by implementations */
706 	sc->health_log.temperature = NVME_TEMPERATURE;
707 	sc->health_log.available_spare = 100;
708 	sc->health_log.available_spare_threshold = 10;
709 
710 	/* Set Active Firmware Info to slot 1 */
711 	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
712 	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
713 	    sizeof(sc->fw_log.revision[0]));
714 
715 	memcpy(&sc->health_log.power_cycles, &power_cycles,
716 	    sizeof(sc->health_log.power_cycles));
717 }
718 
719 static void
720 pci_nvme_init_features(struct pci_nvme_softc *sc)
721 {
722 	enum nvme_feature	fid;
723 
724 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
725 		switch (fid) {
726 		case NVME_FEAT_ARBITRATION:
727 		case NVME_FEAT_POWER_MANAGEMENT:
728 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
729 		case NVME_FEAT_WRITE_ATOMICITY:
730 			/* Mandatory but no special handling required */
731 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
732 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
733 		//		  this returns a data buffer
734 			break;
735 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
736 			sc->feat[fid].set = nvme_feature_temperature;
737 			break;
738 		case NVME_FEAT_ERROR_RECOVERY:
739 			sc->feat[fid].namespace_specific = true;
740 			break;
741 		case NVME_FEAT_NUMBER_OF_QUEUES:
742 			sc->feat[fid].set = nvme_feature_num_queues;
743 			break;
744 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
745 			sc->feat[fid].set = nvme_feature_iv_config;
746 			break;
747 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
748 			sc->feat[fid].set = nvme_feature_async_event;
749 			/* Enable all AENs by default */
750 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
751 			break;
752 		default:
753 			sc->feat[fid].set = nvme_feature_invalid_cb;
754 			sc->feat[fid].get = nvme_feature_invalid_cb;
755 		}
756 	}
757 }
758 
759 static void
760 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
761 {
762 
763 	STAILQ_INIT(&sc->aer_list);
764 	sc->aer_count = 0;
765 }
766 
767 static void
768 pci_nvme_aer_init(struct pci_nvme_softc *sc)
769 {
770 
771 	pthread_mutex_init(&sc->aer_mtx, NULL);
772 	pci_nvme_aer_reset(sc);
773 }
774 
775 static void
776 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
777 {
778 	struct pci_nvme_aer *aer = NULL;
779 
780 	pthread_mutex_lock(&sc->aer_mtx);
781 	while (!STAILQ_EMPTY(&sc->aer_list)) {
782 		aer = STAILQ_FIRST(&sc->aer_list);
783 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
784 		free(aer);
785 	}
786 	pthread_mutex_unlock(&sc->aer_mtx);
787 
788 	pci_nvme_aer_reset(sc);
789 }
790 
791 static bool
792 pci_nvme_aer_available(struct pci_nvme_softc *sc)
793 {
794 
795 	return (sc->aer_count != 0);
796 }
797 
798 static bool
799 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
800 {
801 	struct nvme_controller_data *cd = &sc->ctrldata;
802 
803 	/* AERL is a zero-based value while aer_count is one-based */
804 	return (sc->aer_count == (cd->aerl + 1U));
805 }
806 
807 /*
808  * Add an Async Event Request
809  *
810  * Stores an AER to be returned later if the Controller needs to notify the
811  * host of an event.
812  * Note that while the NVMe spec doesn't require Controllers to return AER's
813  * in order, this implementation does preserve the order.
814  */
815 static int
816 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
817 {
818 	struct pci_nvme_aer *aer = NULL;
819 
820 	aer = calloc(1, sizeof(struct pci_nvme_aer));
821 	if (aer == NULL)
822 		return (-1);
823 
824 	/* Save the Command ID for use in the completion message */
825 	aer->cid = cid;
826 
827 	pthread_mutex_lock(&sc->aer_mtx);
828 	sc->aer_count++;
829 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
830 	pthread_mutex_unlock(&sc->aer_mtx);
831 
832 	return (0);
833 }
834 
835 /*
836  * Get an Async Event Request structure
837  *
838  * Returns a pointer to an AER previously submitted by the host or NULL if
839  * no AER's exist. Caller is responsible for freeing the returned struct.
840  */
841 static struct pci_nvme_aer *
842 pci_nvme_aer_get(struct pci_nvme_softc *sc)
843 {
844 	struct pci_nvme_aer *aer = NULL;
845 
846 	pthread_mutex_lock(&sc->aer_mtx);
847 	aer = STAILQ_FIRST(&sc->aer_list);
848 	if (aer != NULL) {
849 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
850 		sc->aer_count--;
851 	}
852 	pthread_mutex_unlock(&sc->aer_mtx);
853 
854 	return (aer);
855 }
856 
857 static void
858 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
859 {
860 	uint32_t	atype;
861 
862 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
863 
864 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
865 		sc->aen[atype].atype = atype;
866 	}
867 }
868 
869 static void
870 pci_nvme_aen_init(struct pci_nvme_softc *sc)
871 {
872 	char nstr[80];
873 
874 	pci_nvme_aen_reset(sc);
875 
876 	pthread_mutex_init(&sc->aen_mtx, NULL);
877 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
878 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
879 	    sc->nsc_pi->pi_func);
880 	pthread_set_name_np(sc->aen_tid, nstr);
881 }
882 
883 static void
884 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
885 {
886 
887 	pci_nvme_aen_reset(sc);
888 }
889 
890 /* Notify the AEN thread of pending work */
891 static void
892 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
893 {
894 
895 	pthread_cond_signal(&sc->aen_cond);
896 }
897 
898 /*
899  * Post an Asynchronous Event Notification
900  */
901 static int32_t
902 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
903 		uint32_t event_data)
904 {
905 	struct pci_nvme_aen *aen;
906 
907 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
908 		return(EINVAL);
909 	}
910 
911 	pthread_mutex_lock(&sc->aen_mtx);
912 	aen = &sc->aen[atype];
913 
914 	/* Has the controller already posted an event of this type? */
915 	if (aen->posted) {
916 		pthread_mutex_unlock(&sc->aen_mtx);
917 		return(EALREADY);
918 	}
919 
920 	aen->event_data = event_data;
921 	aen->posted = true;
922 	pthread_mutex_unlock(&sc->aen_mtx);
923 
924 	pci_nvme_aen_notify(sc);
925 
926 	return(0);
927 }
928 
929 static void
930 pci_nvme_aen_process(struct pci_nvme_softc *sc)
931 {
932 	struct pci_nvme_aer *aer;
933 	struct pci_nvme_aen *aen;
934 	pci_nvme_async_type atype;
935 	uint32_t mask;
936 	uint16_t status;
937 	uint8_t lid;
938 
939 #ifndef __FreeBSD__
940 	lid = 0;
941 #endif
942 
943 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
944 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
945 		aen = &sc->aen[atype];
946 		/* Previous iterations may have depleted the available AER's */
947 		if (!pci_nvme_aer_available(sc)) {
948 			DPRINTF("%s: no AER", __func__);
949 			break;
950 		}
951 
952 		if (!aen->posted) {
953 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
954 			continue;
955 		}
956 
957 		status = NVME_SC_SUCCESS;
958 
959 		/* Is the event masked? */
960 		mask =
961 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
962 
963 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
964 		switch (atype) {
965 		case PCI_NVME_AE_TYPE_ERROR:
966 			lid = NVME_LOG_ERROR;
967 			break;
968 		case PCI_NVME_AE_TYPE_SMART:
969 			mask &= 0xff;
970 			if ((mask & aen->event_data) == 0)
971 				continue;
972 			lid = NVME_LOG_HEALTH_INFORMATION;
973 			break;
974 		case PCI_NVME_AE_TYPE_NOTICE:
975 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
976 				EPRINTLN("%s unknown AEN notice type %u",
977 				    __func__, aen->event_data);
978 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
979 				lid = 0;
980 				break;
981 			}
982 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
983 				continue;
984 			switch (aen->event_data) {
985 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
986 				lid = NVME_LOG_CHANGED_NAMESPACE;
987 				break;
988 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
989 				lid = NVME_LOG_FIRMWARE_SLOT;
990 				break;
991 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
992 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
993 				break;
994 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
995 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
996 				break;
997 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
998 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
999 				break;
1000 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
1001 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
1002 				break;
1003 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
1004 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
1005 				break;
1006 			default:
1007 				lid = 0;
1008 			}
1009 			break;
1010 		default:
1011 			/* bad type?!? */
1012 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
1013 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
1014 			lid = 0;
1015 			break;
1016 		}
1017 
1018 		aer = pci_nvme_aer_get(sc);
1019 		assert(aer != NULL);
1020 
1021 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
1022 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
1023 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1024 		    aer->cid,
1025 		    0,		/* SQID */
1026 		    status);
1027 
1028 		aen->event_data = 0;
1029 		aen->posted = false;
1030 
1031 		pci_generate_msix(sc->nsc_pi, 0);
1032 	}
1033 }
1034 
1035 static void *
1036 aen_thr(void *arg)
1037 {
1038 	struct pci_nvme_softc *sc;
1039 
1040 	sc = arg;
1041 
1042 	pthread_mutex_lock(&sc->aen_mtx);
1043 	for (;;) {
1044 		pci_nvme_aen_process(sc);
1045 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1046 	}
1047 #ifdef __FreeBSD__	/* Smatch spots unreachable code */
1048 	pthread_mutex_unlock(&sc->aen_mtx);
1049 
1050 	pthread_exit(NULL);
1051 #endif
1052 	return (NULL);
1053 }
1054 
1055 static void
1056 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1057 {
1058 	uint32_t i;
1059 
1060 	DPRINTF("%s", __func__);
1061 
1062 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1063 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1064 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
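	/* CAP.TO is in 500 msec units, so 60 advertises a 30 second timeout */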
1065 
1066 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1067 
1068 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1069 
1070 	sc->regs.cc = 0;
1071 
1072 	assert(sc->submit_queues != NULL);
1073 
1074 	for (i = 0; i < sc->num_squeues + 1; i++) {
1075 		sc->submit_queues[i].qbase = NULL;
1076 		sc->submit_queues[i].size = 0;
1077 		sc->submit_queues[i].cqid = 0;
1078 		sc->submit_queues[i].tail = 0;
1079 		sc->submit_queues[i].head = 0;
1080 	}
1081 
1082 	assert(sc->compl_queues != NULL);
1083 
1084 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1085 		sc->compl_queues[i].qbase = NULL;
1086 		sc->compl_queues[i].size = 0;
1087 		sc->compl_queues[i].tail = 0;
1088 		sc->compl_queues[i].head = 0;
1089 	}
1090 
1091 	sc->num_q_is_set = false;
1092 
1093 	pci_nvme_aer_destroy(sc);
1094 	pci_nvme_aen_destroy(sc);
1095 
1096 	/*
1097 	 * Clear CSTS.RDY last to prevent the host from enabling the controller
1098 	 * before cleanup completes
1099 	 */
1100 	sc->regs.csts = 0;
1101 }
1102 
1103 static void
1104 pci_nvme_reset(struct pci_nvme_softc *sc)
1105 {
1106 	pthread_mutex_lock(&sc->mtx);
1107 	pci_nvme_reset_locked(sc);
1108 	pthread_mutex_unlock(&sc->mtx);
1109 }
1110 
1111 static int
1112 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1113 {
1114 	uint16_t acqs, asqs;
1115 
1116 	DPRINTF("%s", __func__);
1117 
1118 	/*
1119 	 * NVMe 2.0 states that "enabling a controller while this field is
1120 	 * cleared to 0h produces undefined results" for both ACQS and
1121 	 * ASQS. If zero, set CFS and do not become ready.
1122 	 */
1123 	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
1124 	if (asqs < 2) {
1125 		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
1126 		    asqs - 1, sc->regs.aqa);
1127 		sc->regs.csts |= NVME_CSTS_CFS;
1128 		return (-1);
1129 	}
1130 	sc->submit_queues[0].size = asqs;
1131 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1132 	            sizeof(struct nvme_command) * asqs);
1133 	if (sc->submit_queues[0].qbase == NULL) {
1134 		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
1135 		    sc->regs.asq);
1136 		sc->regs.csts |= NVME_CSTS_CFS;
1137 		return (-1);
1138 	}
1139 
1140 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1141 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1142 
1143 	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1144 	    NVME_AQA_REG_ACQS_MASK);
1145 	if (acqs < 2) {
1146 		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
1147 		    acqs - 1, sc->regs.aqa);
1148 		sc->regs.csts |= NVME_CSTS_CFS;
1149 		return (-1);
1150 	}
1151 	sc->compl_queues[0].size = acqs;
1152 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1153 	         sizeof(struct nvme_completion) * acqs);
1154 	if (sc->compl_queues[0].qbase == NULL) {
1155 		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
1156 		    sc->regs.acq);
1157 		sc->regs.csts |= NVME_CSTS_CFS;
1158 		return (-1);
1159 	}
1160 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1161 
1162 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1163 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1164 
1165 	return (0);
1166 }
1167 
1168 static int
1169 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1170 	size_t len, enum nvme_copy_dir dir)
1171 {
1172 	uint8_t *p;
1173 	size_t bytes;
1174 
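	/*
	 * Only transfers that fit in PRP1 plus a single PRP2 page are
	 * supported here (no PRP lists), hence the 8 KiB cap.
	 */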
1175 	if (len > (8 * 1024)) {
1176 		return (-1);
1177 	}
1178 
1179 	/* Copy from the start of prp1 to the end of the physical page */
1180 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1181 	bytes = MIN(bytes, len);
1182 
1183 	p = vm_map_gpa(ctx, prp1, bytes);
1184 	if (p == NULL) {
1185 		return (-1);
1186 	}
1187 
1188 	if (dir == NVME_COPY_TO_PRP)
1189 		memcpy(p, b, bytes);
1190 	else
1191 		memcpy(b, p, bytes);
1192 
1193 	b += bytes;
1194 
1195 	len -= bytes;
1196 	if (len == 0) {
1197 		return (0);
1198 	}
1199 
1200 	len = MIN(len, PAGE_SIZE);
1201 
1202 	p = vm_map_gpa(ctx, prp2, len);
1203 	if (p == NULL) {
1204 		return (-1);
1205 	}
1206 
1207 	if (dir == NVME_COPY_TO_PRP)
1208 		memcpy(p, b, len);
1209 	else
1210 		memcpy(b, p, len);
1211 
1212 	return (0);
1213 }
1214 
1215 /*
1216  * Write a Completion Queue Entry update
1217  *
1218  * Write the completion and update the doorbell value
1219  */
1220 static void
1221 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1222 		struct nvme_completion_queue *cq,
1223 		uint32_t cdw0,
1224 		uint16_t cid,
1225 		uint16_t sqid,
1226 		uint16_t status)
1227 {
1228 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1229 	struct nvme_completion *cqe;
1230 
1231 	assert(cq->qbase != NULL);
1232 
1233 	pthread_mutex_lock(&cq->mtx);
1234 
1235 	cqe = &cq->qbase[cq->tail];
1236 
1237 	/* Flip the phase bit */
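	/*
	 * The host detects new completion entries by watching the Phase Tag,
	 * which the controller inverts on each pass through the queue.
	 */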
1238 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1239 
1240 	cqe->cdw0 = cdw0;
1241 	cqe->sqhd = sq->head;
1242 	cqe->sqid = sqid;
1243 	cqe->cid = cid;
1244 	cqe->status = status;
1245 
1246 	cq->tail++;
1247 	if (cq->tail >= cq->size) {
1248 		cq->tail = 0;
1249 	}
1250 
1251 	pthread_mutex_unlock(&cq->mtx);
1252 }
1253 
1254 static int
1255 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1256 	struct nvme_completion* compl)
1257 {
1258 	uint16_t qid = command->cdw10 & 0xffff;
1259 
1260 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1261 	if (qid == 0 || qid > sc->num_squeues ||
1262 	    (sc->submit_queues[qid].qbase == NULL)) {
1263 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1264 		        __func__, qid, sc->num_squeues);
1265 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1266 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1267 		return (1);
1268 	}
1269 
1270 	sc->submit_queues[qid].qbase = NULL;
1271 	sc->submit_queues[qid].cqid = 0;
1272 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1273 	return (1);
1274 }
1275 
1276 static int
1277 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1278 	struct nvme_completion* compl)
1279 {
1280 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1281 		uint16_t qid = command->cdw10 & 0xffff;
1282 		struct nvme_submission_queue *nsq;
1283 
1284 		if ((qid == 0) || (qid > sc->num_squeues) ||
1285 		    (sc->submit_queues[qid].qbase != NULL)) {
1286 			WPRINTF("%s queue index %u > num_squeues %u",
1287 			        __func__, qid, sc->num_squeues);
1288 			pci_nvme_status_tc(&compl->status,
1289 			    NVME_SCT_COMMAND_SPECIFIC,
1290 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1291 			return (1);
1292 		}
1293 
1294 		nsq = &sc->submit_queues[qid];
1295 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1296 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1297 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1298 			/*
1299 			 * Queues must specify at least two entries
1300 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1301 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1302 			 */
1303 			pci_nvme_status_tc(&compl->status,
1304 			    NVME_SCT_COMMAND_SPECIFIC,
1305 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1306 			return (1);
1307 		}
1308 		nsq->head = nsq->tail = 0;
1309 
1310 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1311 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1312 			pci_nvme_status_tc(&compl->status,
1313 			    NVME_SCT_COMMAND_SPECIFIC,
1314 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1315 			return (1);
1316 		}
1317 
1318 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1319 			pci_nvme_status_tc(&compl->status,
1320 			    NVME_SCT_COMMAND_SPECIFIC,
1321 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1322 			return (1);
1323 		}
1324 
1325 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1326 
1327 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1328 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1329 
1330 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1331 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1332 
1333 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1334 
1335 		DPRINTF("%s completed creating IOSQ qid %u",
1336 		         __func__, qid);
1337 	} else {
1338 		/*
1339 		 * Guest sent non-cont submission queue request.
1340 		 * This setting is unsupported by this emulation.
1341 		 */
1342 		WPRINTF("%s unsupported non-contig (list-based) "
1343 		         "create i/o submission queue", __func__);
1344 
1345 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1346 	}
1347 	return (1);
1348 }
1349 
1350 static int
1351 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1352 	struct nvme_completion* compl)
1353 {
1354 	uint16_t qid = command->cdw10 & 0xffff;
1355 	uint16_t sqid;
1356 
1357 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1358 	if (qid == 0 || qid > sc->num_cqueues ||
1359 	    (sc->compl_queues[qid].qbase == NULL)) {
1360 		WPRINTF("%s queue index %u / num_cqueues %u",
1361 		        __func__, qid, sc->num_cqueues);
1362 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1363 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1364 		return (1);
1365 	}
1366 
1367 	/* Deleting an Active CQ is an error */
1368 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1369 		if (sc->submit_queues[sqid].cqid == qid) {
1370 			pci_nvme_status_tc(&compl->status,
1371 			    NVME_SCT_COMMAND_SPECIFIC,
1372 			    NVME_SC_INVALID_QUEUE_DELETION);
1373 			return (1);
1374 		}
1375 
1376 	sc->compl_queues[qid].qbase = NULL;
1377 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1378 	return (1);
1379 }
1380 
1381 static int
1382 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1383 	struct nvme_completion* compl)
1384 {
1385 	struct nvme_completion_queue *ncq;
1386 	uint16_t qid = command->cdw10 & 0xffff;
1387 
1388 	/* Only support Physically Contiguous queues */
1389 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1390 		WPRINTF("%s unsupported non-contig (list-based) "
1391 		         "create i/o completion queue",
1392 		         __func__);
1393 
1394 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1395 		return (1);
1396 	}
1397 
1398 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1399 	    (sc->compl_queues[qid].qbase != NULL)) {
1400 		WPRINTF("%s queue index %u > num_cqueues %u",
1401 			__func__, qid, sc->num_cqueues);
1402 		pci_nvme_status_tc(&compl->status,
1403 		    NVME_SCT_COMMAND_SPECIFIC,
1404 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1405 		return (1);
1406  	}
1407 
1408 	ncq = &sc->compl_queues[qid];
1409 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1410 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1411 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1412 		pci_nvme_status_tc(&compl->status,
1413 		    NVME_SCT_COMMAND_SPECIFIC,
1414 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1415 		return (1);
1416 	}
1417 
1418 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1419 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1420 		/*
1421 		 * Queues must specify at least two entries
1422 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1423 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1424 		 */
1425 		pci_nvme_status_tc(&compl->status,
1426 		    NVME_SCT_COMMAND_SPECIFIC,
1427 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1428 		return (1);
1429 	}
1430 	ncq->head = ncq->tail = 0;
1431 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1432 		     command->prp1,
1433 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1434 
1435 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1436 
1437 
1438 	return (1);
1439 }
1440 
1441 static int
1442 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1443 	struct nvme_completion* compl)
1444 {
1445 	uint64_t logoff;
1446 	uint32_t logsize;
1447 	uint8_t logpage;
1448 
1449 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1450 
1451 	/*
1452 	 * Command specifies the number of dwords to return in fields NUMDU
1453 	 * and NUMDL. This is a zero-based value.
1454 	 */
1455 	logpage = command->cdw10 & 0xFF;
1456 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1457 	logsize *= sizeof(uint32_t);
1458 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1459 
1460 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1461 
1462 	switch (logpage) {
1463 	case NVME_LOG_ERROR:
1464 		if (logoff >= sizeof(sc->err_log)) {
1465 			pci_nvme_status_genc(&compl->status,
1466 			    NVME_SC_INVALID_FIELD);
1467 			break;
1468 		}
1469 
1470 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1471 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1472 		    MIN(logsize - logoff, sizeof(sc->err_log)),
1473 		    NVME_COPY_TO_PRP);
1474 		break;
1475 	case NVME_LOG_HEALTH_INFORMATION:
1476 		if (logoff >= sizeof(sc->health_log)) {
1477 			pci_nvme_status_genc(&compl->status,
1478 			    NVME_SC_INVALID_FIELD);
1479 			break;
1480 		}
1481 
1482 		pthread_mutex_lock(&sc->mtx);
1483 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1484 		    sizeof(sc->health_log.data_units_read));
1485 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1486 		    sizeof(sc->health_log.data_units_written));
1487 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1488 		    sizeof(sc->health_log.host_read_commands));
1489 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1490 		    sizeof(sc->health_log.host_write_commands));
1491 		pthread_mutex_unlock(&sc->mtx);
1492 
1493 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1494 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1495 		    MIN(logsize - logoff, sizeof(sc->health_log)),
1496 		    NVME_COPY_TO_PRP);
1497 		break;
1498 	case NVME_LOG_FIRMWARE_SLOT:
1499 		if (logoff >= sizeof(sc->fw_log)) {
1500 			pci_nvme_status_genc(&compl->status,
1501 			    NVME_SC_INVALID_FIELD);
1502 			break;
1503 		}
1504 
1505 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1506 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1507 		    MIN(logsize - logoff, sizeof(sc->fw_log)),
1508 		    NVME_COPY_TO_PRP);
1509 		break;
1510 	case NVME_LOG_CHANGED_NAMESPACE:
1511 		if (logoff >= sizeof(sc->ns_log)) {
1512 			pci_nvme_status_genc(&compl->status,
1513 			    NVME_SC_INVALID_FIELD);
1514 			break;
1515 		}
1516 
1517 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1518 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1519 		    MIN(logsize - logoff, sizeof(sc->ns_log)),
1520 		    NVME_COPY_TO_PRP);
1521 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1522 		break;
1523 	default:
1524 		DPRINTF("%s get log page %x command not supported",
1525 		        __func__, logpage);
1526 
1527 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1528 		    NVME_SC_INVALID_LOG_PAGE);
1529 	}
1530 
1531 	return (1);
1532 }
1533 
1534 static int
1535 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1536 	struct nvme_completion* compl)
1537 {
1538 	void *dest;
1539 	uint16_t status;
1540 
1541 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1542 	        command->cdw10 & 0xFF, command->nsid);
1543 
1544 	status = 0;
1545 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1546 
1547 	switch (command->cdw10 & 0xFF) {
1548 	case 0x00: /* return Identify Namespace data structure */
1549 		/* Global NS only valid with NS Management */
1550 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1551 			pci_nvme_status_genc(&status,
1552 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1553 			break;
1554 		}
1555 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1556 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1557 		    NVME_COPY_TO_PRP);
1558 		break;
1559 	case 0x01: /* return Identify Controller data structure */
1560 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1561 		    command->prp2, (uint8_t *)&sc->ctrldata,
1562 		    sizeof(sc->ctrldata),
1563 		    NVME_COPY_TO_PRP);
1564 		break;
1565 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1566 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1567 		                  sizeof(uint32_t) * 1024);
1568 		/* All unused entries shall be zero */
1569 		memset(dest, 0, sizeof(uint32_t) * 1024);
1570 		((uint32_t *)dest)[0] = 1;
1571 		break;
1572 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1573 		if (command->nsid != 1) {
1574 			pci_nvme_status_genc(&status,
1575 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1576 			break;
1577 		}
1578 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1579 		                  sizeof(uint32_t) * 1024);
1580 		/* All bytes after the descriptor shall be zero */
1581 		memset(dest, 0, sizeof(uint32_t) * 1024);
1582 
1583 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1584 		((uint8_t *)dest)[0] = 1;
1585 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1586 		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
1587 		break;
1588 	case 0x13:
1589 		/*
1590 		 * Controller list is optional but used by UNH tests. Return
1591 		 * a valid but empty list.
1592 		 */
1593 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1594 		                  sizeof(uint16_t) * 2048);
1595 		memset(dest, 0, sizeof(uint16_t) * 2048);
1596 		break;
1597 	default:
1598 		DPRINTF("%s unsupported identify command requested 0x%x",
1599 		         __func__, command->cdw10 & 0xFF);
1600 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1601 		break;
1602 	}
1603 
1604 	compl->status = status;
1605 	return (1);
1606 }
1607 
1608 static const char *
1609 nvme_fid_to_name(uint8_t fid)
1610 {
1611 	const char *name;
1612 
1613 	switch (fid) {
1614 	case NVME_FEAT_ARBITRATION:
1615 		name = "Arbitration";
1616 		break;
1617 	case NVME_FEAT_POWER_MANAGEMENT:
1618 		name = "Power Management";
1619 		break;
1620 	case NVME_FEAT_LBA_RANGE_TYPE:
1621 		name = "LBA Range Type";
1622 		break;
1623 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1624 		name = "Temperature Threshold";
1625 		break;
1626 	case NVME_FEAT_ERROR_RECOVERY:
1627 		name = "Error Recovery";
1628 		break;
1629 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1630 		name = "Volatile Write Cache";
1631 		break;
1632 	case NVME_FEAT_NUMBER_OF_QUEUES:
1633 		name = "Number of Queues";
1634 		break;
1635 	case NVME_FEAT_INTERRUPT_COALESCING:
1636 		name = "Interrupt Coalescing";
1637 		break;
1638 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1639 		name = "Interrupt Vector Configuration";
1640 		break;
1641 	case NVME_FEAT_WRITE_ATOMICITY:
1642 		name = "Write Atomicity Normal";
1643 		break;
1644 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1645 		name = "Asynchronous Event Configuration";
1646 		break;
1647 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1648 		name = "Autonomous Power State Transition";
1649 		break;
1650 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1651 		name = "Host Memory Buffer";
1652 		break;
1653 	case NVME_FEAT_TIMESTAMP:
1654 		name = "Timestamp";
1655 		break;
1656 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1657 		name = "Keep Alive Timer";
1658 		break;
1659 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1660 		name = "Host Controlled Thermal Management";
1661 		break;
1662 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1663 		name = "Non-Operation Power State Config";
1664 		break;
1665 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1666 		name = "Read Recovery Level Config";
1667 		break;
1668 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1669 		name = "Predictable Latency Mode Config";
1670 		break;
1671 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1672 		name = "Predictable Latency Mode Window";
1673 		break;
1674 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1675 		name = "LBA Status Information Report Interval";
1676 		break;
1677 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1678 		name = "Host Behavior Support";
1679 		break;
1680 	case NVME_FEAT_SANITIZE_CONFIG:
1681 		name = "Sanitize Config";
1682 		break;
1683 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1684 		name = "Endurance Group Event Configuration";
1685 		break;
1686 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1687 		name = "Software Progress Marker";
1688 		break;
1689 	case NVME_FEAT_HOST_IDENTIFIER:
1690 		name = "Host Identifier";
1691 		break;
1692 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1693 		name = "Reservation Notification Mask";
1694 		break;
1695 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1696 		name = "Reservation Persistence";
1697 		break;
1698 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1699 		name = "Namespace Write Protection Config";
1700 		break;
1701 	default:
1702 		name = "Unknown";
1703 		break;
1704 	}
1705 
1706 	return (name);
1707 }
1708 
1709 static void
1710 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
1711     struct nvme_feature_obj *feat __unused,
1712     struct nvme_command *command __unused,
1713     struct nvme_completion *compl)
1714 {
1715 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1716 }
1717 
1718 static void
1719 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1720     struct nvme_feature_obj *feat __unused,
1721     struct nvme_command *command,
1722     struct nvme_completion *compl)
1723 {
1724 	uint32_t i;
1725 	uint32_t cdw11 = command->cdw11;
1726 	uint16_t iv;
1727 	bool cd;
1728 
1729 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1730 
1731 	iv = cdw11 & 0xffff;
1732 	cd = cdw11 & (1 << 16);
1733 
1734 	if (iv > (sc->max_queues + 1)) {
1735 		return;
1736 	}
1737 
1738 	/* Interrupt coalescing may not be enabled on the Admin queue (IV 0), so CD must be set */
1739 	if ((iv == 0) && !cd)
1740 		return;
1741 
1742 	/* Requested Interrupt Vector must be used by a CQ */
1743 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1744 		if (sc->compl_queues[i].intr_vec == iv) {
1745 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1746 		}
1747 	}
1748 }
1749 
1750 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1751 static void
1752 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
1753     struct nvme_feature_obj *feat __unused,
1754     struct nvme_command *command,
1755     struct nvme_completion *compl)
1756 {
1757 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1758 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1759 }
1760 
1761 #define NVME_TEMP_THRESH_OVER	0
1762 #define NVME_TEMP_THRESH_UNDER	1
1763 static void
1764 nvme_feature_temperature(struct pci_nvme_softc *sc,
1765     struct nvme_feature_obj *feat __unused,
1766     struct nvme_command *command,
1767     struct nvme_completion *compl)
1768 {
1769 	uint16_t	tmpth;	/* Temperature Threshold */
1770 	uint8_t		tmpsel; /* Threshold Temperature Select */
1771 	uint8_t		thsel;  /* Threshold Type Select */
1772 	bool		set_crit = false;
1773 	bool		report_crit;
1774 
1775 	tmpth  = command->cdw11 & 0xffff;
1776 	tmpsel = (command->cdw11 >> 16) & 0xf;
1777 	thsel  = (command->cdw11 >> 20) & 0x3;
1778 
1779 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1780 
1781 	/* Check for unsupported values */
1782 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1783 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1784 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1785 		return;
1786 	}
1787 
1788 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1789 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1790 		set_crit = true;
1791 
1792 	pthread_mutex_lock(&sc->mtx);
1793 	if (set_crit)
1794 		sc->health_log.critical_warning |=
1795 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1796 	else
1797 		sc->health_log.critical_warning &=
1798 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1799 	pthread_mutex_unlock(&sc->mtx);
1800 
1801 	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
1802 	    NVME_CRIT_WARN_ST_TEMPERATURE;
1803 
1804 	if (set_crit && report_crit)
1805 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1806 		    sc->health_log.critical_warning);
1807 
1808 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1809 }
1810 
1811 static void
1812 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1813     struct nvme_feature_obj *feat __unused,
1814     struct nvme_command *command,
1815     struct nvme_completion *compl)
1816 {
1817 	uint16_t nqr;	/* Number of Queues Requested */
1818 
1819 	if (sc->num_q_is_set) {
1820 		WPRINTF("%s: Number of Queues already set", __func__);
1821 		pci_nvme_status_genc(&compl->status,
1822 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1823 		return;
1824 	}
1825 
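	/*
	 * cdw11 encodes the request: bits 15:0 are the Number of I/O
	 * Submission Queues Requested (NSQR) and bits 31:16 the Number of
	 * I/O Completion Queues Requested (NCQR), both 0's based values.
	 */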
1826 	nqr = command->cdw11 & 0xFFFF;
1827 	if (nqr == 0xffff) {
1828 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1829 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1830 		return;
1831 	}
1832 
1833 	sc->num_squeues = ONE_BASED(nqr);
1834 	if (sc->num_squeues > sc->max_queues) {
1835 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1836 					sc->max_queues);
1837 		sc->num_squeues = sc->max_queues;
1838 	}
1839 
1840 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1841 	if (nqr == 0xffff) {
1842 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1843 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1844 		return;
1845 	}
1846 
1847 	sc->num_cqueues = ONE_BASED(nqr);
1848 	if (sc->num_cqueues > sc->max_queues) {
1849 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1850 					sc->max_queues);
1851 		sc->num_cqueues = sc->max_queues;
1852 	}
1853 
1854 	/* Patch the command value which will be saved on callback's return */
1855 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1856 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1857 
1858 	sc->num_q_is_set = true;
1859 }
1860 
1861 static int
1862 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1863 	struct nvme_completion *compl)
1864 {
1865 	struct nvme_feature_obj *feat;
1866 	uint32_t nsid = command->nsid;
1867 	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
1868 	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);
1869 
1870 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1871 
1872 	if (fid >= NVME_FID_MAX) {
1873 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1874 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1875 		return (1);
1876 	}
1877 
1878 	if (sv) {
1879 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1880 		    NVME_SC_FEATURE_NOT_SAVEABLE);
1881 		return (1);
1882 	}
1883 
1884 	feat = &sc->feat[fid];
1885 
1886 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1887 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1888 		return (1);
1889 	}
1890 
1891 	if (!feat->namespace_specific &&
1892 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1893 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1894 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1895 		return (1);
1896 	}
1897 
1898 	compl->cdw0 = 0;
1899 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1900 
1901 	if (feat->set)
1902 		feat->set(sc, feat, command, compl);
1903 	else {
1904 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1905 		    NVME_SC_FEATURE_NOT_CHANGEABLE);
1906 		return (1);
1907 	}
1908 
1909 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1910 	if (compl->status == NVME_SC_SUCCESS) {
1911 		feat->cdw11 = command->cdw11;
1912 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1913 		    (command->cdw11 != 0))
1914 			pci_nvme_aen_notify(sc);
1915 	}
1916 
1917 	return (0);
1918 }
1919 
1920 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1921 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1922 
1923 static int
1924 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1925 	struct nvme_completion* compl)
1926 {
1927 	struct nvme_feature_obj *feat;
1928 	uint8_t fid = command->cdw10 & 0xFF;
1929 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
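	/* SEL (cdw10 bits 10:8): 0 = current, 1 = default, 2 = saved, 3 = supported capabilities */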
1930 
1931 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1932 
1933 	if (fid >= NVME_FID_MAX) {
1934 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1935 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1936 		return (1);
1937 	}
1938 
1939 	compl->cdw0 = 0;
1940 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1941 
1942 	feat = &sc->feat[fid];
1943 	if (feat->get) {
1944 		feat->get(sc, feat, command, compl);
1945 	}
1946 
1947 	if (compl->status == NVME_SC_SUCCESS) {
1948 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1949 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1950 		else
1951 			compl->cdw0 = feat->cdw11;
1952 	}
1953 
1954 	return (0);
1955 }
1956 
1957 static int
1958 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1959 	struct nvme_completion* compl)
1960 {
1961 	uint8_t	ses, lbaf, pi;
1962 
1963 	/* Only supports Secure Erase Setting - User Data Erase */
1964 	ses = (command->cdw10 >> 9) & 0x7;
1965 	if (ses > 0x1) {
1966 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1967 		return (1);
1968 	}
1969 
1970 	/* Only supports a single LBA Format */
1971 	lbaf = command->cdw10 & 0xf;
1972 	if (lbaf != 0) {
1973 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1974 		    NVME_SC_INVALID_FORMAT);
1975 		return (1);
1976 	}
1977 
1978 	/* Doesn't support Protection Information */
1979 	pi = (command->cdw10 >> 5) & 0x7;
1980 	if (pi != 0) {
1981 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1982 		return (1);
1983 	}
1984 
1985 	if (sc->nvstore.type == NVME_STOR_RAM) {
1986 		if (sc->nvstore.ctx)
1987 			free(sc->nvstore.ctx);
1988 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1989 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1990 	} else {
1991 		struct pci_nvme_ioreq *req;
1992 		int err;
1993 
1994 		req = pci_nvme_get_ioreq(sc);
1995 		if (req == NULL) {
1996 			pci_nvme_status_genc(&compl->status,
1997 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1998 			WPRINTF("%s: unable to allocate IO req", __func__);
1999 			return (1);
2000 		}
2001 		req->nvme_sq = &sc->submit_queues[0];
2002 		req->sqid = 0;
2003 		req->opc = command->opc;
2004 		req->cid = command->cid;
2005 		req->nsid = command->nsid;
2006 
2007 		req->io_req.br_offset = 0;
2008 		req->io_req.br_resid = sc->nvstore.size;
2009 		req->io_req.br_callback = pci_nvme_io_done;
2010 
2011 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
2012 		if (err) {
2013 			pci_nvme_status_genc(&compl->status,
2014 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2015 			pci_nvme_release_ioreq(sc, req);
2016 		} else
2017 			compl->status = NVME_NO_STATUS;
2018 	}
2019 
2020 	return (1);
2021 }
2022 
2023 static int
2024 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
2025     struct nvme_completion *compl)
2026 {
2027 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
2028 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
2029 
2030 	/* TODO: search for the command ID and abort it */
2031 
2032 	compl->cdw0 = 1;
2033 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2034 	return (1);
2035 }
2036 
2037 static int
2038 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2039 	struct nvme_command* command, struct nvme_completion* compl)
2040 {
2041 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2042 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
2043 
2044 	/* Don't exceed the Async Event Request Limit (AERL). */
2045 	if (pci_nvme_aer_limit_reached(sc)) {
2046 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
2047 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
2048 		return (1);
2049 	}
2050 
2051 	if (pci_nvme_aer_add(sc, command->cid)) {
2052 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
2053 				NVME_SC_INTERNAL_DEVICE_ERROR);
2054 		return (1);
2055 	}
2056 
2057 	/*
2058 	 * Raise events when they happen based on the Set Features cmd.
2059 	 * These events happen asynchronously, so do not complete the request
2060 	 * here; a completion is posted later when a matching event occurs.
2061 	 */
2062 	compl->status = NVME_NO_STATUS;
2063 	pci_nvme_aen_notify(sc);
2064 
2065 	return (0);
2066 }
2067 
2068 static void
2069 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2070 {
2071 	struct nvme_completion compl;
2072 	struct nvme_command *cmd;
2073 	struct nvme_submission_queue *sq;
2074 	struct nvme_completion_queue *cq;
2075 	uint16_t sqhead;
2076 
2077 	DPRINTF("%s index %u", __func__, (uint32_t)value);
2078 
2079 	sq = &sc->submit_queues[0];
2080 	cq = &sc->compl_queues[0];
2081 
2082 	pthread_mutex_lock(&sq->mtx);
2083 
2084 	sqhead = sq->head;
2085 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2086 
2087 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2088 		cmd = &(sq->qbase)[sqhead];
2089 		compl.cdw0 = 0;
2090 		compl.status = 0;
2091 
2092 		switch (cmd->opc) {
2093 		case NVME_OPC_DELETE_IO_SQ:
2094 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2095 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2096 			break;
2097 		case NVME_OPC_CREATE_IO_SQ:
2098 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2099 			nvme_opc_create_io_sq(sc, cmd, &compl);
2100 			break;
2101 		case NVME_OPC_DELETE_IO_CQ:
2102 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2103 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2104 			break;
2105 		case NVME_OPC_CREATE_IO_CQ:
2106 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2107 			nvme_opc_create_io_cq(sc, cmd, &compl);
2108 			break;
2109 		case NVME_OPC_GET_LOG_PAGE:
2110 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2111 			nvme_opc_get_log_page(sc, cmd, &compl);
2112 			break;
2113 		case NVME_OPC_IDENTIFY:
2114 			DPRINTF("%s command IDENTIFY", __func__);
2115 			nvme_opc_identify(sc, cmd, &compl);
2116 			break;
2117 		case NVME_OPC_ABORT:
2118 			DPRINTF("%s command ABORT", __func__);
2119 			nvme_opc_abort(sc, cmd, &compl);
2120 			break;
2121 		case NVME_OPC_SET_FEATURES:
2122 			DPRINTF("%s command SET_FEATURES", __func__);
2123 			nvme_opc_set_features(sc, cmd, &compl);
2124 			break;
2125 		case NVME_OPC_GET_FEATURES:
2126 			DPRINTF("%s command GET_FEATURES", __func__);
2127 			nvme_opc_get_features(sc, cmd, &compl);
2128 			break;
2129 		case NVME_OPC_FIRMWARE_ACTIVATE:
2130 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2131 			pci_nvme_status_tc(&compl.status,
2132 			    NVME_SCT_COMMAND_SPECIFIC,
2133 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2134 			break;
2135 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2136 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2137 			nvme_opc_async_event_req(sc, cmd, &compl);
2138 			break;
2139 		case NVME_OPC_FORMAT_NVM:
2140 			DPRINTF("%s command FORMAT_NVM", __func__);
2141 			if ((sc->ctrldata.oacs &
2142 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2143 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2144 				break;
2145 			}
2146 			nvme_opc_format_nvm(sc, cmd, &compl);
2147 			break;
2148 		case NVME_OPC_SECURITY_SEND:
2149 		case NVME_OPC_SECURITY_RECEIVE:
2150 		case NVME_OPC_SANITIZE:
2151 		case NVME_OPC_GET_LBA_STATUS:
2152 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2153 			    cmd->opc);
2154 			/* Valid but unsupported opcodes */
2155 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2156 			break;
2157 		default:
2158 			DPRINTF("%s command OPC=%#X (not implemented)",
2159 			    __func__,
2160 			    cmd->opc);
2161 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2162 		}
2163 		sqhead = (sqhead + 1) % sq->size;
2164 
2165 		if (NVME_COMPLETION_VALID(compl)) {
2166 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2167 			    compl.cdw0,
2168 			    cmd->cid,
2169 			    0,		/* SQID */
2170 			    compl.status);
2171 		}
2172 	}
2173 
2174 	DPRINTF("setting sqhead %u", sqhead);
2175 	sq->head = sqhead;
2176 
2177 	if (cq->head != cq->tail)
2178 		pci_generate_msix(sc->nsc_pi, 0);
2179 
2180 	pthread_mutex_unlock(&sq->mtx);
2181 }
2182 
2183 /*
2184  * Update the Write and Read statistics reported in SMART data
2185  *
2186  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2187  * E.g. 1 data unit is 1 - 1,000 512 byte blocks; 3 data units are 2,001 - 3,000
2188  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
2189  */
2190 static void
2191 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2192     size_t bytes, uint16_t status)
2193 {
2194 
2195 	pthread_mutex_lock(&sc->mtx);
2196 	switch (opc) {
2197 	case NVME_OPC_WRITE:
2198 		sc->write_commands++;
2199 		if (status != NVME_SC_SUCCESS)
2200 			break;
2201 		sc->write_dunits_remainder += (bytes / 512);
2202 		while (sc->write_dunits_remainder >= 1000) {
2203 			sc->write_data_units++;
2204 			sc->write_dunits_remainder -= 1000;
2205 		}
2206 		break;
2207 	case NVME_OPC_READ:
2208 		sc->read_commands++;
2209 		if (status != NVME_SC_SUCCESS)
2210 			break;
2211 		sc->read_dunits_remainder += (bytes / 512);
2212 		while (sc->read_dunits_remainder >= 1000) {
2213 			sc->read_data_units++;
2214 			sc->read_dunits_remainder -= 1000;
2215 		}
2216 		break;
2217 	default:
2218 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2219 		break;
2220 	}
2221 	pthread_mutex_unlock(&sc->mtx);
2222 }
2223 
2224 /*
2225  * Check if the combination of Starting LBA (slba) and number of blocks
2226  * exceeds the range of the underlying storage.
2227  *
2228  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2229  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2230  * overflow.
2231  */
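/*
 * For example, with 512 byte sectors (sectsz_bits == 9) any slba with a bit
 * set in positions 55..63 would overflow the 64-bit byte offset, which is
 * exactly what the initial shift test below rejects.
 */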
2232 static bool
2233 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2234     uint32_t nblocks)
2235 {
2236 	size_t	offset, bytes;
2237 
2238 	/* Overflow check of multiplying Starting LBA by the sector size */
2239 	if (slba >> (64 - nvstore->sectsz_bits))
2240 		return (true);
2241 
2242 	offset = slba << nvstore->sectsz_bits;
2243 	bytes = nblocks << nvstore->sectsz_bits;
2244 
2245 	/* Overflow check of Number of Logical Blocks */
2246 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2247 		return (true);
2248 
2249 	return (false);
2250 }
2251 
2252 static int
2253 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
2254     struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
2255 {
2256 	int iovidx;
2257 	bool range_is_contiguous;
2258 
2259 	if (req == NULL)
2260 		return (-1);
2261 
2262 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2263 		return (-1);
2264 	}
2265 
2266 	/*
2267 	 * Minimize the number of IOVs by concatenating contiguous address
2268 	 * ranges. If the IOV count is zero, there is no previous range to
2269 	 * concatenate.
2270 	 */
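	/*
	 * For example, two 4 KiB PRP entries at guest-physical 0x1000 and
	 * 0x2000 collapse into one 8 KiB iov entry, while a gap between them
	 * starts a new entry.
	 */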
2271 	if (req->io_req.br_iovcnt == 0)
2272 		range_is_contiguous = false;
2273 	else
2274 		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2275 
2276 	if (range_is_contiguous) {
2277 		iovidx = req->io_req.br_iovcnt - 1;
2278 
2279 		req->io_req.br_iov[iovidx].iov_base =
2280 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2281 				     req->prev_gpaddr, size);
2282 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2283 			return (-1);
2284 
2285 		req->prev_size += size;
2286 		req->io_req.br_resid += size;
2287 
2288 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2289 	} else {
2290 		iovidx = req->io_req.br_iovcnt;
2291 		if (iovidx == 0) {
2292 			req->io_req.br_offset = offset;
2293 			req->io_req.br_resid = 0;
2294 			req->io_req.br_param = req;
2295 		}
2296 
2297 		req->io_req.br_iov[iovidx].iov_base =
2298 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2299 				     gpaddr, size);
2300 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2301 			return (-1);
2302 
2303 		req->io_req.br_iov[iovidx].iov_len = size;
2304 
2305 		req->prev_gpaddr = gpaddr;
2306 		req->prev_size = size;
2307 		req->io_req.br_resid += size;
2308 
2309 		req->io_req.br_iovcnt++;
2310 	}
2311 
2312 	return (0);
2313 }
2314 
2315 static void
2316 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2317     struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
2318 {
2319 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2320 
2321 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2322 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2323 		 NVME_STATUS_GET_SC(status));
2324 
2325 	pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
2326 
2327 	if (cq->head != cq->tail) {
2328 		if (cq->intr_en & NVME_CQ_INTEN) {
2329 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2330 		} else {
2331 			DPRINTF("%s: CQ%u interrupt disabled",
2332 						__func__, sq->cqid);
2333 		}
2334 	}
2335 }
2336 
2337 static void
2338 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2339 {
2340 	req->sc = NULL;
2341 	req->nvme_sq = NULL;
2342 	req->sqid = 0;
2343 
2344 	pthread_mutex_lock(&sc->mtx);
2345 
2346 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2347 	sc->pending_ios--;
2348 
2349 	/* With no more I/O pending, mark the controller ready if it is enabled but not yet ready */
2350 	if (sc->pending_ios == 0 &&
2351 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2352 		sc->regs.csts |= NVME_CSTS_RDY;
2353 
2354 	pthread_mutex_unlock(&sc->mtx);
2355 
2356 	sem_post(&sc->iosemlock);
2357 }
2358 
2359 static struct pci_nvme_ioreq *
2360 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2361 {
2362 	struct pci_nvme_ioreq *req = NULL;
2363 
2364 	sem_wait(&sc->iosemlock);
2365 	pthread_mutex_lock(&sc->mtx);
2366 
2367 	req = STAILQ_FIRST(&sc->ioreqs_free);
2368 	assert(req != NULL);
2369 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2370 
2371 	req->sc = sc;
2372 
2373 	sc->pending_ios++;
2374 
2375 	pthread_mutex_unlock(&sc->mtx);
2376 
2377 	req->io_req.br_iovcnt = 0;
2378 	req->io_req.br_offset = 0;
2379 	req->io_req.br_resid = 0;
2380 	req->io_req.br_param = req;
2381 	req->prev_gpaddr = 0;
2382 	req->prev_size = 0;
2383 
2384 	return req;
2385 }
2386 
2387 static void
2388 pci_nvme_io_done(struct blockif_req *br, int err)
2389 {
2390 	struct pci_nvme_ioreq *req = br->br_param;
2391 	struct nvme_submission_queue *sq = req->nvme_sq;
2392 	uint16_t code, status;
2393 
2394 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2395 
2396 	/* TODO return correct error */
2397 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2398 	status = 0;
2399 	pci_nvme_status_genc(&status, code);
2400 
2401 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
2402 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2403 	    req->bytes, status);
2404 	pci_nvme_release_ioreq(req->sc, req);
2405 }
2406 
2407 /*
2408  * Implements the Flush command. The specification states:
2409  *    If a volatile write cache is not present, Flush commands complete
2410  *    successfully and have no effect
2411  * in the description of the Volatile Write Cache (VWC) field of the Identify
2412  * Controller data. Therefore, set status to Success if the command is
2413  * not supported (i.e. RAM or as indicated by the blockif).
2414  */
2415 static bool
2416 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
2417     struct nvme_command *cmd __unused,
2418     struct pci_nvme_blockstore *nvstore,
2419     struct pci_nvme_ioreq *req,
2420     uint16_t *status)
2421 {
2422 	bool pending = false;
2423 
2424 	if (nvstore->type == NVME_STOR_RAM) {
2425 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2426 	} else {
2427 		int err;
2428 
2429 		req->io_req.br_callback = pci_nvme_io_done;
2430 
2431 		err = blockif_flush(nvstore->ctx, &req->io_req);
2432 		switch (err) {
2433 		case 0:
2434 			pending = true;
2435 			break;
2436 		case EOPNOTSUPP:
2437 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2438 			break;
2439 		default:
2440 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2441 		}
2442 	}
2443 
2444 	return (pending);
2445 }
2446 
2447 static uint16_t
2448 nvme_write_read_ram(struct pci_nvme_softc *sc,
2449     struct pci_nvme_blockstore *nvstore,
2450     uint64_t prp1, uint64_t prp2,
2451     size_t offset, uint64_t bytes,
2452     bool is_write)
2453 {
2454 	uint8_t *buf = nvstore->ctx;
2455 	enum nvme_copy_dir dir;
2456 	uint16_t status;
2457 
2458 	if (is_write)
2459 		dir = NVME_COPY_TO_PRP;
2460 	else
2461 		dir = NVME_COPY_FROM_PRP;
2462 
2463 	status = 0;
2464 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2465 	    buf + offset, bytes, dir))
2466 		pci_nvme_status_genc(&status,
2467 		    NVME_SC_DATA_TRANSFER_ERROR);
2468 	else
2469 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2470 
2471 	return (status);
2472 }
2473 
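/*
 * Note on PRPs: PRP1 always maps the first (possibly unaligned) page of the
 * transfer. For transfers no larger than two pages, PRP2 is simply the
 * second data pointer; for larger transfers it points to a PRP list, and the
 * last entry of each list page chains to the next list page.
 */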
2474 static uint16_t
2475 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2476     struct pci_nvme_blockstore *nvstore,
2477     struct pci_nvme_ioreq *req,
2478     uint64_t prp1, uint64_t prp2,
2479     size_t offset, uint64_t bytes,
2480     bool is_write)
2481 {
2482 	uint64_t size;
2483 	int err;
2484 	uint16_t status = NVME_NO_STATUS;
2485 
2486 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2487 	if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
2488 		err = -1;
2489 		goto out;
2490 	}
2491 
2492 	offset += size;
2493 	bytes  -= size;
2494 
2495 	if (bytes == 0) {
2496 		;
2497 	} else if (bytes <= PAGE_SIZE) {
2498 		size = bytes;
2499 		if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
2500 			err = -1;
2501 			goto out;
2502 		}
2503 	} else {
2504 		void *vmctx = sc->nsc_pi->pi_vmctx;
2505 		uint64_t *prp_list = &prp2;
2506 		uint64_t *last = prp_list;
2507 
2508 		/* PRP2 is pointer to a physical region page list */
2509 		/* PRP2 is a pointer to a physical region page list */
2510 			/* Last entry in list points to the next list */
2511 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2512 				uint64_t prp = *prp_list;
2513 
2514 				prp_list = paddr_guest2host(vmctx, prp,
2515 				    PAGE_SIZE - (prp % PAGE_SIZE));
2516 				if (prp_list == NULL) {
2517 					err = -1;
2518 					goto out;
2519 				}
2520 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2521 			}
2522 
2523 			size = MIN(bytes, PAGE_SIZE);
2524 
2525 			if (pci_nvme_append_iov_req(sc, req, *prp_list, size,
2526 			    offset)) {
2527 				err = -1;
2528 				goto out;
2529 			}
2530 
2531 			offset += size;
2532 			bytes  -= size;
2533 
2534 			prp_list++;
2535 		}
2536 	}
2537 	req->io_req.br_callback = pci_nvme_io_done;
2538 	if (is_write)
2539 		err = blockif_write(nvstore->ctx, &req->io_req);
2540 	else
2541 		err = blockif_read(nvstore->ctx, &req->io_req);
2542 out:
2543 	if (err)
2544 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2545 
2546 	return (status);
2547 }
2548 
2549 static bool
2550 nvme_opc_write_read(struct pci_nvme_softc *sc,
2551     struct nvme_command *cmd,
2552     struct pci_nvme_blockstore *nvstore,
2553     struct pci_nvme_ioreq *req,
2554     uint16_t *status)
2555 {
2556 	uint64_t lba, nblocks, bytes;
2557 	size_t offset;
2558 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2559 	bool pending = false;
2560 
2561 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
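	/* cdw12 bits 15:0 hold the Number of Logical Blocks, a 0's based value */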
2562 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2563 	bytes = nblocks << nvstore->sectsz_bits;
2564 	if (bytes > NVME_MAX_DATA_SIZE) {
2565 		WPRINTF("%s command would exceed MDTS", __func__);
2566 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2567 		goto out;
2568 	}
2569 
2570 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2571 		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2572 		    __func__, lba, nblocks);
2573 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2574 		goto out;
2575 	}
2576 
2577 	offset = lba << nvstore->sectsz_bits;
2578 
2579 	req->bytes = bytes;
2580 	req->io_req.br_offset = lba;
2581 
2582 	/* PRP bits 1:0 must be zero */
2583 	cmd->prp1 &= ~0x3UL;
2584 	cmd->prp2 &= ~0x3UL;
2585 
2586 	if (nvstore->type == NVME_STOR_RAM) {
2587 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2588 		    cmd->prp2, offset, bytes, is_write);
2589 	} else {
2590 		*status = nvme_write_read_blockif(sc, nvstore, req,
2591 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2592 
2593 		if (*status == NVME_NO_STATUS)
2594 			pending = true;
2595 	}
2596 out:
2597 	if (!pending)
2598 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2599 
2600 	return (pending);
2601 }
2602 
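/*
 * Completion callback used as a simple state machine for multi-range
 * Deallocate requests. nvme_opc_dataset_mgmt() stages the ranges in br_iov
 * as (byte offset, byte length) pairs, reusing prev_gpaddr as the index of
 * the range just completed and prev_size as the total range count; each
 * completion issues blockif_delete() for the next range until all ranges
 * are done or an error occurs.
 */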
2603 static void
2604 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2605 {
2606 	struct pci_nvme_ioreq *req = br->br_param;
2607 	struct pci_nvme_softc *sc = req->sc;
2608 	bool done = true;
2609 	uint16_t status;
2610 
2611 	status = 0;
2612 	if (err) {
2613 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2614 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2615 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2616 	} else {
2617 		struct iovec *iov = req->io_req.br_iov;
2618 
2619 		req->prev_gpaddr++;
2620 		iov += req->prev_gpaddr;
2621 
2622 		/* The iov_* values already include the sector size */
2623 		req->io_req.br_offset = (off_t)iov->iov_base;
2624 		req->io_req.br_resid = iov->iov_len;
2625 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2626 			pci_nvme_status_genc(&status,
2627 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2628 		} else
2629 			done = false;
2630 	}
2631 
2632 	if (done) {
2633 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
2634 		    status);
2635 		pci_nvme_release_ioreq(sc, req);
2636 	}
2637 }
2638 
2639 static bool
2640 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2641     struct nvme_command *cmd,
2642     struct pci_nvme_blockstore *nvstore,
2643     struct pci_nvme_ioreq *req,
2644     uint16_t *status)
2645 {
2646 	struct nvme_dsm_range *range = NULL;
2647 	uint32_t nr, r, non_zero, dr;
2648 	int err;
2649 	bool pending = false;
2650 
2651 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2652 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2653 		goto out;
2654 	}
2655 
2656 	nr = cmd->cdw10 & 0xff;
2657 
2658 	/* copy locally because a range entry could straddle PRPs */
2659 #ifdef	__FreeBSD__
2660 	range = calloc(1, NVME_MAX_DSM_TRIM);
2661 #else
2662 	_Static_assert(NVME_MAX_DSM_TRIM % sizeof(struct nvme_dsm_range) == 0,
2663 	    "NVME_MAX_DSM_TRIM is not a multiple of struct size");
2664 	range = calloc(NVME_MAX_DSM_TRIM / sizeof (*range), sizeof (*range));
2665 #endif
2666 	if (range == NULL) {
2667 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2668 		goto out;
2669 	}
2670 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2671 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2672 
2673 	/* Check for invalid ranges and the number of non-zero lengths */
2674 	non_zero = 0;
2675 	for (r = 0; r <= nr; r++) {
2676 		if (pci_nvme_out_of_range(nvstore,
2677 		    range[r].starting_lba, range[r].length)) {
2678 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2679 			goto out;
2680 		}
2681 		if (range[r].length != 0)
2682 			non_zero++;
2683 	}
2684 
2685 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2686 		size_t offset, bytes;
2687 		int sectsz_bits = sc->nvstore.sectsz_bits;
2688 
2689 		/*
2690 		 * DSM calls are advisory only, and compliant controllers
2691 		 * may choose to take no actions (i.e. return Success).
2692 		 */
2693 		if (!nvstore->deallocate) {
2694 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2695 			goto out;
2696 		}
2697 
2698 		/* If all ranges have a zero length, return Success */
2699 		if (non_zero == 0) {
2700 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2701 			goto out;
2702 		}
2703 
2704 		if (req == NULL) {
2705 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2706 			goto out;
2707 		}
2708 
2709 		offset = range[0].starting_lba << sectsz_bits;
2710 		bytes = range[0].length << sectsz_bits;
2711 
2712 		/*
2713 		 * If the request is for more than a single range, store
2714 		 * the ranges in the br_iov. Optimize for the common case
2715 		 * of a single range.
2716 		 *
2717 		 * Note that NVMe Number of Ranges is a zero based value
2718 		 */
2719 		req->io_req.br_iovcnt = 0;
2720 		req->io_req.br_offset = offset;
2721 		req->io_req.br_resid = bytes;
2722 
2723 		if (nr == 0) {
2724 			req->io_req.br_callback = pci_nvme_io_done;
2725 		} else {
2726 			struct iovec *iov = req->io_req.br_iov;
2727 
2728 			for (r = 0, dr = 0; r <= nr; r++) {
2729 				offset = range[r].starting_lba << sectsz_bits;
2730 				bytes = range[r].length << sectsz_bits;
2731 				if (bytes == 0)
2732 					continue;
2733 
2734 				if ((nvstore->size - offset) < bytes) {
2735 					pci_nvme_status_genc(status,
2736 					    NVME_SC_LBA_OUT_OF_RANGE);
2737 					goto out;
2738 				}
2739 				iov[dr].iov_base = (void *)offset;
2740 				iov[dr].iov_len = bytes;
2741 				dr++;
2742 			}
2743 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2744 
2745 			/*
2746 			 * Use prev_gpaddr to track the current entry and
2747 			 * prev_size to track the number of entries
2748 			 */
2749 			req->prev_gpaddr = 0;
2750 			req->prev_size = dr;
2751 		}
2752 
2753 		err = blockif_delete(nvstore->ctx, &req->io_req);
2754 		if (err)
2755 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2756 		else
2757 			pending = true;
2758 	}
2759 out:
2760 	free(range);
2761 	return (pending);
2762 }
2763 
2764 static void
2765 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2766 {
2767 	struct nvme_submission_queue *sq;
2768 	uint16_t status;
2769 	uint16_t sqhead;
2770 
2771 	/* handle all submissions up to sq->tail index */
2772 	sq = &sc->submit_queues[idx];
2773 
2774 	pthread_mutex_lock(&sq->mtx);
2775 
2776 	sqhead = sq->head;
2777 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2778 	         idx, sqhead, sq->tail, sq->qbase);
2779 
2780 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2781 		struct nvme_command *cmd;
2782 		struct pci_nvme_ioreq *req;
2783 		uint32_t nsid;
2784 		bool pending;
2785 
2786 		pending = false;
2787 		req = NULL;
2788 		status = 0;
2789 
2790 		cmd = &sq->qbase[sqhead];
2791 		sqhead = (sqhead + 1) % sq->size;
2792 
2793 		nsid = le32toh(cmd->nsid);
2794 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2795 			pci_nvme_status_genc(&status,
2796 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2797 			status |=
2798 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2799 			goto complete;
2800  		}
2801 
2802 		req = pci_nvme_get_ioreq(sc);
2803 		if (req == NULL) {
2804 			pci_nvme_status_genc(&status,
2805 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2806 			WPRINTF("%s: unable to allocate IO req", __func__);
2807 			goto complete;
2808 		}
2809 		req->nvme_sq = sq;
2810 		req->sqid = idx;
2811 		req->opc = cmd->opc;
2812 		req->cid = cmd->cid;
2813 		req->nsid = cmd->nsid;
2814 
2815 		switch (cmd->opc) {
2816 		case NVME_OPC_FLUSH:
2817 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2818 			    req, &status);
2819  			break;
2820 		case NVME_OPC_WRITE:
2821 		case NVME_OPC_READ:
2822 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2823 			    req, &status);
2824 			break;
2825 		case NVME_OPC_WRITE_ZEROES:
2826 			/* TODO: write zeroes
2827 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2828 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2829 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2830 			break;
2831 		case NVME_OPC_DATASET_MANAGEMENT:
2832  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2833 			    req, &status);
2834 			break;
2835  		default:
2836  			WPRINTF("%s unhandled io command 0x%x",
2837 			    __func__, cmd->opc);
2838 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2839 		}
2840 complete:
2841 		if (!pending) {
2842 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
2843 			if (req != NULL)
2844 				pci_nvme_release_ioreq(sc, req);
2845 		}
2846 	}
2847 
2848 	sq->head = sqhead;
2849 
2850 	pthread_mutex_unlock(&sq->mtx);
2851 }
2852 
2853 static void
2854 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc,
2855 	uint64_t idx, int is_sq, uint64_t value)
2856 {
2857 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2858 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2859 
2860 	if (is_sq) {
2861 		if (idx > sc->num_squeues) {
2862 			WPRINTF("%s queue index %lu overflow from "
2863 			         "guest (max %u)",
2864 			         __func__, idx, sc->num_squeues);
2865 			return;
2866 		}
2867 
2868 		atomic_store_short(&sc->submit_queues[idx].tail,
2869 		                   (uint16_t)value);
2870 
2871 		if (idx == 0) {
2872 			pci_nvme_handle_admin_cmd(sc, value);
2873 		} else {
2874 			/* submission queue; handle new entries in SQ */
2875 			if (idx > sc->num_squeues) {
2876 				WPRINTF("%s SQ index %lu overflow from "
2877 				         "guest (max %u)",
2878 				         __func__, idx, sc->num_squeues);
2879 				return;
2880 			}
2881 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2882 		}
2883 	} else {
2884 		if (idx > sc->num_cqueues) {
2885 			WPRINTF("%s queue index %lu overflow from "
2886 			         "guest (max %u)",
2887 			         __func__, idx, sc->num_cqueues);
2888 			return;
2889 		}
2890 
2891 		atomic_store_short(&sc->compl_queues[idx].head,
2892 				(uint16_t)value);
2893 	}
2894 }
2895 
2896 static void
2897 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2898 {
2899 	const char *s = iswrite ? "WRITE" : "READ";
2900 
2901 	switch (offset) {
2902 	case NVME_CR_CAP_LOW:
2903 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2904 		break;
2905 	case NVME_CR_CAP_HI:
2906 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2907 		break;
2908 	case NVME_CR_VS:
2909 		DPRINTF("%s %s NVME_CR_VS", func, s);
2910 		break;
2911 	case NVME_CR_INTMS:
2912 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2913 		break;
2914 	case NVME_CR_INTMC:
2915 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2916 		break;
2917 	case NVME_CR_CC:
2918 		DPRINTF("%s %s NVME_CR_CC", func, s);
2919 		break;
2920 	case NVME_CR_CSTS:
2921 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2922 		break;
2923 	case NVME_CR_NSSR:
2924 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2925 		break;
2926 	case NVME_CR_AQA:
2927 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2928 		break;
2929 	case NVME_CR_ASQ_LOW:
2930 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2931 		break;
2932 	case NVME_CR_ASQ_HI:
2933 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2934 		break;
2935 	case NVME_CR_ACQ_LOW:
2936 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2937 		break;
2938 	case NVME_CR_ACQ_HI:
2939 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2940 		break;
2941 	default:
2942 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2943 	}
2944 
2945 }
2946 
2947 static void
2948 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2949 	uint64_t offset, int size, uint64_t value)
2950 {
2951 	uint32_t ccreg;
2952 
2953 	if (offset >= NVME_DOORBELL_OFFSET) {
2954 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2955 		uint64_t idx = belloffset / 8; /* doorbell size = 2*int */
2956 		int is_sq = (belloffset % 8) < 4;
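		/*
		 * Each queue pair owns an 8 byte doorbell slot (stride 0):
		 * bytes 0-3 are the SQ tail doorbell and bytes 4-7 the CQ
		 * head doorbell, hence the divide by 8 and the test of the
		 * low half above.
		 */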
2957 
2958 		if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
2959 			WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
2960 			    offset);
2961 			return;
2962 		}
2963 
2964 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2965 			WPRINTF("guest attempted an overflow write offset "
2966 			         "0x%lx, val 0x%lx in %s",
2967 			         offset, value, __func__);
2968 			return;
2969 		}
2970 
2971 		if (is_sq) {
2972 			if (sc->submit_queues[idx].qbase == NULL)
2973 				return;
2974 		} else if (sc->compl_queues[idx].qbase == NULL)
2975 			return;
2976 
2977 		pci_nvme_handle_doorbell(sc, idx, is_sq, value);
2978 		return;
2979 	}
2980 
2981 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2982 	        offset, size, value);
2983 
2984 	if (size != 4) {
2985 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2986 		         "val 0x%lx) to bar0 in %s",
2987 		         size, offset, value, __func__);
2988 		/* TODO: shutdown device */
2989 		return;
2990 	}
2991 
2992 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2993 
2994 	pthread_mutex_lock(&sc->mtx);
2995 
2996 	switch (offset) {
2997 	case NVME_CR_CAP_LOW:
2998 	case NVME_CR_CAP_HI:
2999 		/* readonly */
3000 		break;
3001 	case NVME_CR_VS:
3002 		/* readonly */
3003 		break;
3004 	case NVME_CR_INTMS:
3005 		/* MSI-X, so ignore */
3006 		break;
3007 	case NVME_CR_INTMC:
3008 		/* MSI-X, so ignore */
3009 		break;
3010 	case NVME_CR_CC:
3011 		ccreg = (uint32_t)value;
3012 
3013 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
3014 		         "iocqes %u",
3015 		        __func__,
3016 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
3017 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
3018 			 NVME_CC_GET_IOCQES(ccreg));
3019 
3020 		if (NVME_CC_GET_SHN(ccreg)) {
3021 			/* perform shutdown - flush out data to backend */
3022 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
3023 			    NVME_CSTS_REG_SHST_SHIFT);
3024 			sc->regs.csts |= NVME_SHST_COMPLETE <<
3025 			    NVME_CSTS_REG_SHST_SHIFT;
3026 		}
3027 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
3028 			if (NVME_CC_GET_EN(ccreg) == 0)
3029 				/* transition 1->0 causes controller reset */
3030 				pci_nvme_reset_locked(sc);
3031 			else
3032 				pci_nvme_init_controller(ctx, sc);
3033 		}
3034 
3035 		/* Insert the iocqes, iosqes and en bits from the write */
3036 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
3037 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
3038 		if (NVME_CC_GET_EN(ccreg) == 0) {
3039 			/* Insert the ams, mps and css bit fields */
3040 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
3041 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
3042 			sc->regs.csts &= ~NVME_CSTS_RDY;
3043 		} else if ((sc->pending_ios == 0) &&
3044 		    !(sc->regs.csts & NVME_CSTS_CFS)) {
3045 			sc->regs.csts |= NVME_CSTS_RDY;
3046 		}
3047 		break;
3048 	case NVME_CR_CSTS:
3049 		break;
3050 	case NVME_CR_NSSR:
3051 		/* ignore writes; don't support subsystem reset */
3052 		break;
3053 	case NVME_CR_AQA:
3054 		sc->regs.aqa = (uint32_t)value;
3055 		break;
3056 	case NVME_CR_ASQ_LOW:
3057 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
3058 		               (0xFFFFF000 & value);
3059 		break;
3060 	case NVME_CR_ASQ_HI:
3061 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
3062 		               (value << 32);
3063 		break;
3064 	case NVME_CR_ACQ_LOW:
3065 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
3066 		               (0xFFFFF000 & value);
3067 		break;
3068 	case NVME_CR_ACQ_HI:
3069 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3070 		               (value << 32);
3071 		break;
3072 	default:
3073 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3074 		         __func__, offset, value, size);
3075 	}
3076 	pthread_mutex_unlock(&sc->mtx);
3077 }
3078 
3079 static void
3080 pci_nvme_write(struct vmctx *ctx, struct pci_devinst *pi,
3081     int baridx, uint64_t offset, int size, uint64_t value)
3082 {
3083 	struct pci_nvme_softc* sc = pi->pi_arg;
3084 
3085 	if (baridx == pci_msix_table_bar(pi) ||
3086 	    baridx == pci_msix_pba_bar(pi)) {
3087 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3088 		         " value 0x%lx", baridx, offset, size, value);
3089 
3090 		pci_emul_msix_twrite(pi, offset, size, value);
3091 		return;
3092 	}
3093 
3094 	switch (baridx) {
3095 	case 0:
3096 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
3097 		break;
3098 
3099 	default:
3100 		DPRINTF("%s unknown baridx %d, val 0x%lx",
3101 		         __func__, baridx, value);
3102 	}
3103 }
3104 
3105 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3106 	uint64_t offset, int size)
3107 {
3108 	uint64_t value;
3109 
3110 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3111 
3112 	if (offset < NVME_DOORBELL_OFFSET) {
3113 		void *p = &(sc->regs);
3114 		pthread_mutex_lock(&sc->mtx);
3115 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3116 		pthread_mutex_unlock(&sc->mtx);
3117 	} else {
3118 		value = 0;
3119 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
3120 	}
3121 
3122 	switch (size) {
3123 	case 1:
3124 		value &= 0xFF;
3125 		break;
3126 	case 2:
3127 		value &= 0xFFFF;
3128 		break;
3129 	case 4:
3130 		value &= 0xFFFFFFFF;
3131 		break;
3132 	}
3133 
3134 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3135 	         offset, size, (uint32_t)value);
3136 
3137 	return (value);
3138 }
3139 
3140 
3141 
3142 static uint64_t
3143 pci_nvme_read(struct vmctx *ctx __unused,
3144     struct pci_devinst *pi, int baridx, uint64_t offset, int size)
3145 {
3146 	struct pci_nvme_softc* sc = pi->pi_arg;
3147 
3148 	if (baridx == pci_msix_table_bar(pi) ||
3149 	    baridx == pci_msix_pba_bar(pi)) {
3150 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3151 		        baridx, offset, size);
3152 
3153 		return pci_emul_msix_tread(pi, offset, size);
3154 	}
3155 
3156 	switch (baridx) {
3157 	case 0:
3158 		return pci_nvme_read_bar_0(sc, offset, size);
3159 
3160 	default:
3161 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3162 	}
3163 
3164 	return (0);
3165 }
3166 
3167 static int
3168 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3169 {
3170 	char bident[sizeof("XXX:XXX")];
3171 	const char *value;
3172 	uint32_t sectsz;
3173 
3174 	sc->max_queues = NVME_QUEUES;
3175 	sc->max_qentries = NVME_MAX_QENTRIES;
3176 	sc->ioslots = NVME_IOSLOTS;
3177 	sc->num_squeues = sc->max_queues;
3178 	sc->num_cqueues = sc->max_queues;
3179 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3180 	sectsz = 0;
3181 #ifdef	__FreeBSD__
3182 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3183 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3184 #else
3185 	snprintf((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3186 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3187 #endif
3188 
3189 	value = get_config_value_node(nvl, "maxq");
3190 	if (value != NULL)
3191 		sc->max_queues = atoi(value);
3192 	value = get_config_value_node(nvl, "qsz");
3193 	if (value != NULL) {
3194 		sc->max_qentries = atoi(value);
3195 		if (sc->max_qentries <= 0) {
3196 			EPRINTLN("nvme: Invalid qsz option %d",
3197 			    sc->max_qentries);
3198 			return (-1);
3199 		}
3200 	}
3201 	value = get_config_value_node(nvl, "ioslots");
3202 	if (value != NULL) {
3203 		sc->ioslots = atoi(value);
3204 		if (sc->ioslots <= 0) {
3205 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3206 			return (-1);
3207 		}
3208 	}
3209 	value = get_config_value_node(nvl, "sectsz");
3210 	if (value != NULL)
3211 		sectsz = atoi(value);
3212 	value = get_config_value_node(nvl, "ser");
3213 	if (value != NULL) {
3214 		/*
3215 		 * This field indicates the Product Serial Number in
3216 		 * 7-bit ASCII; unused bytes should be space characters.
3217 		 * Ref: NVMe v1.3c.
3218 		 */
3219 		cpywithpad((char *)sc->ctrldata.sn,
3220 		    sizeof(sc->ctrldata.sn), value, ' ');
3221 	}
3222 	value = get_config_value_node(nvl, "eui64");
3223 	if (value != NULL)
3224 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3225 	value = get_config_value_node(nvl, "dsm");
3226 	if (value != NULL) {
3227 		if (strcmp(value, "auto") == 0)
3228 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3229 		else if (strcmp(value, "enable") == 0)
3230 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3231 		else if (strcmp(value, "disable") == 0)
3232 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3233 	}
3234 
3235 	value = get_config_value_node(nvl, "ram");
3236 	if (value != NULL) {
3237 		uint64_t sz = strtoull(value, NULL, 10);
3238 
3239 		sc->nvstore.type = NVME_STOR_RAM;
3240 		sc->nvstore.size = sz * 1024 * 1024;
3241 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3242 		sc->nvstore.sectsz = 4096;
3243 		sc->nvstore.sectsz_bits = 12;
3244 		if (sc->nvstore.ctx == NULL) {
3245 			EPRINTLN("nvme: Unable to allocate RAM");
3246 			return (-1);
3247 		}
3248 	} else {
3249 		snprintf(bident, sizeof(bident), "%u:%u",
3250 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3251 		sc->nvstore.ctx = blockif_open(nvl, bident);
3252 		if (sc->nvstore.ctx == NULL) {
3253 			EPRINTLN("nvme: Could not open backing file: %s",
3254 			    strerror(errno));
3255 			return (-1);
3256 		}
3257 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3258 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3259 	}
3260 
3261 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3262 		sc->nvstore.sectsz = sectsz;
3263 	else if (sc->nvstore.type != NVME_STOR_RAM)
3264 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
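	/* Derive sectsz_bits as log2(sectsz), e.g. 9 for 512 and 12 for 4096 byte sectors */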
3265 	for (sc->nvstore.sectsz_bits = 9;
3266 	     (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3267 	     sc->nvstore.sectsz_bits++);
3268 
3269 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3270 		sc->max_queues = NVME_QUEUES;
3271 
3272 	return (0);
3273 }
3274 
3275 static void
3276 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
3277     size_t new_size)
3278 {
3279 	struct pci_nvme_softc *sc;
3280 	struct pci_nvme_blockstore *nvstore;
3281 	struct nvme_namespace_data *nd;
3282 
3283 	sc = arg;
3284 	nvstore = &sc->nvstore;
3285 	nd = &sc->nsdata;
3286 
3287 	nvstore->size = new_size;
3288 	pci_nvme_init_nsdata_size(nvstore, nd);
3289 
3290 	/* Add changed NSID to list */
3291 	sc->ns_log.ns[0] = 1;
3292 	sc->ns_log.ns[1] = 0;
3293 
3294 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3295 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3296 }
3297 
3298 static int
3299 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl)
3300 {
3301 	struct pci_nvme_softc *sc;
3302 	uint32_t pci_membar_sz;
3303 	int	error;
3304 
3305 	error = 0;
3306 
3307 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3308 	pi->pi_arg = sc;
3309 	sc->nsc_pi = pi;
3310 
3311 	error = pci_nvme_parse_config(sc, nvl);
3312 	if (error < 0)
3313 		goto done;
3314 	else
3315 		error = 0;
3316 
3317 	STAILQ_INIT(&sc->ioreqs_free);
3318 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3319 	for (uint32_t i = 0; i < sc->ioslots; i++) {
3320 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3321 	}
3322 
3323 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3324 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3325 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3326 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3327 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3328 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3329 
3330 	/*
3331 	 * Allocate size of NVMe registers + doorbell space for all queues.
3332 	 *
3333 	 * The specification requires a minimum memory I/O window size of 16K.
3334 	 * The Windows driver will refuse to start a device with a smaller
3335 	 * window.
3336 	 */
3337 	pci_membar_sz = sizeof(struct nvme_registers) +
3338 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3339 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
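	/*
	 * E.g. for 16 queue pairs the doorbell area is only 2 * 4 * 17 = 136
	 * bytes, so the BAR size is normally dominated by the
	 * NVME_MMIO_SPACE_MIN (16 KiB) floor.
	 */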
3340 
3341 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3342 
3343 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3344 	if (error) {
3345 		WPRINTF("%s pci alloc mem bar failed", __func__);
3346 		goto done;
3347 	}
3348 
3349 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3350 	if (error) {
3351 		WPRINTF("%s pci add msixcap failed", __func__);
3352 		goto done;
3353 	}
3354 
3355 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3356 	if (error) {
3357 		WPRINTF("%s pci add Express capability failed", __func__);
3358 		goto done;
3359 	}
3360 
3361 	pthread_mutex_init(&sc->mtx, NULL);
3362 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3363 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3364 
3365 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3366 	/*
3367 	 * Controller data depends on Namespace data so initialize Namespace
3368 	 * data first.
3369 	 */
3370 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3371 	pci_nvme_init_ctrldata(sc);
3372 	pci_nvme_init_logpages(sc);
3373 	pci_nvme_init_features(sc);
3374 
3375 	pci_nvme_aer_init(sc);
3376 	pci_nvme_aen_init(sc);
3377 
3378 	pci_nvme_reset(sc);
3379 
3380 	pci_lintr_request(pi);
3381 
3382 done:
3383 	return (error);
3384 }
3385 
3386 static int
3387 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3388 {
3389 	char *cp, *ram;
3390 
3391 	if (opts == NULL)
3392 		return (0);
3393 
3394 	if (strncmp(opts, "ram=", 4) == 0) {
3395 		cp = strchr(opts, ',');
3396 		if (cp == NULL) {
3397 			set_config_value_node(nvl, "ram", opts + 4);
3398 			return (0);
3399 		}
3400 		ram = strndup(opts + 4, cp - opts - 4);
3401 		set_config_value_node(nvl, "ram", ram);
3402 		free(ram);
3403 		return (pci_parse_legacy_config(nvl, cp + 1));
3404 	} else
3405 		return (blockif_legacy_config(nvl, opts));
3406 }
3407 
3408 static const struct pci_devemu pci_de_nvme = {
3409 	.pe_emu =	"nvme",
3410 	.pe_init =	pci_nvme_init,
3411 	.pe_legacy_config = pci_nvme_legacy_config,
3412 	.pe_barwrite =	pci_nvme_write,
3413 	.pe_barread =	pci_nvme_read
3414 };
3415 PCI_EMUL_SET(pci_de_nvme);
3416