xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 15cebe3d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
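
/*
 * For example (illustrative slot, serial, and paths only), a 4 GiB
 * RAM-backed namespace with 4096-byte sectors could be configured as:
 *
 *   -s 4,nvme,ram=4096,sectsz=4096,ser=NVME0001
 *
 * while a namespace backed by an existing block device or image might use:
 *
 *   -s 4,nvme,/dev/zvol/tank/vol0,maxq=8,ioslots=16
 */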
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80 
81 #include <dev/nvme/nvme.h>
82 
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88 
89 
90 static int nvme_debug = 0;
91 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
92 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93 
94 /* defaults; can be overridden */
95 #define	NVME_MSIX_BAR		4
96 
97 #define	NVME_IOSLOTS		8
98 
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN	(1 << 14)
101 
102 #define	NVME_QUEUES		16
103 #define	NVME_MAX_QENTRIES	2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define	NVME_MPSMIN		0
106 /* MPSMIN converted to bytes */
107 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
108 
109 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
110 #define	NVME_MDTS		9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
113 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
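
/*
 * With the defaults above (MDTS = 9 and MPSMIN = 0, i.e. 4 KiB pages), the
 * advertised maximum data transfer is (1 << 9) * 4096 bytes = 2 MiB per
 * command, described by at most 513 page descriptors (the extra entry
 * covers a first descriptor that is not page aligned).
 */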
114 
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS		0xffff
117 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
118 
119 /* Reported temperature in Kelvin (i.e. room temperature) */
120 #define NVME_TEMPERATURE 296
121 
122 /* helpers */
123 
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)		((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)		((one)  - 1)
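
/*
 * Many NVMe fields (queue sizes, queue counts, AERL, ...) are "0's based",
 * i.e. a raw value of 3 means 4 entries. For example, ONE_BASED(3) == 4 and
 * ZERO_BASED(4) == 3.
 */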
128 
129 /* Encode number of SQ's and CQ's for Set/Get Features */
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132 	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
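
/*
 * For example, with num_squeues = 8 and num_cqueues = 8 the macro yields
 * 0x00070007: the 0's based CQ count in bits 31:16 and the 0's based SQ
 * count in bits 15:0, as returned for the Number of Queues feature.
 */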
133 
134 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
135 
136 enum nvme_controller_register_offsets {
137 	NVME_CR_CAP_LOW = 0x00,
138 	NVME_CR_CAP_HI  = 0x04,
139 	NVME_CR_VS      = 0x08,
140 	NVME_CR_INTMS   = 0x0c,
141 	NVME_CR_INTMC   = 0x10,
142 	NVME_CR_CC      = 0x14,
143 	NVME_CR_CSTS    = 0x1c,
144 	NVME_CR_NSSR    = 0x20,
145 	NVME_CR_AQA     = 0x24,
146 	NVME_CR_ASQ_LOW = 0x28,
147 	NVME_CR_ASQ_HI  = 0x2c,
148 	NVME_CR_ACQ_LOW = 0x30,
149 	NVME_CR_ACQ_HI  = 0x34,
150 };
151 
152 enum nvme_cmd_cdw11 {
153 	NVME_CMD_CDW11_PC  = 0x0001,
154 	NVME_CMD_CDW11_IEN = 0x0002,
155 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157 
158 enum nvme_copy_dir {
159 	NVME_COPY_TO_PRP,
160 	NVME_COPY_FROM_PRP,
161 };
162 
163 #define	NVME_CQ_INTEN	0x01
164 #define	NVME_CQ_INTCOAL	0x02
165 
166 struct nvme_completion_queue {
167 	struct nvme_completion *qbase;
168 	pthread_mutex_t	mtx;
169 	uint32_t	size;
170 	uint16_t	tail; /* nvme progress */
171 	uint16_t	head; /* guest progress */
172 	uint16_t	intr_vec;
173 	uint32_t	intr_en;
174 };
175 
176 struct nvme_submission_queue {
177 	struct nvme_command *qbase;
178 	pthread_mutex_t	mtx;
179 	uint32_t	size;
180 	uint16_t	head; /* nvme progress */
181 	uint16_t	tail; /* guest progress */
182 	uint16_t	cqid; /* completion queue id */
183 	int		qpriority;
184 };
185 
186 enum nvme_storage_type {
187 	NVME_STOR_BLOCKIF = 0,
188 	NVME_STOR_RAM = 1,
189 };
190 
191 struct pci_nvme_blockstore {
192 	enum nvme_storage_type type;
193 	void		*ctx;
194 	uint64_t	size;
195 	uint32_t	sectsz;
196 	uint32_t	sectsz_bits;
197 	uint64_t	eui64;
198 	uint32_t	deallocate:1;
199 };
200 
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Max Data Transfer (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	  0 )
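
/*
 * For example, with NVME_MDTS = 9, NVME_MAX_IOVEC is 513; assuming a
 * BLOCKIF_IOV_MAX of 128 (see block_if.h for the actual value), each ioreq
 * carries 385 iovec entries of padding beyond those in struct blockif_req.
 */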
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 typedef enum {
258 	PCI_NVME_AE_TYPE_ERROR = 0,
259 	PCI_NVME_AE_TYPE_SMART,
260 	PCI_NVME_AE_TYPE_NOTICE,
261 	PCI_NVME_AE_TYPE_IO_CMD = 6,
262 	PCI_NVME_AE_TYPE_VENDOR = 7,
263 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
264 } pci_nvme_async_type;
265 
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268 	STAILQ_ENTRY(pci_nvme_aer) link;
269 	uint16_t	cid;	/* Command ID of the submitted AER */
270 };
271 
272 /** Asynchronous Event Information - Notice */
273 typedef enum {
274 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
275 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
276 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
277 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
278 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
279 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
280 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
281 	PCI_NVME_AEI_NOTICE_MAX,
282 } pci_nvme_async_event_info_notice;
283 
284 #define PCI_NVME_AEI_NOTICE_SHIFT		8
285 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << ((event) + PCI_NVME_AEI_NOTICE_SHIFT))
286 
287 /* Asynchronous Event Notifications */
288 struct pci_nvme_aen {
289 	pci_nvme_async_type atype;
290 	uint32_t	event_data;
291 	bool		posted;
292 };
293 
294 /*
295  * By default, enable all Asynchronous Event Notifications:
296  *     SMART / Health Critical Warnings
297  *     Namespace Attribute Notices
298  */
299 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
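
/*
 * Decoded, 0x11f enables the five SMART / Health critical warning bits
 * (bits 4:0) plus the Namespace Attribute Changed notice, i.e. bit
 * (PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED + PCI_NVME_AEI_NOTICE_SHIFT) = 8.
 */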
300 
301 typedef enum {
302 	NVME_CNTRLTYPE_IO = 1,
303 	NVME_CNTRLTYPE_DISCOVERY = 2,
304 	NVME_CNTRLTYPE_ADMIN = 3,
305 } pci_nvme_cntrl_type;
306 
307 struct pci_nvme_softc {
308 	struct pci_devinst *nsc_pi;
309 
310 	pthread_mutex_t	mtx;
311 
312 	struct nvme_registers regs;
313 
314 	struct nvme_namespace_data  nsdata;
315 	struct nvme_controller_data ctrldata;
316 	struct nvme_error_information_entry err_log;
317 	struct nvme_health_information_page health_log;
318 	struct nvme_firmware_page fw_log;
319 	struct nvme_ns_list ns_log;
320 
321 	struct pci_nvme_blockstore nvstore;
322 
323 	uint16_t	max_qentries;	/* max entries per queue */
324 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
325 	uint32_t	num_cqueues;
326 	uint32_t	num_squeues;
327 	bool		num_q_is_set; /* Has host set Number of Queues */
328 
329 	struct pci_nvme_ioreq *ioreqs;
330 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
331 	uint32_t	pending_ios;
332 	uint32_t	ioslots;
333 	sem_t		iosemlock;
334 
335 	/*
336 	 * Memory mapped Submission and Completion queues
337 	 * Each array includes both Admin and IO queues
338 	 */
339 	struct nvme_completion_queue *compl_queues;
340 	struct nvme_submission_queue *submit_queues;
341 
342 	struct nvme_feature_obj feat[NVME_FID_MAX];
343 
344 	enum nvme_dsm_type dataset_management;
345 
346 	/* Accounting for SMART data */
347 	__uint128_t	read_data_units;
348 	__uint128_t	write_data_units;
349 	__uint128_t	read_commands;
350 	__uint128_t	write_commands;
351 	uint32_t	read_dunits_remainder;
352 	uint32_t	write_dunits_remainder;
353 
354 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
355 	pthread_mutex_t	aer_mtx;
356 	uint32_t	aer_count;
357 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
358 	pthread_t	aen_tid;
359 	pthread_mutex_t	aen_mtx;
360 	pthread_cond_t	aen_cond;
361 };
362 
363 
364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
365     struct nvme_completion_queue *cq,
366     uint32_t cdw0,
367     uint16_t cid,
368     uint16_t sqid,
369     uint16_t status);
370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
372 static void pci_nvme_io_done(struct blockif_req *, int);
373 
374 /* Controller Configuration utils */
375 #define	NVME_CC_GET_EN(cc) \
376 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
377 #define	NVME_CC_GET_CSS(cc) \
378 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
379 #define	NVME_CC_GET_SHN(cc) \
380 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
381 #define	NVME_CC_GET_IOSQES(cc) \
382 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
383 #define	NVME_CC_GET_IOCQES(cc) \
384 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
385 
386 #define	NVME_CC_WRITE_MASK \
387 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
388 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
389 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
390 
391 #define	NVME_CC_NEN_WRITE_MASK \
392 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
393 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
394 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
395 
396 /* Controller Status utils */
397 #define	NVME_CSTS_GET_RDY(sts) \
398 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
399 
400 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
401 #define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)
402 
403 /* Completion Queue status word utils */
404 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
405 #define	NVME_STATUS_MASK \
406 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
407 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
408 
409 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
410 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
411 
412 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
413     struct nvme_feature_obj *,
414     struct nvme_command *,
415     struct nvme_completion *);
416 static void nvme_feature_temperature(struct pci_nvme_softc *,
417     struct nvme_feature_obj *,
418     struct nvme_command *,
419     struct nvme_completion *);
420 static void nvme_feature_num_queues(struct pci_nvme_softc *,
421     struct nvme_feature_obj *,
422     struct nvme_command *,
423     struct nvme_completion *);
424 static void nvme_feature_iv_config(struct pci_nvme_softc *,
425     struct nvme_feature_obj *,
426     struct nvme_command *,
427     struct nvme_completion *);
428 static void nvme_feature_async_event(struct pci_nvme_softc *,
429     struct nvme_feature_obj *,
430     struct nvme_command *,
431     struct nvme_completion *);
432 
433 static void *aen_thr(void *arg);
434 
435 static __inline void
436 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
437 {
438 	size_t len;
439 
440 	len = strnlen(src, dst_size);
441 	memset(dst, pad, dst_size);
442 	memcpy(dst, src, len);
443 }
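
/*
 * NVMe identify strings are fixed-width ASCII fields padded with spaces.
 * For example, cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ')
 * copies the model name and fills the rest of mn[] with ' '.
 */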
444 
445 static __inline void
446 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
447 {
448 
449 	*status &= ~NVME_STATUS_MASK;
450 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
451 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
452 }
453 
454 static __inline void
455 pci_nvme_status_genc(uint16_t *status, uint16_t code)
456 {
457 
458 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
459 }
460 
461 /*
462  * Initialize the requested number of IO Submission and Completion Queues.
463  * Admin queues are allocated implicitly.
464  */
465 static void
466 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
467 {
468 	uint32_t i;
469 
470 	/*
471 	 * Allocate and initialize the Submission Queues
472 	 */
473 	if (nsq > NVME_QUEUES) {
474 		WPRINTF("%s: clamping number of SQ from %u to %u",
475 					__func__, nsq, NVME_QUEUES);
476 		nsq = NVME_QUEUES;
477 	}
478 
479 	sc->num_squeues = nsq;
480 
481 	sc->submit_queues = calloc(sc->num_squeues + 1,
482 				sizeof(struct nvme_submission_queue));
483 	if (sc->submit_queues == NULL) {
484 		WPRINTF("%s: SQ allocation failed", __func__);
485 		sc->num_squeues = 0;
486 	} else {
487 		struct nvme_submission_queue *sq = sc->submit_queues;
488 
489 		for (i = 0; i < sc->num_squeues + 1; i++)
490 			pthread_mutex_init(&sq[i].mtx, NULL);
491 	}
492 
493 	/*
494 	 * Allocate and initialize the Completion Queues
495 	 */
496 	if (ncq > NVME_QUEUES) {
497 		WPRINTF("%s: clamping number of CQ from %u to %u",
498 					__func__, ncq, NVME_QUEUES);
499 		ncq = NVME_QUEUES;
500 	}
501 
502 	sc->num_cqueues = ncq;
503 
504 	sc->compl_queues = calloc(sc->num_cqueues + 1,
505 				sizeof(struct nvme_completion_queue));
506 	if (sc->compl_queues == NULL) {
507 		WPRINTF("%s: CQ allocation failed", __func__);
508 		sc->num_cqueues = 0;
509 	} else {
510 		struct nvme_completion_queue *cq = sc->compl_queues;
511 
512 		for (i = 0; i < sc->num_cqueues + 1; i++)
513 			pthread_mutex_init(&cq[i].mtx, NULL);
514 	}
515 }
516 
517 static void
518 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
519 {
520 	struct nvme_controller_data *cd = &sc->ctrldata;
521 
522 	cd->vid = 0xFB5D;
523 	cd->ssvid = 0x0000;
524 
525 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
526 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
527 
528 	/* Recommended Arbitration Burst: number of commands we can handle at a time (2^rab) */
529 	cd->rab   = 4;
530 
531 	/* FreeBSD OUI */
532 	cd->ieee[0] = 0x58;
533 	cd->ieee[1] = 0x9c;
534 	cd->ieee[2] = 0xfc;
535 
536 	cd->mic = 0;
537 
538 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
539 
540 	cd->ver = NVME_REV(1,4);
541 
542 	cd->cntrltype = NVME_CNTRLTYPE_IO;
543 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
544 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
545 	cd->acl = 2;
546 	cd->aerl = 4;
547 
548 	/* Advertise a single, read-only firmware slot */
549 	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
550 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
551 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
552 	cd->elpe = 0;	/* max error log page entries */
553 	/*
554 	 * Report a single power state (zero-based value)
555 	 * power_state[] values are left as zero to indicate "Not reported"
556 	 */
557 	cd->npss = 0;
558 
559 	/* Warning Composite Temperature Threshold */
560 	cd->wctemp = 0x0157;
561 	cd->cctemp = 0x0157;
562 
563 	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
564 	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
565 			NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
566 
567 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
568 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
569 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
570 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
571 	cd->nn = 1;	/* number of namespaces */
572 
573 	cd->oncs = 0;
574 	switch (sc->dataset_management) {
575 	case NVME_DATASET_MANAGEMENT_AUTO:
576 		if (sc->nvstore.deallocate)
577 			cd->oncs |= NVME_ONCS_DSM;
578 		break;
579 	case NVME_DATASET_MANAGEMENT_ENABLE:
580 		cd->oncs |= NVME_ONCS_DSM;
581 		break;
582 	default:
583 		break;
584 	}
585 
586 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
587 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
588 
589 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
590 }
591 
592 /*
593  * Calculate the CRC-16 of the given buffer
594  * See copyright attribution at top of file
595  */
596 static uint16_t
597 crc16(uint16_t crc, const void *buffer, unsigned int len)
598 {
599 	const unsigned char *cp = buffer;
600 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
601 	static uint16_t const crc16_table[256] = {
602 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
603 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
604 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
605 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
606 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
607 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
608 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
609 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
610 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
611 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
612 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
613 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
614 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
615 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
616 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
617 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
618 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
619 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
620 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
621 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
622 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
623 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
624 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
625 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
626 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
627 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
628 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
629 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
630 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
631 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
632 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
633 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
634 	};
635 
636 	while (len--)
637 		crc = (((crc >> 8) & 0xffU) ^
638 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
639 	return crc;
640 }
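
/*
 * Example use: crc16(0, buf, len) computes the CRC-16 of a buffer starting
 * from a zero seed. Below it folds a VM-unique string (VM name plus PCI
 * bus/slot/function) into the low 16 bits of a generated EUI-64.
 */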
641 
642 static void
643 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
644     struct nvme_namespace_data *nd)
645 {
646 
647 	/* Get capacity and block size information from backing store */
648 	nd->nsze = nvstore->size / nvstore->sectsz;
649 	nd->ncap = nd->nsze;
650 	nd->nuse = nd->nsze;
651 }
652 
653 static void
654 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
655     struct nvme_namespace_data *nd, uint32_t nsid,
656     struct pci_nvme_blockstore *nvstore)
657 {
658 
659 	pci_nvme_init_nsdata_size(nvstore, nd);
660 
661 	if (nvstore->type == NVME_STOR_BLOCKIF)
662 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
663 
664 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
665 	nd->flbas = 0;
666 
667 	/* Create an EUI-64 if user did not provide one */
668 	if (nvstore->eui64 == 0) {
669 		char *data = NULL;
670 		uint64_t eui64 = nvstore->eui64;
671 
672 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
673 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
674 		    sc->nsc_pi->pi_func);
675 
676 		if (data != NULL) {
677 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
678 			free(data);
679 		}
680 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
681 	}
682 	be64enc(nd->eui64, nvstore->eui64);
683 
684 	/* LBA data-sz = 2^lbads */
685 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
686 }
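
/*
 * For example, a backing store with 512-byte sectors has sectsz_bits == 9,
 * so lbaf[0] advertises LBADS = 9, i.e. an LBA data size of 2^9 = 512 bytes.
 */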
687 
688 static void
689 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
690 {
691 	__uint128_t power_cycles = 1;
692 
693 	memset(&sc->err_log, 0, sizeof(sc->err_log));
694 	memset(&sc->health_log, 0, sizeof(sc->health_log));
695 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
696 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
697 
698 	/* Set read/write remainder to round up according to spec */
699 	sc->read_dunits_remainder = 999;
700 	sc->write_dunits_remainder = 999;
701 
702 	/* Set nominal Health values checked by implementations */
703 	sc->health_log.temperature = NVME_TEMPERATURE;
704 	sc->health_log.available_spare = 100;
705 	sc->health_log.available_spare_threshold = 10;
706 
707 	/* Set Active Firmware Info to slot 1 */
708 	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
709 	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
710 	    sizeof(sc->fw_log.revision[0]));
711 
712 	memcpy(&sc->health_log.power_cycles, &power_cycles,
713 	    sizeof(sc->health_log.power_cycles));
714 }
715 
716 static void
717 pci_nvme_init_features(struct pci_nvme_softc *sc)
718 {
719 	enum nvme_feature	fid;
720 
721 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
722 		switch (fid) {
723 		case NVME_FEAT_ARBITRATION:
724 		case NVME_FEAT_POWER_MANAGEMENT:
725 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
726 		case NVME_FEAT_WRITE_ATOMICITY:
727 			/* Mandatory but no special handling required */
728 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
729 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
730 		//		  this returns a data buffer
731 			break;
732 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
733 			sc->feat[fid].set = nvme_feature_temperature;
734 			break;
735 		case NVME_FEAT_ERROR_RECOVERY:
736 			sc->feat[fid].namespace_specific = true;
737 			break;
738 		case NVME_FEAT_NUMBER_OF_QUEUES:
739 			sc->feat[fid].set = nvme_feature_num_queues;
740 			break;
741 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
742 			sc->feat[fid].set = nvme_feature_iv_config;
743 			break;
744 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
745 			sc->feat[fid].set = nvme_feature_async_event;
746 			/* Enable all AENs by default */
747 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
748 			break;
749 		default:
750 			sc->feat[fid].set = nvme_feature_invalid_cb;
751 			sc->feat[fid].get = nvme_feature_invalid_cb;
752 		}
753 	}
754 }
755 
756 static void
757 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
758 {
759 
760 	STAILQ_INIT(&sc->aer_list);
761 	sc->aer_count = 0;
762 }
763 
764 static void
765 pci_nvme_aer_init(struct pci_nvme_softc *sc)
766 {
767 
768 	pthread_mutex_init(&sc->aer_mtx, NULL);
769 	pci_nvme_aer_reset(sc);
770 }
771 
772 static void
773 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
774 {
775 	struct pci_nvme_aer *aer = NULL;
776 
777 	pthread_mutex_lock(&sc->aer_mtx);
778 	while (!STAILQ_EMPTY(&sc->aer_list)) {
779 		aer = STAILQ_FIRST(&sc->aer_list);
780 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
781 		free(aer);
782 	}
783 	pthread_mutex_unlock(&sc->aer_mtx);
784 
785 	pci_nvme_aer_reset(sc);
786 }
787 
788 static bool
789 pci_nvme_aer_available(struct pci_nvme_softc *sc)
790 {
791 
792 	return (sc->aer_count != 0);
793 }
794 
795 static bool
796 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
797 {
798 	struct nvme_controller_data *cd = &sc->ctrldata;
799 
800 	/* AERL is a zero-based value while aer_count is one-based */
801 	return (sc->aer_count == (cd->aerl + 1U));
802 }
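
/*
 * With the AERL of 4 advertised in the controller data, the host may keep
 * up to 5 Asynchronous Event Requests outstanding before the limit trips.
 */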
803 
804 /*
805  * Add an Async Event Request
806  *
807  * Stores an AER to be returned later if the Controller needs to notify the
808  * host of an event.
809  * Note that while the NVMe spec doesn't require Controllers to return AER's
810  * in order, this implementation does preserve the order.
811  */
812 static int
813 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
814 {
815 	struct pci_nvme_aer *aer = NULL;
816 
817 	aer = calloc(1, sizeof(struct pci_nvme_aer));
818 	if (aer == NULL)
819 		return (-1);
820 
821 	/* Save the Command ID for use in the completion message */
822 	aer->cid = cid;
823 
824 	pthread_mutex_lock(&sc->aer_mtx);
825 	sc->aer_count++;
826 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
827 	pthread_mutex_unlock(&sc->aer_mtx);
828 
829 	return (0);
830 }
831 
832 /*
833  * Get an Async Event Request structure
834  *
835  * Returns a pointer to an AER previously submitted by the host or NULL if
836  * no AER's exist. Caller is responsible for freeing the returned struct.
837  */
838 static struct pci_nvme_aer *
839 pci_nvme_aer_get(struct pci_nvme_softc *sc)
840 {
841 	struct pci_nvme_aer *aer = NULL;
842 
843 	pthread_mutex_lock(&sc->aer_mtx);
844 	aer = STAILQ_FIRST(&sc->aer_list);
845 	if (aer != NULL) {
846 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
847 		sc->aer_count--;
848 	}
849 	pthread_mutex_unlock(&sc->aer_mtx);
850 
851 	return (aer);
852 }
853 
854 static void
855 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
856 {
857 	uint32_t	atype;
858 
859 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
860 
861 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
862 		sc->aen[atype].atype = atype;
863 	}
864 }
865 
866 static void
867 pci_nvme_aen_init(struct pci_nvme_softc *sc)
868 {
869 	char nstr[80];
870 
871 	pci_nvme_aen_reset(sc);
872 
873 	pthread_mutex_init(&sc->aen_mtx, NULL);
874 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
875 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
876 	    sc->nsc_pi->pi_func);
877 	pthread_set_name_np(sc->aen_tid, nstr);
878 }
879 
880 static void
881 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
882 {
883 
884 	pci_nvme_aen_reset(sc);
885 }
886 
887 /* Notify the AEN thread of pending work */
888 static void
889 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
890 {
891 
892 	pthread_cond_signal(&sc->aen_cond);
893 }
894 
895 /*
896  * Post an Asynchronous Event Notification
897  */
898 static int32_t
899 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
900 		uint32_t event_data)
901 {
902 	struct pci_nvme_aen *aen;
903 
904 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
905 		return(EINVAL);
906 	}
907 
908 	pthread_mutex_lock(&sc->aen_mtx);
909 	aen = &sc->aen[atype];
910 
911 	/* Has the controller already posted an event of this type? */
912 	if (aen->posted) {
913 		pthread_mutex_unlock(&sc->aen_mtx);
914 		return(EALREADY);
915 	}
916 
917 	aen->event_data = event_data;
918 	aen->posted = true;
919 	pthread_mutex_unlock(&sc->aen_mtx);
920 
921 	pci_nvme_aen_notify(sc);
922 
923 	return(0);
924 }
925 
926 static void
927 pci_nvme_aen_process(struct pci_nvme_softc *sc)
928 {
929 	struct pci_nvme_aer *aer;
930 	struct pci_nvme_aen *aen;
931 	pci_nvme_async_type atype;
932 	uint32_t mask;
933 	uint16_t status;
934 	uint8_t lid;
935 
936 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
937 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
938 		aen = &sc->aen[atype];
939 		/* Previous iterations may have depleted the available AER's */
940 		if (!pci_nvme_aer_available(sc)) {
941 			DPRINTF("%s: no AER", __func__);
942 			break;
943 		}
944 
945 		if (!aen->posted) {
946 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
947 			continue;
948 		}
949 
950 		status = NVME_SC_SUCCESS;
951 
952 		/* Is the event masked? */
953 		mask =
954 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
955 
956 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
957 		switch (atype) {
958 		case PCI_NVME_AE_TYPE_ERROR:
959 			lid = NVME_LOG_ERROR;
960 			break;
961 		case PCI_NVME_AE_TYPE_SMART:
962 			mask &= 0xff;
963 			if ((mask & aen->event_data) == 0)
964 				continue;
965 			lid = NVME_LOG_HEALTH_INFORMATION;
966 			break;
967 		case PCI_NVME_AE_TYPE_NOTICE:
968 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
969 				EPRINTLN("%s unknown AEN notice type %u",
970 				    __func__, aen->event_data);
971 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
972 				lid = 0;
973 				break;
974 			}
975 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
976 				continue;
977 			switch (aen->event_data) {
978 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
979 				lid = NVME_LOG_CHANGED_NAMESPACE;
980 				break;
981 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
982 				lid = NVME_LOG_FIRMWARE_SLOT;
983 				break;
984 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
985 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
986 				break;
987 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
988 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
989 				break;
990 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
991 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
992 				break;
993 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
994 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
995 				break;
996 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
997 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
998 				break;
999 			default:
1000 				lid = 0;
1001 			}
1002 			break;
1003 		default:
1004 			/* bad type?!? */
1005 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
1006 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
1007 			lid = 0;
1008 			break;
1009 		}
1010 
1011 		aer = pci_nvme_aer_get(sc);
1012 		assert(aer != NULL);
1013 
1014 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
1015 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
1016 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1017 		    aer->cid,
1018 		    0,		/* SQID */
1019 		    status);
1020 
1021 		aen->event_data = 0;
1022 		aen->posted = false;
1023 
1024 		pci_generate_msix(sc->nsc_pi, 0);
1025 	}
1026 }
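
/*
 * Completion Dword 0 for an AER packs the Log Page Identifier into bits
 * 23:16, the Asynchronous Event Information into bits 15:8, and the Event
 * Type into the low bits. For example, a Namespace Attribute Changed notice
 * completes with (NVME_LOG_CHANGED_NAMESPACE << 16) | (0 << 8) |
 * PCI_NVME_AE_TYPE_NOTICE.
 */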
1027 
1028 static void *
1029 aen_thr(void *arg)
1030 {
1031 	struct pci_nvme_softc *sc;
1032 
1033 	sc = arg;
1034 
1035 	pthread_mutex_lock(&sc->aen_mtx);
1036 	for (;;) {
1037 		pci_nvme_aen_process(sc);
1038 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1039 	}
1040 	pthread_mutex_unlock(&sc->aen_mtx);
1041 
1042 	pthread_exit(NULL);
1043 	return (NULL);
1044 }
1045 
1046 static void
1047 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1048 {
1049 	uint32_t i;
1050 
1051 	DPRINTF("%s", __func__);
1052 
1053 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1054 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1055 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1056 
1057 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1058 
1059 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1060 
1061 	sc->regs.cc = 0;
1062 
1063 	assert(sc->submit_queues != NULL);
1064 
1065 	for (i = 0; i < sc->num_squeues + 1; i++) {
1066 		sc->submit_queues[i].qbase = NULL;
1067 		sc->submit_queues[i].size = 0;
1068 		sc->submit_queues[i].cqid = 0;
1069 		sc->submit_queues[i].tail = 0;
1070 		sc->submit_queues[i].head = 0;
1071 	}
1072 
1073 	assert(sc->compl_queues != NULL);
1074 
1075 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1076 		sc->compl_queues[i].qbase = NULL;
1077 		sc->compl_queues[i].size = 0;
1078 		sc->compl_queues[i].tail = 0;
1079 		sc->compl_queues[i].head = 0;
1080 	}
1081 
1082 	sc->num_q_is_set = false;
1083 
1084 	pci_nvme_aer_destroy(sc);
1085 	pci_nvme_aen_destroy(sc);
1086 
1087 	/*
1088 	 * Clear CSTS.RDY last to prevent the host from enabling Controller
1089 	 * before cleanup completes
1090 	 */
1091 	sc->regs.csts = 0;
1092 }
1093 
1094 static void
1095 pci_nvme_reset(struct pci_nvme_softc *sc)
1096 {
1097 	pthread_mutex_lock(&sc->mtx);
1098 	pci_nvme_reset_locked(sc);
1099 	pthread_mutex_unlock(&sc->mtx);
1100 }
1101 
1102 static int
1103 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1104 {
1105 	uint16_t acqs, asqs;
1106 
1107 	DPRINTF("%s", __func__);
1108 
1109 	/*
1110 	 * NVMe 2.0 states that "enabling a controller while this field is
1111 	 * cleared to 0h produces undefined results" for both ACQS and
1112 	 * ASQS. If zero, set CFS and do not become ready.
1113 	 */
1114 	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
1115 	if (asqs < 2) {
1116 		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
1117 		    asqs - 1, sc->regs.aqa);
1118 		sc->regs.csts |= NVME_CSTS_CFS;
1119 		return (-1);
1120 	}
1121 	sc->submit_queues[0].size = asqs;
1122 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1123 	            sizeof(struct nvme_command) * asqs);
1124 	if (sc->submit_queues[0].qbase == NULL) {
1125 		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
1126 		    sc->regs.asq);
1127 		sc->regs.csts |= NVME_CSTS_CFS;
1128 		return (-1);
1129 	}
1130 
1131 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1132 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1133 
1134 	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1135 	    NVME_AQA_REG_ACQS_MASK);
1136 	if (acqs < 2) {
1137 		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
1138 		    acqs - 1, sc->regs.aqa);
1139 		sc->regs.csts |= NVME_CSTS_CFS;
1140 		return (-1);
1141 	}
1142 	sc->compl_queues[0].size = acqs;
1143 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1144 	         sizeof(struct nvme_completion) * acqs);
1145 	if (sc->compl_queues[0].qbase == NULL) {
1146 		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
1147 		    sc->regs.acq);
1148 		sc->regs.csts |= NVME_CSTS_CFS;
1149 		return (-1);
1150 	}
1151 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1152 
1153 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1154 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1155 
1156 	return (0);
1157 }
1158 
1159 static int
1160 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1161 	size_t len, enum nvme_copy_dir dir)
1162 {
1163 	uint8_t *p;
1164 	size_t bytes;
1165 
1166 	if (len > (8 * 1024)) {
1167 		return (-1);
1168 	}
1169 
1170 	/* Copy from the start of prp1 to the end of the physical page */
1171 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1172 	bytes = MIN(bytes, len);
1173 
1174 	p = vm_map_gpa(ctx, prp1, bytes);
1175 	if (p == NULL) {
1176 		return (-1);
1177 	}
1178 
1179 	if (dir == NVME_COPY_TO_PRP)
1180 		memcpy(p, b, bytes);
1181 	else
1182 		memcpy(b, p, bytes);
1183 
1184 	b += bytes;
1185 
1186 	len -= bytes;
1187 	if (len == 0) {
1188 		return (0);
1189 	}
1190 
1191 	len = MIN(len, PAGE_SIZE);
1192 
1193 	p = vm_map_gpa(ctx, prp2, len);
1194 	if (p == NULL) {
1195 		return (-1);
1196 	}
1197 
1198 	if (dir == NVME_COPY_TO_PRP)
1199 		memcpy(p, b, len);
1200 	else
1201 		memcpy(b, p, len);
1202 
1203 	return (0);
1204 }
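
/*
 * Example (assuming 4 KiB pages): a 4 KiB copy with prp1 pointing 0x800
 * bytes into a page takes the first 0x800 bytes from the tail of the prp1
 * page and the remaining 0x800 bytes from the start of the prp2 page; a
 * copy that fits entirely within the prp1 page never dereferences prp2.
 */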
1205 
1206 /*
1207  * Write a Completion Queue Entry update
1208  *
1209  * Write the completion and update the doorbell value
1210  */
1211 static void
1212 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1213 		struct nvme_completion_queue *cq,
1214 		uint32_t cdw0,
1215 		uint16_t cid,
1216 		uint16_t sqid,
1217 		uint16_t status)
1218 {
1219 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1220 	struct nvme_completion *cqe;
1221 
1222 	assert(cq->qbase != NULL);
1223 
1224 	pthread_mutex_lock(&cq->mtx);
1225 
1226 	cqe = &cq->qbase[cq->tail];
1227 
1228 	/* Flip the phase bit */
1229 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1230 
1231 	cqe->cdw0 = cdw0;
1232 	cqe->sqhd = sq->head;
1233 	cqe->sqid = sqid;
1234 	cqe->cid = cid;
1235 	cqe->status = status;
1236 
1237 	cq->tail++;
1238 	if (cq->tail >= cq->size) {
1239 		cq->tail = 0;
1240 	}
1241 
1242 	pthread_mutex_unlock(&cq->mtx);
1243 }
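
/*
 * Phase bit example: the guest is expected to zero the CQ memory, so the
 * first pass through the queue posts completions with the Phase Tag set to
 * 1; once the tail wraps, subsequent passes post entries with the tag
 * cleared, which is how the guest distinguishes new completions from stale
 * ones.
 */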
1244 
1245 static int
1246 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1247 	struct nvme_completion* compl)
1248 {
1249 	uint16_t qid = command->cdw10 & 0xffff;
1250 
1251 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1252 	if (qid == 0 || qid > sc->num_squeues ||
1253 	    (sc->submit_queues[qid].qbase == NULL)) {
1254 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1255 		        __func__, qid, sc->num_squeues);
1256 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1257 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1258 		return (1);
1259 	}
1260 
1261 	sc->submit_queues[qid].qbase = NULL;
1262 	sc->submit_queues[qid].cqid = 0;
1263 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1264 	return (1);
1265 }
1266 
1267 static int
1268 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1269 	struct nvme_completion* compl)
1270 {
1271 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1272 		uint16_t qid = command->cdw10 & 0xffff;
1273 		struct nvme_submission_queue *nsq;
1274 
1275 		if ((qid == 0) || (qid > sc->num_squeues) ||
1276 		    (sc->submit_queues[qid].qbase != NULL)) {
1277 			WPRINTF("%s queue index %u > num_squeues %u",
1278 			        __func__, qid, sc->num_squeues);
1279 			pci_nvme_status_tc(&compl->status,
1280 			    NVME_SCT_COMMAND_SPECIFIC,
1281 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1282 			return (1);
1283 		}
1284 
1285 		nsq = &sc->submit_queues[qid];
1286 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1287 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1288 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1289 			/*
1290 			 * Queues must specify at least two entries
1291 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1292 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1293 			 */
1294 			pci_nvme_status_tc(&compl->status,
1295 			    NVME_SCT_COMMAND_SPECIFIC,
1296 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1297 			return (1);
1298 		}
1299 		nsq->head = nsq->tail = 0;
1300 
1301 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1302 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1303 			pci_nvme_status_tc(&compl->status,
1304 			    NVME_SCT_COMMAND_SPECIFIC,
1305 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1306 			return (1);
1307 		}
1308 
1309 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1310 			pci_nvme_status_tc(&compl->status,
1311 			    NVME_SCT_COMMAND_SPECIFIC,
1312 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1313 			return (1);
1314 		}
1315 
1316 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1317 
1318 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1319 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1320 
1321 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1322 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1323 
1324 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1325 
1326 		DPRINTF("%s completed creating IOSQ qid %u",
1327 		         __func__, qid);
1328 	} else {
1329 		/*
1330 		 * Guest sent a non-contiguous submission queue request.
1331 		 * This setting is unsupported by this emulation.
1332 		 */
1333 		WPRINTF("%s unsupported non-contig (list-based) "
1334 		         "create i/o submission queue", __func__);
1335 
1336 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1337 	}
1338 	return (1);
1339 }
1340 
1341 static int
1342 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1343 	struct nvme_completion* compl)
1344 {
1345 	uint16_t qid = command->cdw10 & 0xffff;
1346 	uint16_t sqid;
1347 
1348 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1349 	if (qid == 0 || qid > sc->num_cqueues ||
1350 	    (sc->compl_queues[qid].qbase == NULL)) {
1351 		WPRINTF("%s queue index %u / num_cqueues %u",
1352 		        __func__, qid, sc->num_cqueues);
1353 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1354 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1355 		return (1);
1356 	}
1357 
1358 	/* Deleting an Active CQ is an error */
1359 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1360 		if (sc->submit_queues[sqid].cqid == qid) {
1361 			pci_nvme_status_tc(&compl->status,
1362 			    NVME_SCT_COMMAND_SPECIFIC,
1363 			    NVME_SC_INVALID_QUEUE_DELETION);
1364 			return (1);
1365 		}
1366 
1367 	sc->compl_queues[qid].qbase = NULL;
1368 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1369 	return (1);
1370 }
1371 
1372 static int
1373 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1374 	struct nvme_completion* compl)
1375 {
1376 	struct nvme_completion_queue *ncq;
1377 	uint16_t qid = command->cdw10 & 0xffff;
1378 
1379 	/* Only support Physically Contiguous queues */
1380 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1381 		WPRINTF("%s unsupported non-contig (list-based) "
1382 		         "create i/o completion queue",
1383 		         __func__);
1384 
1385 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1386 		return (1);
1387 	}
1388 
1389 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1390 	    (sc->compl_queues[qid].qbase != NULL)) {
1391 		WPRINTF("%s queue index %u > num_cqueues %u",
1392 			__func__, qid, sc->num_cqueues);
1393 		pci_nvme_status_tc(&compl->status,
1394 		    NVME_SCT_COMMAND_SPECIFIC,
1395 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1396 		return (1);
1397  	}
1398 
1399 	ncq = &sc->compl_queues[qid];
1400 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1401 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1402 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1403 		pci_nvme_status_tc(&compl->status,
1404 		    NVME_SCT_COMMAND_SPECIFIC,
1405 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1406 		return (1);
1407 	}
1408 
1409 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1410 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1411 		/*
1412 		 * Queues must specify at least two entries
1413 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1414 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1415 		 */
1416 		pci_nvme_status_tc(&compl->status,
1417 		    NVME_SCT_COMMAND_SPECIFIC,
1418 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1419 		return (1);
1420 	}
1421 	ncq->head = ncq->tail = 0;
1422 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1423 		     command->prp1,
1424 		     sizeof(struct nvme_completion) * (size_t)ncq->size);
1425 
1426 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1427 
1428 
1429 	return (1);
1430 }
1431 
1432 static int
1433 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1434 	struct nvme_completion* compl)
1435 {
1436 	uint64_t logoff;
1437 	uint32_t logsize;
1438 	uint8_t logpage;
1439 
1440 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1441 
1442 	/*
1443 	 * Command specifies the number of dwords to return in fields NUMDU
1444 	 * and NUMDL. This is a zero-based value.
1445 	 */
1446 	logpage = command->cdw10 & 0xFF;
1447 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1448 	logsize *= sizeof(uint32_t);
1449 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1450 
1451 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1452 
1453 	switch (logpage) {
1454 	case NVME_LOG_ERROR:
1455 		if (logoff >= sizeof(sc->err_log)) {
1456 			pci_nvme_status_genc(&compl->status,
1457 			    NVME_SC_INVALID_FIELD);
1458 			break;
1459 		}
1460 
1461 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1462 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1463 		    MIN(logsize, sizeof(sc->err_log) - logoff),
1464 		    NVME_COPY_TO_PRP);
1465 		break;
1466 	case NVME_LOG_HEALTH_INFORMATION:
1467 		if (logoff >= sizeof(sc->health_log)) {
1468 			pci_nvme_status_genc(&compl->status,
1469 			    NVME_SC_INVALID_FIELD);
1470 			break;
1471 		}
1472 
1473 		pthread_mutex_lock(&sc->mtx);
1474 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1475 		    sizeof(sc->health_log.data_units_read));
1476 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1477 		    sizeof(sc->health_log.data_units_written));
1478 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1479 		    sizeof(sc->health_log.host_read_commands));
1480 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1481 		    sizeof(sc->health_log.host_write_commands));
1482 		pthread_mutex_unlock(&sc->mtx);
1483 
1484 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1485 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1486 		    MIN(logsize, sizeof(sc->health_log) - logoff),
1487 		    NVME_COPY_TO_PRP);
1488 		break;
1489 	case NVME_LOG_FIRMWARE_SLOT:
1490 		if (logoff >= sizeof(sc->fw_log)) {
1491 			pci_nvme_status_genc(&compl->status,
1492 			    NVME_SC_INVALID_FIELD);
1493 			break;
1494 		}
1495 
1496 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1497 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1498 		    MIN(logsize, sizeof(sc->fw_log) - logoff),
1499 		    NVME_COPY_TO_PRP);
1500 		break;
1501 	case NVME_LOG_CHANGED_NAMESPACE:
1502 		if (logoff >= sizeof(sc->ns_log)) {
1503 			pci_nvme_status_genc(&compl->status,
1504 			    NVME_SC_INVALID_FIELD);
1505 			break;
1506 		}
1507 
1508 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1509 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1510 		    MIN(logsize, sizeof(sc->ns_log) - logoff),
1511 		    NVME_COPY_TO_PRP);
1512 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1513 		break;
1514 	default:
1515 		DPRINTF("%s get log page %x command not supported",
1516 		        __func__, logpage);
1517 
1518 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1519 		    NVME_SC_INVALID_LOG_PAGE);
1520 	}
1521 
1522 	return (1);
1523 }
1524 
1525 static int
1526 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1527 	struct nvme_completion* compl)
1528 {
1529 	void *dest;
1530 	uint16_t status;
1531 
1532 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1533 	        command->cdw10 & 0xFF, command->nsid);
1534 
1535 	status = 0;
1536 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1537 
1538 	switch (command->cdw10 & 0xFF) {
1539 	case 0x00: /* return Identify Namespace data structure */
1540 		/* Global NS only valid with NS Management */
1541 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1542 			pci_nvme_status_genc(&status,
1543 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1544 			break;
1545 		}
1546 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1547 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1548 		    NVME_COPY_TO_PRP);
1549 		break;
1550 	case 0x01: /* return Identify Controller data structure */
1551 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1552 		    command->prp2, (uint8_t *)&sc->ctrldata,
1553 		    sizeof(sc->ctrldata),
1554 		    NVME_COPY_TO_PRP);
1555 		break;
1556 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1557 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1558 		                  sizeof(uint32_t) * 1024);
1559 		/* All unused entries shall be zero */
1560 		memset(dest, 0, sizeof(uint32_t) * 1024);
1561 		((uint32_t *)dest)[0] = 1;
1562 		break;
1563 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1564 		if (command->nsid != 1) {
1565 			pci_nvme_status_genc(&status,
1566 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1567 			break;
1568 		}
1569 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1570 		                  sizeof(uint32_t) * 1024);
1571 		/* All bytes after the descriptor shall be zero */
1572 		memset(dest, 0, sizeof(uint32_t) * 1024);
1573 
1574 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1575 		((uint8_t *)dest)[0] = 1;
1576 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1577 		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
1578 		break;
1579 	case 0x13:
1580 		/*
1581 		 * Controller list is optional but used by UNH tests. Return
1582 		 * a valid but empty list.
1583 		 */
1584 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1585 		                  sizeof(uint16_t) * 2048);
1586 		memset(dest, 0, sizeof(uint16_t) * 2048);
1587 		break;
1588 	default:
1589 		DPRINTF("%s unsupported identify command requested 0x%x",
1590 		         __func__, command->cdw10 & 0xFF);
1591 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1592 		break;
1593 	}
1594 
1595 	compl->status = status;
1596 	return (1);
1597 }
1598 
1599 static const char *
1600 nvme_fid_to_name(uint8_t fid)
1601 {
1602 	const char *name;
1603 
1604 	switch (fid) {
1605 	case NVME_FEAT_ARBITRATION:
1606 		name = "Arbitration";
1607 		break;
1608 	case NVME_FEAT_POWER_MANAGEMENT:
1609 		name = "Power Management";
1610 		break;
1611 	case NVME_FEAT_LBA_RANGE_TYPE:
1612 		name = "LBA Range Type";
1613 		break;
1614 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1615 		name = "Temperature Threshold";
1616 		break;
1617 	case NVME_FEAT_ERROR_RECOVERY:
1618 		name = "Error Recovery";
1619 		break;
1620 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1621 		name = "Volatile Write Cache";
1622 		break;
1623 	case NVME_FEAT_NUMBER_OF_QUEUES:
1624 		name = "Number of Queues";
1625 		break;
1626 	case NVME_FEAT_INTERRUPT_COALESCING:
1627 		name = "Interrupt Coalescing";
1628 		break;
1629 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1630 		name = "Interrupt Vector Configuration";
1631 		break;
1632 	case NVME_FEAT_WRITE_ATOMICITY:
1633 		name = "Write Atomicity Normal";
1634 		break;
1635 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1636 		name = "Asynchronous Event Configuration";
1637 		break;
1638 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1639 		name = "Autonomous Power State Transition";
1640 		break;
1641 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1642 		name = "Host Memory Buffer";
1643 		break;
1644 	case NVME_FEAT_TIMESTAMP:
1645 		name = "Timestamp";
1646 		break;
1647 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1648 		name = "Keep Alive Timer";
1649 		break;
1650 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1651 		name = "Host Controlled Thermal Management";
1652 		break;
1653 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1654 		name = "Non-Operational Power State Config";
1655 		break;
1656 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1657 		name = "Read Recovery Level Config";
1658 		break;
1659 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1660 		name = "Predictable Latency Mode Config";
1661 		break;
1662 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1663 		name = "Predictable Latency Mode Window";
1664 		break;
1665 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1666 		name = "LBA Status Information Report Interval";
1667 		break;
1668 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1669 		name = "Host Behavior Support";
1670 		break;
1671 	case NVME_FEAT_SANITIZE_CONFIG:
1672 		name = "Sanitize Config";
1673 		break;
1674 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1675 		name = "Endurance Group Event Configuration";
1676 		break;
1677 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1678 		name = "Software Progress Marker";
1679 		break;
1680 	case NVME_FEAT_HOST_IDENTIFIER:
1681 		name = "Host Identifier";
1682 		break;
1683 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1684 		name = "Reservation Notification Mask";
1685 		break;
1686 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1687 		name = "Reservation Persistence";
1688 		break;
1689 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1690 		name = "Namespace Write Protection Config";
1691 		break;
1692 	default:
1693 		name = "Unknown";
1694 		break;
1695 	}
1696 
1697 	return (name);
1698 }
1699 
1700 static void
1701 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
1702     struct nvme_feature_obj *feat __unused,
1703     struct nvme_command *command __unused,
1704     struct nvme_completion *compl)
1705 {
1706 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1707 }
1708 
1709 static void
1710 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1711     struct nvme_feature_obj *feat __unused,
1712     struct nvme_command *command,
1713     struct nvme_completion *compl)
1714 {
1715 	uint32_t i;
1716 	uint32_t cdw11 = command->cdw11;
1717 	uint16_t iv;
1718 	bool cd;
1719 
1720 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1721 
1722 	iv = cdw11 & 0xffff;
1723 	cd = cdw11 & (1 << 16);
1724 
1725 	if (iv > (sc->max_queues + 1)) {
1726 		return;
1727 	}
1728 
1729 	/* Interrupt Coalescing may not be enabled (i.e. CD must be set) for the Admin Queue */
1730 	if ((iv == 0) && !cd)
1731 		return;
1732 
1733 	/* Requested Interrupt Vector must be used by a CQ */
1734 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1735 		if (sc->compl_queues[i].intr_vec == iv) {
1736 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1737 		}
1738 	}
1739 }
1740 
1741 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1742 static void
1743 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
1744     struct nvme_feature_obj *feat __unused,
1745     struct nvme_command *command,
1746     struct nvme_completion *compl)
1747 {
1748 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1749 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1750 }
1751 
1752 #define NVME_TEMP_THRESH_OVER	0
1753 #define NVME_TEMP_THRESH_UNDER	1
1754 static void
1755 nvme_feature_temperature(struct pci_nvme_softc *sc,
1756     struct nvme_feature_obj *feat __unused,
1757     struct nvme_command *command,
1758     struct nvme_completion *compl)
1759 {
1760 	uint16_t	tmpth;	/* Temperature Threshold */
1761 	uint8_t		tmpsel; /* Threshold Temperature Select */
1762 	uint8_t		thsel;  /* Threshold Type Select */
1763 	bool		set_crit = false;
1764 	bool		report_crit;
1765 
1766 	tmpth  = command->cdw11 & 0xffff;
1767 	tmpsel = (command->cdw11 >> 16) & 0xf;
1768 	thsel  = (command->cdw11 >> 20) & 0x3;
1769 
1770 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1771 
1772 	/* Check for unsupported values */
1773 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1774 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1775 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1776 		return;
1777 	}
1778 
1779 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1780 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1781 		set_crit = true;
1782 
1783 	pthread_mutex_lock(&sc->mtx);
1784 	if (set_crit)
1785 		sc->health_log.critical_warning |=
1786 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1787 	else
1788 		sc->health_log.critical_warning &=
1789 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1790 	pthread_mutex_unlock(&sc->mtx);
1791 
1792 	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
1793 	    NVME_CRIT_WARN_ST_TEMPERATURE;
1794 
1795 	if (set_crit && report_crit)
1796 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1797 		    sc->health_log.critical_warning);
1798 
1799 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1800 }
1801 
1802 static void
1803 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1804     struct nvme_feature_obj *feat __unused,
1805     struct nvme_command *command,
1806     struct nvme_completion *compl)
1807 {
1808 	uint16_t nqr;	/* Number of Queues Requested */
1809 
1810 	if (sc->num_q_is_set) {
1811 		WPRINTF("%s: Number of Queues already set", __func__);
1812 		pci_nvme_status_genc(&compl->status,
1813 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1814 		return;
1815 	}
1816 
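	/* NSQR (cdw11 bits 15:0) and NCQR (bits 31:16) are 0's based values */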
1817 	nqr = command->cdw11 & 0xFFFF;
1818 	if (nqr == 0xffff) {
1819 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1820 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1821 		return;
1822 	}
1823 
1824 	sc->num_squeues = ONE_BASED(nqr);
1825 	if (sc->num_squeues > sc->max_queues) {
1826 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1827 					sc->max_queues);
1828 		sc->num_squeues = sc->max_queues;
1829 	}
1830 
1831 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1832 	if (nqr == 0xffff) {
1833 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1834 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1835 		return;
1836 	}
1837 
1838 	sc->num_cqueues = ONE_BASED(nqr);
1839 	if (sc->num_cqueues > sc->max_queues) {
1840 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1841 					sc->max_queues);
1842 		sc->num_cqueues = sc->max_queues;
1843 	}
1844 
1845 	/* Patch the command value which will be saved on callback's return */
1846 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1847 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1848 
1849 	sc->num_q_is_set = true;
1850 }
1851 
1852 static int
1853 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1854 	struct nvme_completion *compl)
1855 {
1856 	struct nvme_feature_obj *feat;
1857 	uint32_t nsid = command->nsid;
1858 	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
1859 	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);
1860 
1861 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1862 
1863 	if (fid >= NVME_FID_MAX) {
1864 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1865 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1866 		return (1);
1867 	}
1868 
1869 	if (sv) {
1870 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1871 		    NVME_SC_FEATURE_NOT_SAVEABLE);
1872 		return (1);
1873 	}
1874 
1875 	feat = &sc->feat[fid];
1876 
1877 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1878 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1879 		return (1);
1880 	}
1881 
1882 	if (!feat->namespace_specific &&
1883 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1884 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1885 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1886 		return (1);
1887 	}
1888 
1889 	compl->cdw0 = 0;
1890 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1891 
1892 	if (feat->set)
1893 		feat->set(sc, feat, command, compl);
1894 	else {
1895 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1896 		    NVME_SC_FEATURE_NOT_CHANGEABLE);
1897 		return (1);
1898 	}
1899 
1900 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1901 	if (compl->status == NVME_SC_SUCCESS) {
1902 		feat->cdw11 = command->cdw11;
1903 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1904 		    (command->cdw11 != 0))
1905 			pci_nvme_aen_notify(sc);
1906 	}
1907 
1908 	return (0);
1909 }
1910 
1911 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1912 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1913 
1914 static int
1915 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1916 	struct nvme_completion* compl)
1917 {
1918 	struct nvme_feature_obj *feat;
1919 	uint8_t fid = command->cdw10 & 0xFF;
1920 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
1921 
1922 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1923 
1924 	if (fid >= NVME_FID_MAX) {
1925 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1926 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1927 		return (1);
1928 	}
1929 
1930 	compl->cdw0 = 0;
1931 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1932 
1933 	feat = &sc->feat[fid];
1934 	if (feat->get) {
1935 		feat->get(sc, feat, command, compl);
1936 	}
1937 
1938 	if (compl->status == NVME_SC_SUCCESS) {
1939 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1940 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1941 		else
1942 			compl->cdw0 = feat->cdw11;
1943 	}
1944 
1945 	return (0);
1946 }
1947 
1948 static int
1949 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1950 	struct nvme_completion* compl)
1951 {
1952 	uint8_t	ses, lbaf, pi;
1953 
1954 	/* Only supports Secure Erase Setting - User Data Erase */
1955 	ses = (command->cdw10 >> 9) & 0x7;
1956 	if (ses > 0x1) {
1957 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1958 		return (1);
1959 	}
1960 
1961 	/* Only supports a single LBA Format */
1962 	lbaf = command->cdw10 & 0xf;
1963 	if (lbaf != 0) {
1964 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1965 		    NVME_SC_INVALID_FORMAT);
1966 		return (1);
1967 	}
1968 
1969 	/* Doesn't support Protection Information */
1970 	pi = (command->cdw10 >> 5) & 0x7;
1971 	if (pi != 0) {
1972 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1973 		return (1);
1974 	}
1975 
1976 	if (sc->nvstore.type == NVME_STOR_RAM) {
1977 		if (sc->nvstore.ctx)
1978 			free(sc->nvstore.ctx);
1979 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1980 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1981 	} else {
1982 		struct pci_nvme_ioreq *req;
1983 		int err;
1984 
1985 		req = pci_nvme_get_ioreq(sc);
1986 		if (req == NULL) {
1987 			pci_nvme_status_genc(&compl->status,
1988 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1989 			WPRINTF("%s: unable to allocate IO req", __func__);
1990 			return (1);
1991 		}
1992 		req->nvme_sq = &sc->submit_queues[0];
1993 		req->sqid = 0;
1994 		req->opc = command->opc;
1995 		req->cid = command->cid;
1996 		req->nsid = command->nsid;
1997 
1998 		req->io_req.br_offset = 0;
1999 		req->io_req.br_resid = sc->nvstore.size;
2000 		req->io_req.br_callback = pci_nvme_io_done;
2001 
2002 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
2003 		if (err) {
2004 			pci_nvme_status_genc(&compl->status,
2005 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2006 			pci_nvme_release_ioreq(sc, req);
2007 		} else
2008 			compl->status = NVME_NO_STATUS;
2009 	}
2010 
2011 	return (1);
2012 }
2013 
2014 static int
2015 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
2016     struct nvme_completion *compl)
2017 {
2018 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
2019 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
2020 
2021 	/* TODO: search for the command ID and abort it */
2022 
2023 	compl->cdw0 = 1;
2024 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2025 	return (1);
2026 }
2027 
2028 static int
2029 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2030 	struct nvme_command* command, struct nvme_completion* compl)
2031 {
2032 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2033 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
2034 
2035 	/* Don't exceed the Async Event Request Limit (AERL). */
2036 	if (pci_nvme_aer_limit_reached(sc)) {
2037 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
2038 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
2039 		return (1);
2040 	}
2041 
2042 	if (pci_nvme_aer_add(sc, command->cid)) {
2043 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
2044 				NVME_SC_INTERNAL_DEVICE_ERROR);
2045 		return (1);
2046 	}
2047 
2048 	/*
2049 	 * Raise events as they occur, based on the Async Event Configuration
2050 	 * Set Features value. These events happen asynchronously, so only post
2051 	 * a successful completion when an event matching the request occurs.
2052 	 */
2053 	compl->status = NVME_NO_STATUS;
2054 	pci_nvme_aen_notify(sc);
2055 
2056 	return (0);
2057 }
2058 
2059 static void
2060 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2061 {
2062 	struct nvme_completion compl;
2063 	struct nvme_command *cmd;
2064 	struct nvme_submission_queue *sq;
2065 	struct nvme_completion_queue *cq;
2066 	uint16_t sqhead;
2067 
2068 	DPRINTF("%s index %u", __func__, (uint32_t)value);
2069 
2070 	sq = &sc->submit_queues[0];
2071 	cq = &sc->compl_queues[0];
2072 
2073 	pthread_mutex_lock(&sq->mtx);
2074 
2075 	sqhead = sq->head;
2076 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2077 
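	/*
	 * The guest advances sq->tail via the Admin SQ doorbell; consume and
	 * complete entries until the head catches up with the tail.
	 */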
2078 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2079 		cmd = &(sq->qbase)[sqhead];
2080 		compl.cdw0 = 0;
2081 		compl.status = 0;
2082 
2083 		switch (cmd->opc) {
2084 		case NVME_OPC_DELETE_IO_SQ:
2085 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2086 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2087 			break;
2088 		case NVME_OPC_CREATE_IO_SQ:
2089 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2090 			nvme_opc_create_io_sq(sc, cmd, &compl);
2091 			break;
2092 		case NVME_OPC_DELETE_IO_CQ:
2093 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2094 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2095 			break;
2096 		case NVME_OPC_CREATE_IO_CQ:
2097 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2098 			nvme_opc_create_io_cq(sc, cmd, &compl);
2099 			break;
2100 		case NVME_OPC_GET_LOG_PAGE:
2101 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2102 			nvme_opc_get_log_page(sc, cmd, &compl);
2103 			break;
2104 		case NVME_OPC_IDENTIFY:
2105 			DPRINTF("%s command IDENTIFY", __func__);
2106 			nvme_opc_identify(sc, cmd, &compl);
2107 			break;
2108 		case NVME_OPC_ABORT:
2109 			DPRINTF("%s command ABORT", __func__);
2110 			nvme_opc_abort(sc, cmd, &compl);
2111 			break;
2112 		case NVME_OPC_SET_FEATURES:
2113 			DPRINTF("%s command SET_FEATURES", __func__);
2114 			nvme_opc_set_features(sc, cmd, &compl);
2115 			break;
2116 		case NVME_OPC_GET_FEATURES:
2117 			DPRINTF("%s command GET_FEATURES", __func__);
2118 			nvme_opc_get_features(sc, cmd, &compl);
2119 			break;
2120 		case NVME_OPC_FIRMWARE_ACTIVATE:
2121 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2122 			pci_nvme_status_tc(&compl.status,
2123 			    NVME_SCT_COMMAND_SPECIFIC,
2124 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2125 			break;
2126 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2127 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2128 			nvme_opc_async_event_req(sc, cmd, &compl);
2129 			break;
2130 		case NVME_OPC_FORMAT_NVM:
2131 			DPRINTF("%s command FORMAT_NVM", __func__);
2132 			if ((sc->ctrldata.oacs &
2133 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2134 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2135 				break;
2136 			}
2137 			nvme_opc_format_nvm(sc, cmd, &compl);
2138 			break;
2139 		case NVME_OPC_SECURITY_SEND:
2140 		case NVME_OPC_SECURITY_RECEIVE:
2141 		case NVME_OPC_SANITIZE:
2142 		case NVME_OPC_GET_LBA_STATUS:
2143 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2144 			    cmd->opc);
2145 			/* Valid but unsupported opcodes */
2146 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2147 			break;
2148 		default:
2149 			DPRINTF("%s command OPC=%#X (not implemented)",
2150 			    __func__,
2151 			    cmd->opc);
2152 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2153 		}
2154 		sqhead = (sqhead + 1) % sq->size;
2155 
2156 		if (NVME_COMPLETION_VALID(compl)) {
2157 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2158 			    compl.cdw0,
2159 			    cmd->cid,
2160 			    0,		/* SQID */
2161 			    compl.status);
2162 		}
2163 	}
2164 
2165 	DPRINTF("setting sqhead %u", sqhead);
2166 	sq->head = sqhead;
2167 
2168 	if (cq->head != cq->tail)
2169 		pci_generate_msix(sc->nsc_pi, 0);
2170 
2171 	pthread_mutex_unlock(&sq->mtx);
2172 }
2173 
2174 /*
2175  * Update the Write and Read statistics reported in SMART data
2176  *
2177  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2178  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2179  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
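 *
 * For example, the first 512 byte block written pushes the remainder from 999
 * to 1,000 and immediately counts as one data unit; the next data unit is not
 * counted until block 1,001.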
2180  */
2181 static void
2182 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2183     size_t bytes, uint16_t status)
2184 {
2185 
2186 	pthread_mutex_lock(&sc->mtx);
2187 	switch (opc) {
2188 	case NVME_OPC_WRITE:
2189 		sc->write_commands++;
2190 		if (status != NVME_SC_SUCCESS)
2191 			break;
2192 		sc->write_dunits_remainder += (bytes / 512);
2193 		while (sc->write_dunits_remainder >= 1000) {
2194 			sc->write_data_units++;
2195 			sc->write_dunits_remainder -= 1000;
2196 		}
2197 		break;
2198 	case NVME_OPC_READ:
2199 		sc->read_commands++;
2200 		if (status != NVME_SC_SUCCESS)
2201 			break;
2202 		sc->read_dunits_remainder += (bytes / 512);
2203 		while (sc->read_dunits_remainder >= 1000) {
2204 			sc->read_data_units++;
2205 			sc->read_dunits_remainder -= 1000;
2206 		}
2207 		break;
2208 	default:
2209 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2210 		break;
2211 	}
2212 	pthread_mutex_unlock(&sc->mtx);
2213 }
2214 
2215 /*
2216  * Check if the combination of Starting LBA (slba) and number of blocks
2217  * exceeds the range of the underlying storage.
2218  *
2219  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2220  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2221  * overflow.
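 *
 * For example, with 512 byte sectors (sectsz_bits == 9), an slba with any of
 * its top 9 bits set would wrap when shifted into a byte offset, so it is
 * rejected before the shift is performed.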
2222  */
2223 static bool
2224 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2225     uint32_t nblocks)
2226 {
2227 	size_t	offset, bytes;
2228 
2229 	/* Overflow check of multiplying Starting LBA by the sector size */
2230 	if (slba >> (64 - nvstore->sectsz_bits))
2231 		return (true);
2232 
2233 	offset = slba << nvstore->sectsz_bits;
2234 	bytes = nblocks << nvstore->sectsz_bits;
2235 
2236 	/* Overflow check of Number of Logical Blocks */
2237 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2238 		return (true);
2239 
2240 	return (false);
2241 }
2242 
2243 static int
2244 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
2245     struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
2246 {
2247 	int iovidx;
2248 	bool range_is_contiguous;
2249 
2250 	if (req == NULL)
2251 		return (-1);
2252 
2253 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2254 		return (-1);
2255 	}
2256 
2257 	/*
2258 	 * Minimize the number of IOVs by concatenating contiguous address
2259 	 * ranges. If the IOV count is zero, there is no previous range to
2260 	 * concatenate.
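	 *
	 * For example, two PRP entries describing adjacent guest pages are
	 * folded into a single, larger iovec entry rather than two.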
2261 	 */
2262 	if (req->io_req.br_iovcnt == 0)
2263 		range_is_contiguous = false;
2264 	else
2265 		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2266 
2267 	if (range_is_contiguous) {
2268 		iovidx = req->io_req.br_iovcnt - 1;
2269 
2270 		req->io_req.br_iov[iovidx].iov_base =
2271 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2272 				     req->prev_gpaddr, size);
2273 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2274 			return (-1);
2275 
2276 		req->prev_size += size;
2277 		req->io_req.br_resid += size;
2278 
2279 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2280 	} else {
2281 		iovidx = req->io_req.br_iovcnt;
2282 		if (iovidx == 0) {
2283 			req->io_req.br_offset = offset;
2284 			req->io_req.br_resid = 0;
2285 			req->io_req.br_param = req;
2286 		}
2287 
2288 		req->io_req.br_iov[iovidx].iov_base =
2289 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2290 				     gpaddr, size);
2291 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2292 			return (-1);
2293 
2294 		req->io_req.br_iov[iovidx].iov_len = size;
2295 
2296 		req->prev_gpaddr = gpaddr;
2297 		req->prev_size = size;
2298 		req->io_req.br_resid += size;
2299 
2300 		req->io_req.br_iovcnt++;
2301 	}
2302 
2303 	return (0);
2304 }
2305 
2306 static void
2307 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2308     struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
2309 {
2310 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2311 
2312 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2313 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2314 		 NVME_STATUS_GET_SC(status));
2315 
2316 	pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
2317 
2318 	if (cq->head != cq->tail) {
2319 		if (cq->intr_en & NVME_CQ_INTEN) {
2320 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2321 		} else {
2322 			DPRINTF("%s: CQ%u interrupt disabled",
2323 						__func__, sq->cqid);
2324 		}
2325 	}
2326 }
2327 
2328 static void
2329 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2330 {
2331 	req->sc = NULL;
2332 	req->nvme_sq = NULL;
2333 	req->sqid = 0;
2334 
2335 	pthread_mutex_lock(&sc->mtx);
2336 
2337 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2338 	sc->pending_ios--;
2339 
2340 	/* when no more IO pending, can set to ready if device reset/enabled */
2341 	if (sc->pending_ios == 0 &&
2342 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2343 		sc->regs.csts |= NVME_CSTS_RDY;
2344 
2345 	pthread_mutex_unlock(&sc->mtx);
2346 
2347 	sem_post(&sc->iosemlock);
2348 }
2349 
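/*
 * Reserve an ioreq slot, blocking on iosemlock (initialized to ioslots) until
 * one becomes free.
 */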
2350 static struct pci_nvme_ioreq *
2351 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2352 {
2353 	struct pci_nvme_ioreq *req = NULL;
2354 
2355 	sem_wait(&sc->iosemlock);
2356 	pthread_mutex_lock(&sc->mtx);
2357 
2358 	req = STAILQ_FIRST(&sc->ioreqs_free);
2359 	assert(req != NULL);
2360 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2361 
2362 	req->sc = sc;
2363 
2364 	sc->pending_ios++;
2365 
2366 	pthread_mutex_unlock(&sc->mtx);
2367 
2368 	req->io_req.br_iovcnt = 0;
2369 	req->io_req.br_offset = 0;
2370 	req->io_req.br_resid = 0;
2371 	req->io_req.br_param = req;
2372 	req->prev_gpaddr = 0;
2373 	req->prev_size = 0;
2374 
2375 	return req;
2376 }
2377 
2378 static void
2379 pci_nvme_io_done(struct blockif_req *br, int err)
2380 {
2381 	struct pci_nvme_ioreq *req = br->br_param;
2382 	struct nvme_submission_queue *sq = req->nvme_sq;
2383 	uint16_t code, status;
2384 
2385 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2386 
2387 	/* TODO return correct error */
2388 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2389 	status = 0;
2390 	pci_nvme_status_genc(&status, code);
2391 
2392 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
2393 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2394 	    req->bytes, status);
2395 	pci_nvme_release_ioreq(req->sc, req);
2396 }
2397 
2398 /*
2399  * Implements the Flush command. The specification states:
2400  *    If a volatile write cache is not present, Flush commands complete
2401  *    successfully and have no effect
2402  * in the description of the Volatile Write Cache (VWC) field of the Identify
2403  * Controller data. Therefore, set status to Success if the command is
2404  * not supported (i.e. RAM or as indicated by the blockif).
2405  */
2406 static bool
2407 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
2408     struct nvme_command *cmd __unused,
2409     struct pci_nvme_blockstore *nvstore,
2410     struct pci_nvme_ioreq *req,
2411     uint16_t *status)
2412 {
2413 	bool pending = false;
2414 
2415 	if (nvstore->type == NVME_STOR_RAM) {
2416 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2417 	} else {
2418 		int err;
2419 
2420 		req->io_req.br_callback = pci_nvme_io_done;
2421 
2422 		err = blockif_flush(nvstore->ctx, &req->io_req);
2423 		switch (err) {
2424 		case 0:
2425 			pending = true;
2426 			break;
2427 		case EOPNOTSUPP:
2428 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2429 			break;
2430 		default:
2431 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2432 		}
2433 	}
2434 
2435 	return (pending);
2436 }
2437 
2438 static uint16_t
2439 nvme_write_read_ram(struct pci_nvme_softc *sc,
2440     struct pci_nvme_blockstore *nvstore,
2441     uint64_t prp1, uint64_t prp2,
2442     size_t offset, uint64_t bytes,
2443     bool is_write)
2444 {
2445 	uint8_t *buf = nvstore->ctx;
2446 	enum nvme_copy_dir dir;
2447 	uint16_t status;
2448 
2449 	if (is_write)
2450 		dir = NVME_COPY_TO_PRP;
2451 	else
2452 		dir = NVME_COPY_FROM_PRP;
2453 
2454 	status = 0;
2455 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2456 	    buf + offset, bytes, dir))
2457 		pci_nvme_status_genc(&status,
2458 		    NVME_SC_DATA_TRANSFER_ERROR);
2459 	else
2460 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2461 
2462 	return (status);
2463 }
2464 
2465 static uint16_t
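/*
 * Map a guest buffer described by PRP1/PRP2 onto blockif iovecs and issue the
 * read or write. PRP1 covers up to the end of its page; if the transfer needs
 * more than one additional page, PRP2 points to a list of page-sized PRP
 * entries whose last entry may chain to a further list.
 */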
2466 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2467     struct pci_nvme_blockstore *nvstore,
2468     struct pci_nvme_ioreq *req,
2469     uint64_t prp1, uint64_t prp2,
2470     size_t offset, uint64_t bytes,
2471     bool is_write)
2472 {
2473 	uint64_t size;
2474 	int err;
2475 	uint16_t status = NVME_NO_STATUS;
2476 
2477 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2478 	if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
2479 		err = -1;
2480 		goto out;
2481 	}
2482 
2483 	offset += size;
2484 	bytes  -= size;
2485 
2486 	if (bytes == 0) {
2487 		;
2488 	} else if (bytes <= PAGE_SIZE) {
2489 		size = bytes;
2490 		if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
2491 			err = -1;
2492 			goto out;
2493 		}
2494 	} else {
2495 		void *vmctx = sc->nsc_pi->pi_vmctx;
2496 		uint64_t *prp_list = &prp2;
2497 		uint64_t *last = prp_list;
2498 
2499 		/* PRP2 is pointer to a physical region page list */
2500 		while (bytes) {
2501 			/* Last entry in list points to the next list */
2502 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2503 				uint64_t prp = *prp_list;
2504 
2505 				prp_list = paddr_guest2host(vmctx, prp,
2506 				    PAGE_SIZE - (prp % PAGE_SIZE));
2507 				if (prp_list == NULL) {
2508 					err = -1;
2509 					goto out;
2510 				}
2511 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2512 			}
2513 
2514 			size = MIN(bytes, PAGE_SIZE);
2515 
2516 			if (pci_nvme_append_iov_req(sc, req, *prp_list, size,
2517 			    offset)) {
2518 				err = -1;
2519 				goto out;
2520 			}
2521 
2522 			offset += size;
2523 			bytes  -= size;
2524 
2525 			prp_list++;
2526 		}
2527 	}
2528 	req->io_req.br_callback = pci_nvme_io_done;
2529 	if (is_write)
2530 		err = blockif_write(nvstore->ctx, &req->io_req);
2531 	else
2532 		err = blockif_read(nvstore->ctx, &req->io_req);
2533 out:
2534 	if (err)
2535 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2536 
2537 	return (status);
2538 }
2539 
2540 static bool
2541 nvme_opc_write_read(struct pci_nvme_softc *sc,
2542     struct nvme_command *cmd,
2543     struct pci_nvme_blockstore *nvstore,
2544     struct pci_nvme_ioreq *req,
2545     uint16_t *status)
2546 {
2547 	uint64_t lba, nblocks, bytes;
2548 	size_t offset;
2549 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2550 	bool pending = false;
2551 
2552 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2553 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2554 	bytes = nblocks << nvstore->sectsz_bits;
2555 	if (bytes > NVME_MAX_DATA_SIZE) {
2556 		WPRINTF("%s command would exceed MDTS", __func__);
2557 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2558 		goto out;
2559 	}
2560 
2561 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2562 		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2563 		    __func__, lba, nblocks);
2564 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2565 		goto out;
2566 	}
2567 
2568 	offset = lba << nvstore->sectsz_bits;
2569 
2570 	req->bytes = bytes;
2571 	req->io_req.br_offset = lba;
2572 
2573 	/* PRP bits 1:0 must be zero */
2574 	cmd->prp1 &= ~0x3UL;
2575 	cmd->prp2 &= ~0x3UL;
2576 
2577 	if (nvstore->type == NVME_STOR_RAM) {
2578 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2579 		    cmd->prp2, offset, bytes, is_write);
2580 	} else {
2581 		*status = nvme_write_read_blockif(sc, nvstore, req,
2582 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2583 
2584 		if (*status == NVME_NO_STATUS)
2585 			pending = true;
2586 	}
2587 out:
2588 	if (!pending)
2589 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2590 
2591 	return (pending);
2592 }
2593 
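/*
 * Completion state machine for multi-range Dataset Management deallocate.
 * The caller stores each range's byte offset and length in br_iov, the
 * current range index in prev_gpaddr, and the range count in prev_size; each
 * callback issues blockif_delete() for the next range until all are done.
 */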
2594 static void
2595 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2596 {
2597 	struct pci_nvme_ioreq *req = br->br_param;
2598 	struct pci_nvme_softc *sc = req->sc;
2599 	bool done = true;
2600 	uint16_t status;
2601 
2602 	status = 0;
2603 	if (err) {
2604 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2605 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2606 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2607 	} else {
2608 		struct iovec *iov = req->io_req.br_iov;
2609 
2610 		req->prev_gpaddr++;
2611 		iov += req->prev_gpaddr;
2612 
2613 		/* The iov_* values already include the sector size */
2614 		req->io_req.br_offset = (off_t)iov->iov_base;
2615 		req->io_req.br_resid = iov->iov_len;
2616 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2617 			pci_nvme_status_genc(&status,
2618 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2619 		} else
2620 			done = false;
2621 	}
2622 
2623 	if (done) {
2624 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
2625 		    status);
2626 		pci_nvme_release_ioreq(sc, req);
2627 	}
2628 }
2629 
2630 static bool
2631 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2632     struct nvme_command *cmd,
2633     struct pci_nvme_blockstore *nvstore,
2634     struct pci_nvme_ioreq *req,
2635     uint16_t *status)
2636 {
2637 	struct nvme_dsm_range *range = NULL;
2638 	uint32_t nr, r, non_zero, dr;
2639 	int err;
2640 	bool pending = false;
2641 
2642 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2643 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2644 		goto out;
2645 	}
2646 
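	/* Number of Ranges (cdw10 bits 7:0) is a 0's based value */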
2647 	nr = cmd->cdw10 & 0xff;
2648 
2649 	/* copy locally because a range entry could straddle PRPs */
2650 	range = calloc(1, NVME_MAX_DSM_TRIM);
2651 	if (range == NULL) {
2652 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2653 		goto out;
2654 	}
2655 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2656 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2657 
2658 	/* Check for invalid ranges and the number of non-zero lengths */
2659 	non_zero = 0;
2660 	for (r = 0; r <= nr; r++) {
2661 		if (pci_nvme_out_of_range(nvstore,
2662 		    range[r].starting_lba, range[r].length)) {
2663 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2664 			goto out;
2665 		}
2666 		if (range[r].length != 0)
2667 			non_zero++;
2668 	}
2669 
2670 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2671 		size_t offset, bytes;
2672 		int sectsz_bits = sc->nvstore.sectsz_bits;
2673 
2674 		/*
2675 		 * DSM calls are advisory only, and compliant controllers
2676 		 * may choose to take no actions (i.e. return Success).
2677 		 */
2678 		if (!nvstore->deallocate) {
2679 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2680 			goto out;
2681 		}
2682 
2683 		/* If all ranges have a zero length, return Success */
2684 		if (non_zero == 0) {
2685 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2686 			goto out;
2687 		}
2688 
2689 		if (req == NULL) {
2690 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2691 			goto out;
2692 		}
2693 
2694 		offset = range[0].starting_lba << sectsz_bits;
2695 		bytes = range[0].length << sectsz_bits;
2696 
2697 		/*
2698 		 * If the request is for more than a single range, store
2699 		 * the ranges in the br_iov. Optimize for the common case
2700 		 * of a single range.
2701 		 *
2702 		 * Note that NVMe Number of Ranges is a zero based value
2703 		 */
2704 		req->io_req.br_iovcnt = 0;
2705 		req->io_req.br_offset = offset;
2706 		req->io_req.br_resid = bytes;
2707 
2708 		if (nr == 0) {
2709 			req->io_req.br_callback = pci_nvme_io_done;
2710 		} else {
2711 			struct iovec *iov = req->io_req.br_iov;
2712 
2713 			for (r = 0, dr = 0; r <= nr; r++) {
2714 				offset = range[r].starting_lba << sectsz_bits;
2715 				bytes = range[r].length << sectsz_bits;
2716 				if (bytes == 0)
2717 					continue;
2718 
2719 				if ((nvstore->size - offset) < bytes) {
2720 					pci_nvme_status_genc(status,
2721 					    NVME_SC_LBA_OUT_OF_RANGE);
2722 					goto out;
2723 				}
2724 				iov[dr].iov_base = (void *)offset;
2725 				iov[dr].iov_len = bytes;
2726 				dr++;
2727 			}
2728 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2729 
2730 			/*
2731 			 * Use prev_gpaddr to track the current entry and
2732 			 * prev_size to track the number of entries
2733 			 */
2734 			req->prev_gpaddr = 0;
2735 			req->prev_size = dr;
2736 		}
2737 
2738 		err = blockif_delete(nvstore->ctx, &req->io_req);
2739 		if (err)
2740 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2741 		else
2742 			pending = true;
2743 	}
2744 out:
2745 	free(range);
2746 	return (pending);
2747 }
2748 
2749 static void
2750 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2751 {
2752 	struct nvme_submission_queue *sq;
2753 	uint16_t status;
2754 	uint16_t sqhead;
2755 
2756 	/* handle all submissions up to sq->tail index */
2757 	sq = &sc->submit_queues[idx];
2758 
2759 	pthread_mutex_lock(&sq->mtx);
2760 
2761 	sqhead = sq->head;
2762 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2763 	         idx, sqhead, sq->tail, sq->qbase);
2764 
2765 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2766 		struct nvme_command *cmd;
2767 		struct pci_nvme_ioreq *req;
2768 		uint32_t nsid;
2769 		bool pending;
2770 
2771 		pending = false;
2772 		req = NULL;
2773 		status = 0;
2774 
2775 		cmd = &sq->qbase[sqhead];
2776 		sqhead = (sqhead + 1) % sq->size;
2777 
2778 		nsid = le32toh(cmd->nsid);
2779 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2780 			pci_nvme_status_genc(&status,
2781 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2782 			status |=
2783 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2784 			goto complete;
2785 		}
2786 
2787 		req = pci_nvme_get_ioreq(sc);
2788 		if (req == NULL) {
2789 			pci_nvme_status_genc(&status,
2790 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2791 			WPRINTF("%s: unable to allocate IO req", __func__);
2792 			goto complete;
2793 		}
2794 		req->nvme_sq = sq;
2795 		req->sqid = idx;
2796 		req->opc = cmd->opc;
2797 		req->cid = cmd->cid;
2798 		req->nsid = cmd->nsid;
2799 
2800 		switch (cmd->opc) {
2801 		case NVME_OPC_FLUSH:
2802 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2803 			    req, &status);
2804 			break;
2805 		case NVME_OPC_WRITE:
2806 		case NVME_OPC_READ:
2807 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2808 			    req, &status);
2809 			break;
2810 		case NVME_OPC_WRITE_ZEROES:
2811 			/* TODO: write zeroes
2812 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2813 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2814 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2815 			break;
2816 		case NVME_OPC_DATASET_MANAGEMENT:
2817 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2818 			    req, &status);
2819 			break;
2820 		default:
2821 			WPRINTF("%s unhandled io command 0x%x",
2822 			    __func__, cmd->opc);
2823 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2824 		}
2825 complete:
2826 		if (!pending) {
2827 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
2828 			if (req != NULL)
2829 				pci_nvme_release_ioreq(sc, req);
2830 		}
2831 	}
2832 
2833 	sq->head = sqhead;
2834 
2835 	pthread_mutex_unlock(&sq->mtx);
2836 }
2837 
2838 static void
2839 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc,
2840 	uint64_t idx, int is_sq, uint64_t value)
2841 {
2842 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2843 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2844 
2845 	if (is_sq) {
2846 		if (idx > sc->num_squeues) {
2847 			WPRINTF("%s queue index %lu overflow from "
2848 			         "guest (max %u)",
2849 			         __func__, idx, sc->num_squeues);
2850 			return;
2851 		}
2852 
2853 		atomic_store_short(&sc->submit_queues[idx].tail,
2854 		                   (uint16_t)value);
2855 
2856 		if (idx == 0) {
2857 			pci_nvme_handle_admin_cmd(sc, value);
2858 		} else {
2859 			/* submission queue; handle new entries in SQ */
2860 			if (idx > sc->num_squeues) {
2861 				WPRINTF("%s SQ index %lu overflow from "
2862 				         "guest (max %u)",
2863 				         __func__, idx, sc->num_squeues);
2864 				return;
2865 			}
2866 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2867 		}
2868 	} else {
2869 		if (idx > sc->num_cqueues) {
2870 			WPRINTF("%s queue index %lu overflow from "
2871 			         "guest (max %u)",
2872 			         __func__, idx, sc->num_cqueues);
2873 			return;
2874 		}
2875 
2876 		atomic_store_short(&sc->compl_queues[idx].head,
2877 				(uint16_t)value);
2878 	}
2879 }
2880 
2881 static void
2882 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2883 {
2884 	const char *s = iswrite ? "WRITE" : "READ";
2885 
2886 	switch (offset) {
2887 	case NVME_CR_CAP_LOW:
2888 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2889 		break;
2890 	case NVME_CR_CAP_HI:
2891 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2892 		break;
2893 	case NVME_CR_VS:
2894 		DPRINTF("%s %s NVME_CR_VS", func, s);
2895 		break;
2896 	case NVME_CR_INTMS:
2897 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2898 		break;
2899 	case NVME_CR_INTMC:
2900 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2901 		break;
2902 	case NVME_CR_CC:
2903 		DPRINTF("%s %s NVME_CR_CC", func, s);
2904 		break;
2905 	case NVME_CR_CSTS:
2906 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2907 		break;
2908 	case NVME_CR_NSSR:
2909 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2910 		break;
2911 	case NVME_CR_AQA:
2912 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2913 		break;
2914 	case NVME_CR_ASQ_LOW:
2915 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2916 		break;
2917 	case NVME_CR_ASQ_HI:
2918 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2919 		break;
2920 	case NVME_CR_ACQ_LOW:
2921 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2922 		break;
2923 	case NVME_CR_ACQ_HI:
2924 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2925 		break;
2926 	default:
2927 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2928 	}
2929 
2930 }
2931 
2932 static void
2933 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2934 	uint64_t offset, int size, uint64_t value)
2935 {
2936 	uint32_t ccreg;
2937 
2938 	if (offset >= NVME_DOORBELL_OFFSET) {
2939 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2940 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2941 		int is_sq = (belloffset % 8) < 4;
2942 
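		/*
		 * Doorbells are packed as (SQ tail, CQ head) pairs of 4 bytes
		 * each: queue index = offset / 8, and the low word of each
		 * pair selects the submission side.
		 */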
2943 		if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
2944 			WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
2945 			    offset);
2946 			return;
2947 		}
2948 
2949 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2950 			WPRINTF("guest attempted an overflow write offset "
2951 			         "0x%lx, val 0x%lx in %s",
2952 			         offset, value, __func__);
2953 			return;
2954 		}
2955 
2956 		if (is_sq) {
2957 			if (sc->submit_queues[idx].qbase == NULL)
2958 				return;
2959 		} else if (sc->compl_queues[idx].qbase == NULL)
2960 			return;
2961 
2962 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2963 		return;
2964 	}
2965 
2966 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2967 	        offset, size, value);
2968 
2969 	if (size != 4) {
2970 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2971 		         "val 0x%lx) to bar0 in %s",
2972 		         size, offset, value, __func__);
2973 		/* TODO: shutdown device */
2974 		return;
2975 	}
2976 
2977 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2978 
2979 	pthread_mutex_lock(&sc->mtx);
2980 
2981 	switch (offset) {
2982 	case NVME_CR_CAP_LOW:
2983 	case NVME_CR_CAP_HI:
2984 		/* readonly */
2985 		break;
2986 	case NVME_CR_VS:
2987 		/* readonly */
2988 		break;
2989 	case NVME_CR_INTMS:
2990 		/* MSI-X, so ignore */
2991 		break;
2992 	case NVME_CR_INTMC:
2993 		/* MSI-X, so ignore */
2994 		break;
2995 	case NVME_CR_CC:
2996 		ccreg = (uint32_t)value;
2997 
2998 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2999 		         "iocqes %u",
3000 		        __func__,
3001 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
3002 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
3003 			 NVME_CC_GET_IOCQES(ccreg));
3004 
3005 		if (NVME_CC_GET_SHN(ccreg)) {
3006 			/* perform shutdown - flush out data to backend */
3007 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
3008 			    NVME_CSTS_REG_SHST_SHIFT);
3009 			sc->regs.csts |= NVME_SHST_COMPLETE <<
3010 			    NVME_CSTS_REG_SHST_SHIFT;
3011 		}
3012 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
3013 			if (NVME_CC_GET_EN(ccreg) == 0)
3014 				/* transition 1->0 causes controller reset */
3015 				pci_nvme_reset_locked(sc);
3016 			else
3017 				pci_nvme_init_controller(ctx, sc);
3018 		}
3019 
3020 		/* Insert the iocqes, iosqes and en bits from the write */
3021 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
3022 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
3023 		if (NVME_CC_GET_EN(ccreg) == 0) {
3024 			/* Insert the ams, mps and css bit fields */
3025 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
3026 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
3027 			sc->regs.csts &= ~NVME_CSTS_RDY;
3028 		} else if ((sc->pending_ios == 0) &&
3029 		    !(sc->regs.csts & NVME_CSTS_CFS)) {
3030 			sc->regs.csts |= NVME_CSTS_RDY;
3031 		}
3032 		break;
3033 	case NVME_CR_CSTS:
3034 		break;
3035 	case NVME_CR_NSSR:
3036 		/* ignore writes; don't support subsystem reset */
3037 		break;
3038 	case NVME_CR_AQA:
3039 		sc->regs.aqa = (uint32_t)value;
3040 		break;
3041 	case NVME_CR_ASQ_LOW:
3042 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
3043 		               (0xFFFFF000 & value);
3044 		break;
3045 	case NVME_CR_ASQ_HI:
3046 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
3047 		               (value << 32);
3048 		break;
3049 	case NVME_CR_ACQ_LOW:
3050 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
3051 		               (0xFFFFF000 & value);
3052 		break;
3053 	case NVME_CR_ACQ_HI:
3054 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3055 		               (value << 32);
3056 		break;
3057 	default:
3058 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3059 		         __func__, offset, value, size);
3060 	}
3061 	pthread_mutex_unlock(&sc->mtx);
3062 }
3063 
3064 static void
3065 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi,
3066     int baridx, uint64_t offset, int size, uint64_t value)
3067 {
3068 	struct pci_nvme_softc* sc = pi->pi_arg;
3069 
3070 	if (baridx == pci_msix_table_bar(pi) ||
3071 	    baridx == pci_msix_pba_bar(pi)) {
3072 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3073 		         " value 0x%lx", baridx, offset, size, value);
3074 
3075 		pci_emul_msix_twrite(pi, offset, size, value);
3076 		return;
3077 	}
3078 
3079 	switch (baridx) {
3080 	case 0:
3081 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
3082 		break;
3083 
3084 	default:
3085 		DPRINTF("%s unknown baridx %d, val 0x%lx",
3086 		         __func__, baridx, value);
3087 	}
3088 }
3089 
3090 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3091 	uint64_t offset, int size)
3092 {
3093 	uint64_t value;
3094 
3095 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3096 
3097 	if (offset < NVME_DOORBELL_OFFSET) {
3098 		void *p = &(sc->regs);
3099 		pthread_mutex_lock(&sc->mtx);
3100 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3101 		pthread_mutex_unlock(&sc->mtx);
3102 	} else {
3103 		value = 0;
3104 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
3105 	}
3106 
3107 	switch (size) {
3108 	case 1:
3109 		value &= 0xFF;
3110 		break;
3111 	case 2:
3112 		value &= 0xFFFF;
3113 		break;
3114 	case 4:
3115 		value &= 0xFFFFFFFF;
3116 		break;
3117 	}
3118 
3119 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3120 	         offset, size, (uint32_t)value);
3121 
3122 	return (value);
3123 }
3124 
3125 
3126 
3127 static uint64_t
3128 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused,
3129     struct pci_devinst *pi, int baridx, uint64_t offset, int size)
3130 {
3131 	struct pci_nvme_softc* sc = pi->pi_arg;
3132 
3133 	if (baridx == pci_msix_table_bar(pi) ||
3134 	    baridx == pci_msix_pba_bar(pi)) {
3135 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3136 		        baridx, offset, size);
3137 
3138 		return pci_emul_msix_tread(pi, offset, size);
3139 	}
3140 
3141 	switch (baridx) {
3142 	case 0:
3143 		return pci_nvme_read_bar_0(sc, offset, size);
3144 
3145 	default:
3146 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3147 	}
3148 
3149 	return (0);
3150 }
3151 
3152 static int
3153 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3154 {
3155 	char bident[sizeof("XXX:XXX")];
3156 	const char *value;
3157 	uint32_t sectsz;
3158 
3159 	sc->max_queues = NVME_QUEUES;
3160 	sc->max_qentries = NVME_MAX_QENTRIES;
3161 	sc->ioslots = NVME_IOSLOTS;
3162 	sc->num_squeues = sc->max_queues;
3163 	sc->num_cqueues = sc->max_queues;
3164 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3165 	sectsz = 0;
3166 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3167 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3168 
3169 	value = get_config_value_node(nvl, "maxq");
3170 	if (value != NULL)
3171 		sc->max_queues = atoi(value);
3172 	value = get_config_value_node(nvl, "qsz");
3173 	if (value != NULL) {
3174 		sc->max_qentries = atoi(value);
3175 		if (sc->max_qentries <= 0) {
3176 			EPRINTLN("nvme: Invalid qsz option %d",
3177 			    sc->max_qentries);
3178 			return (-1);
3179 		}
3180 	}
3181 	value = get_config_value_node(nvl, "ioslots");
3182 	if (value != NULL) {
3183 		sc->ioslots = atoi(value);
3184 		if (sc->ioslots <= 0) {
3185 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3186 			return (-1);
3187 		}
3188 	}
3189 	value = get_config_value_node(nvl, "sectsz");
3190 	if (value != NULL)
3191 		sectsz = atoi(value);
3192 	value = get_config_value_node(nvl, "ser");
3193 	if (value != NULL) {
3194 		/*
3195 		 * This field indicates the Product Serial Number in
3196 		 * 7-bit ASCII; unused bytes should be space characters.
3197 		 * Ref: NVMe v1.3c.
3198 		 */
3199 		cpywithpad((char *)sc->ctrldata.sn,
3200 		    sizeof(sc->ctrldata.sn), value, ' ');
3201 	}
3202 	value = get_config_value_node(nvl, "eui64");
3203 	if (value != NULL)
3204 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3205 	value = get_config_value_node(nvl, "dsm");
3206 	if (value != NULL) {
3207 		if (strcmp(value, "auto") == 0)
3208 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3209 		else if (strcmp(value, "enable") == 0)
3210 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3211 		else if (strcmp(value, "disable") == 0)
3212 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3213 	}
3214 
3215 	value = get_config_value_node(nvl, "ram");
3216 	if (value != NULL) {
3217 		uint64_t sz = strtoull(value, NULL, 10);
3218 
3219 		sc->nvstore.type = NVME_STOR_RAM;
3220 		sc->nvstore.size = sz * 1024 * 1024;
3221 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3222 		sc->nvstore.sectsz = 4096;
3223 		sc->nvstore.sectsz_bits = 12;
3224 		if (sc->nvstore.ctx == NULL) {
3225 			EPRINTLN("nvme: Unable to allocate RAM");
3226 			return (-1);
3227 		}
3228 	} else {
3229 		snprintf(bident, sizeof(bident), "%u:%u",
3230 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3231 		sc->nvstore.ctx = blockif_open(nvl, bident);
3232 		if (sc->nvstore.ctx == NULL) {
3233 			EPRINTLN("nvme: Could not open backing file: %s",
3234 			    strerror(errno));
3235 			return (-1);
3236 		}
3237 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3238 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3239 	}
3240 
3241 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3242 		sc->nvstore.sectsz = sectsz;
3243 	else if (sc->nvstore.type != NVME_STOR_RAM)
3244 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
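	/* Derive log2 of the sector size, e.g. 512 -> 9, 4096 -> 12 */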
3245 	for (sc->nvstore.sectsz_bits = 9;
3246 	     (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3247 	     sc->nvstore.sectsz_bits++);
3248 
3249 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3250 		sc->max_queues = NVME_QUEUES;
3251 
3252 	return (0);
3253 }
3254 
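/*
 * blockif resize callback: pick up the new backing size, refresh the
 * namespace size fields, and post a Namespace Attribute Changed notice so the
 * guest can rescan the namespace.
 */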
3255 static void
3256 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
3257     size_t new_size)
3258 {
3259 	struct pci_nvme_softc *sc;
3260 	struct pci_nvme_blockstore *nvstore;
3261 	struct nvme_namespace_data *nd;
3262 
3263 	sc = arg;
3264 	nvstore = &sc->nvstore;
3265 	nd = &sc->nsdata;
3266 
3267 	nvstore->size = new_size;
3268 	pci_nvme_init_nsdata_size(nvstore, nd);
3269 
3270 	/* Add changed NSID to list */
3271 	sc->ns_log.ns[0] = 1;
3272 	sc->ns_log.ns[1] = 0;
3273 
3274 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3275 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3276 }
3277 
3278 static int
3279 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl)
3280 {
3281 	struct pci_nvme_softc *sc;
3282 	uint32_t pci_membar_sz;
3283 	int	error;
3284 
3285 	error = 0;
3286 
3287 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3288 	pi->pi_arg = sc;
3289 	sc->nsc_pi = pi;
3290 
3291 	error = pci_nvme_parse_config(sc, nvl);
3292 	if (error < 0)
3293 		goto done;
3294 	else
3295 		error = 0;
3296 
3297 	STAILQ_INIT(&sc->ioreqs_free);
3298 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3299 	for (uint32_t i = 0; i < sc->ioslots; i++) {
3300 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3301 	}
3302 
3303 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3304 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3305 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3306 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3307 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3308 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3309 
3310 	/*
3311 	 * Allocate size of NVMe registers + doorbell space for all queues.
3312 	 *
3313 	 * The specification requires a minimum memory I/O window size of 16K.
3314 	 * The Windows driver will refuse to start a device with a smaller
3315 	 * window.
3316 	 */
3317 	pci_membar_sz = sizeof(struct nvme_registers) +
3318 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3319 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3320 
3321 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3322 
3323 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3324 	if (error) {
3325 		WPRINTF("%s pci alloc mem bar failed", __func__);
3326 		goto done;
3327 	}
3328 
3329 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3330 	if (error) {
3331 		WPRINTF("%s pci add msixcap failed", __func__);
3332 		goto done;
3333 	}
3334 
3335 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3336 	if (error) {
3337 		WPRINTF("%s pci add Express capability failed", __func__);
3338 		goto done;
3339 	}
3340 
3341 	pthread_mutex_init(&sc->mtx, NULL);
3342 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3343 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3344 
3345 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3346 	/*
3347 	 * Controller data depends on Namespace data so initialize Namespace
3348 	 * data first.
3349 	 */
3350 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3351 	pci_nvme_init_ctrldata(sc);
3352 	pci_nvme_init_logpages(sc);
3353 	pci_nvme_init_features(sc);
3354 
3355 	pci_nvme_aer_init(sc);
3356 	pci_nvme_aen_init(sc);
3357 
3358 	pci_nvme_reset(sc);
3359 
3360 	pci_lintr_request(pi);
3361 
3362 done:
3363 	return (error);
3364 }
3365 
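/*
 * Convert a legacy option string into config nodes. A leading "ram=<size>"
 * becomes the "ram" node (e.g. "ram=512" selects a 512 MiB RAM-backed store);
 * options after the comma are parsed as generic key=value pairs, while
 * anything else is treated as a blockif configuration.
 */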
3366 static int
3367 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3368 {
3369 	char *cp, *ram;
3370 
3371 	if (opts == NULL)
3372 		return (0);
3373 
3374 	if (strncmp(opts, "ram=", 4) == 0) {
3375 		cp = strchr(opts, ',');
3376 		if (cp == NULL) {
3377 			set_config_value_node(nvl, "ram", opts + 4);
3378 			return (0);
3379 		}
3380 		ram = strndup(opts + 4, cp - opts - 4);
3381 		set_config_value_node(nvl, "ram", ram);
3382 		free(ram);
3383 		return (pci_parse_legacy_config(nvl, cp + 1));
3384 	} else
3385 		return (blockif_legacy_config(nvl, opts));
3386 }
3387 
3388 static const struct pci_devemu pci_de_nvme = {
3389 	.pe_emu =	"nvme",
3390 	.pe_init =	pci_nvme_init,
3391 	.pe_legacy_config = pci_nvme_legacy_config,
3392 	.pe_barwrite =	pci_nvme_write,
3393 	.pe_barread =	pci_nvme_read
3394 };
3395 PCI_EMUL_SET(pci_de_nvme);
3396