xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 206b73d0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Function crc16 Copyright (c) 2017, Fedor Uporov
8  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * bhyve PCIe-NVMe device emulation.
34  *
35  * options:
36  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37  *
38  *  accepted devpath:
39  *    /dev/blockdev
40  *    /path/to/image
41  *    ram=size_in_MiB
42  *
43  *  maxq    = max number of queues
44  *  qsz     = max elements in each queue
45  *  ioslots = max number of concurrent io requests
46  *  sectsz  = sector size (defaults to blockif sector size)
47  *  ser     = serial number (20-chars max)
48  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49  *
50  */
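/*
 * Example (hypothetical slot number, image path and values; any option may
 * be omitted to take the defaults described above):
 *
 *  -s 4,nvme,/path/to/image,maxq=8,qsz=1024,ioslots=16,sectsz=512,ser=BHYVE001
 *
 * or, backed by guest RAM instead of a block device or file:
 *
 *  -s 4,nvme,ram=512
 */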
51 
52 /* TODO:
53     - create async event for smart and log
54     - intr coalesce
55  */
56 
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59 
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
62 
63 #include <assert.h>
64 #include <pthread.h>
65 #include <semaphore.h>
66 #include <stdbool.h>
67 #include <stddef.h>
68 #include <stdint.h>
69 #include <stdio.h>
70 #include <stdlib.h>
71 #include <string.h>
72 
73 #include <machine/atomic.h>
74 #include <machine/vmm.h>
75 #include <vmmapi.h>
76 
77 #include <dev/nvme/nvme.h>
78 
79 #include "bhyverun.h"
80 #include "block_if.h"
81 #include "pci_emul.h"
82 
83 
84 static int nvme_debug = 0;
85 #define	DPRINTF(params) do { if (nvme_debug) printf params; } while (0)
86 #define	WPRINTF(params) printf params
87 
88 /* defaults; can be overridden */
89 #define	NVME_MSIX_BAR		4
90 
91 #define	NVME_IOSLOTS		8
92 
93 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
94 #define NVME_MMIO_SPACE_MIN	(1 << 14)
95 
96 #define	NVME_QUEUES		16
97 #define	NVME_MAX_QENTRIES	2048
98 
99 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
100 #define	NVME_MAX_BLOCKIOVS	512
101 
102 /* This is a synthetic status code to indicate there is no status */
103 #define NVME_NO_STATUS		0xffff
104 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
105 
106 /* helpers */
107 
108 /* Convert a zero-based value into a one-based value */
109 #define ONE_BASED(zero)		((zero) + 1)
110 /* Convert a one-based value into a zero-based value */
111 #define ZERO_BASED(one)		((one)  - 1)
112 
113 /* Encode number of SQ's and CQ's for Set/Get Features */
114 #define NVME_FEATURE_NUM_QUEUES(sc) \
115 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
116 	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
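/*
 * Example: with num_squeues = 4 and num_cqueues = 4 (one-based counts), the
 * encoded dword returned in CDW0 of the Set/Get Features completion is
 * 0x00030003, i.e. both counts are reported zero-based.
 */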
117 
118 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
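/*
 * The doorbell region begins at offset 0x1000 of BAR0 in the standard NVMe
 * register layout.  This emulation reports CAP.DSTRD = 0 (4-byte stride), so
 * each queue pair owns 8 bytes: the submission queue tail doorbell in the low
 * dword and the completion queue head doorbell in the high dword.  E.g.
 * 0x1000 = admin SQ tail, 0x1004 = admin CQ head, 0x1008 = I/O SQ 1 tail,
 * 0x100c = I/O CQ 1 head.
 */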
119 
120 enum nvme_controller_register_offsets {
121 	NVME_CR_CAP_LOW = 0x00,
122 	NVME_CR_CAP_HI  = 0x04,
123 	NVME_CR_VS      = 0x08,
124 	NVME_CR_INTMS   = 0x0c,
125 	NVME_CR_INTMC   = 0x10,
126 	NVME_CR_CC      = 0x14,
127 	NVME_CR_CSTS    = 0x1c,
128 	NVME_CR_NSSR    = 0x20,
129 	NVME_CR_AQA     = 0x24,
130 	NVME_CR_ASQ_LOW = 0x28,
131 	NVME_CR_ASQ_HI  = 0x2c,
132 	NVME_CR_ACQ_LOW = 0x30,
133 	NVME_CR_ACQ_HI  = 0x34,
134 };
135 
136 enum nvme_cmd_cdw11 {
137 	NVME_CMD_CDW11_PC  = 0x0001,
138 	NVME_CMD_CDW11_IEN = 0x0002,
139 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
140 };
141 
142 #define	NVME_CQ_INTEN	0x01
143 #define	NVME_CQ_INTCOAL	0x02
144 
145 struct nvme_completion_queue {
146 	struct nvme_completion *qbase;
147 	uint32_t	size;
148 	uint16_t	tail; /* nvme progress */
149 	uint16_t	head; /* guest progress */
150 	uint16_t	intr_vec;
151 	uint32_t	intr_en;
152 	pthread_mutex_t	mtx;
153 };
154 
155 struct nvme_submission_queue {
156 	struct nvme_command *qbase;
157 	uint32_t	size;
158 	uint16_t	head; /* nvme progress */
159 	uint16_t	tail; /* guest progress */
160 	uint16_t	cqid; /* completion queue id */
161 	int		busy; /* queue is being processed */
162 	int		qpriority;
163 };
164 
165 enum nvme_storage_type {
166 	NVME_STOR_BLOCKIF = 0,
167 	NVME_STOR_RAM = 1,
168 };
169 
170 struct pci_nvme_blockstore {
171 	enum nvme_storage_type type;
172 	void		*ctx;
173 	uint64_t	size;
174 	uint32_t	sectsz;
175 	uint32_t	sectsz_bits;
176 	uint64_t	eui64;
177 };
178 
179 struct pci_nvme_ioreq {
180 	struct pci_nvme_softc *sc;
181 	struct pci_nvme_ioreq *next;
182 	struct nvme_submission_queue *nvme_sq;
183 	uint16_t	sqid;
184 
185 	/* command information */
186 	uint16_t	opc;
187 	uint16_t	cid;
188 	uint32_t	nsid;
189 
190 	uint64_t	prev_gpaddr;
191 	size_t		prev_size;
192 
193 	/*
194 	 * lock if all iovs consumed (big IO);
195 	 * complete transaction before continuing
196 	 */
197 	pthread_mutex_t	mtx;
198 	pthread_cond_t	cv;
199 
200 	struct blockif_req io_req;
201 
202 	/* pad to fit up to 512 page descriptors from guest IO request */
203 	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
204 };
205 
206 struct pci_nvme_softc {
207 	struct pci_devinst *nsc_pi;
208 
209 	pthread_mutex_t	mtx;
210 
211 	struct nvme_registers regs;
212 
213 	struct nvme_namespace_data  nsdata;
214 	struct nvme_controller_data ctrldata;
215 	struct nvme_error_information_entry err_log;
216 	struct nvme_health_information_page health_log;
217 	struct nvme_firmware_page fw_log;
218 
219 	struct pci_nvme_blockstore nvstore;
220 
221 	uint16_t	max_qentries;	/* max entries per queue */
222 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
223 	uint32_t	num_cqueues;
224 	uint32_t	num_squeues;
225 
226 	struct pci_nvme_ioreq *ioreqs;
227 	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
228 	uint32_t	pending_ios;
229 	uint32_t	ioslots;
230 	sem_t		iosemlock;
231 
232 	/*
233 	 * Memory mapped Submission and Completion queues
234 	 * Each array includes both Admin and IO queues
235 	 */
236 	struct nvme_completion_queue *compl_queues;
237 	struct nvme_submission_queue *submit_queues;
238 
239 	/* controller features */
240 	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
241 	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
242 	uint32_t	async_ev_config;         /* 0x0B: async event config */
243 };
244 
245 
246 static void pci_nvme_io_partial(struct blockif_req *br, int err);
247 
248 /* Controller Configuration utils */
249 #define	NVME_CC_GET_EN(cc) \
250 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
251 #define	NVME_CC_GET_CSS(cc) \
252 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
253 #define	NVME_CC_GET_SHN(cc) \
254 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
255 #define	NVME_CC_GET_IOSQES(cc) \
256 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
257 #define	NVME_CC_GET_IOCQES(cc) \
258 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
259 
260 #define	NVME_CC_WRITE_MASK \
261 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
262 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
263 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
264 
265 #define	NVME_CC_NEN_WRITE_MASK \
266 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
267 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
268 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
269 
270 /* Controller Status utils */
271 #define	NVME_CSTS_GET_RDY(sts) \
272 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
273 
274 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
275 
276 /* Completion Queue status word utils */
277 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
278 #define	NVME_STATUS_MASK \
279 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
280 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
281 
282 static __inline void
283 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
284 {
285 	size_t len;
286 
287 	len = strnlen(src, dst_size);
288 	memset(dst, pad, dst_size);
289 	memcpy(dst, src, len);
290 }
291 
292 static __inline void
293 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
294 {
295 
296 	*status &= ~NVME_STATUS_MASK;
297 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
298 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
299 }
300 
301 static __inline void
302 pci_nvme_status_genc(uint16_t *status, uint16_t code)
303 {
304 
305 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
306 }
307 
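/*
 * The Phase Tag bit of a completion entry is inverted on each pass through
 * the completion queue; the guest compares it with its expected phase to
 * detect newly posted completions.
 */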
308 static __inline void
309 pci_nvme_toggle_phase(uint16_t *status, int prev)
310 {
311 
312 	if (prev)
313 		*status &= ~NVME_STATUS_P;
314 	else
315 		*status |= NVME_STATUS_P;
316 }
317 
318 static void
319 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
320 {
321 	struct nvme_controller_data *cd = &sc->ctrldata;
322 
323 	cd->vid = 0xFB5D;
324 	cd->ssvid = 0x0000;
325 
326 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
327 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
328 
329 	/* Num of submission commands that we can handle at a time (2^rab) */
330 	cd->rab   = 4;
331 
332 	/* FreeBSD OUI */
333 	cd->ieee[0] = 0x58;
334 	cd->ieee[1] = 0x9c;
335 	cd->ieee[2] = 0xfc;
336 
337 	cd->mic = 0;
338 
339 	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
340 
341 	cd->ver = 0x00010300;
342 
343 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
344 	cd->acl = 2;
345 	cd->aerl = 4;
346 
347 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
348 	cd->elpe = 0;	/* max error log page entries */
349 	cd->npss = 1;	/* number of power states supported */
350 
351 	/* Warning Composite Temperature Threshold */
352 	cd->wctemp = 0x0157;
353 
354 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
355 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
356 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
357 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
358 	cd->nn = 1;	/* number of namespaces */
359 
360 	cd->fna = 0x03;
361 
362 	cd->power_state[0].mp = 10;
363 }
364 
365 /*
366  * Calculate the CRC-16 of the given buffer
367  * See copyright attribution at top of file
368  */
369 static uint16_t
370 crc16(uint16_t crc, const void *buffer, unsigned int len)
371 {
372 	const unsigned char *cp = buffer;
373 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
374 	static uint16_t const crc16_table[256] = {
375 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
376 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
377 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
378 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
379 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
380 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
381 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
382 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
383 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
384 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
385 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
386 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
387 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
388 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
389 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
390 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
391 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
392 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
393 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
394 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
395 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
396 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
397 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
398 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
399 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
400 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
401 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
402 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
403 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
404 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
405 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
406 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
407 	};
408 
409 	while (len--)
410 		crc = (((crc >> 8) & 0xffU) ^
411 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
412 	return crc;
413 }
414 
415 static void
416 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
417     struct nvme_namespace_data *nd, uint32_t nsid,
418     uint64_t eui64)
419 {
420 
421 	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
422 	nd->ncap = nd->nsze;
423 	nd->nuse = nd->nsze;
424 
425 	/* Get LBA and backstore information from backing store */
426 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
427 	nd->flbas = 0;
428 
429 	/* Create an EUI-64 if user did not provide one */
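	/*
	 * The generated value combines the FreeBSD OUI with a CRC-16 of the
	 * VM name and PCI address, leaving the low 16 bits for the nsid.
	 */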
430 	if (eui64 == 0) {
431 		char *data = NULL;
432 
433 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
434 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
435 
436 		if (data != NULL) {
437 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
438 			free(data);
439 		}
440 		eui64 = (eui64 << 16) | (nsid & 0xffff);
441 	}
442 	be64enc(nd->eui64, eui64);
443 
444 	/* LBA data-sz = 2^lbads */
445 	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
446 }
447 
448 static void
449 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
450 {
451 
452 	memset(&sc->err_log, 0, sizeof(sc->err_log));
453 	memset(&sc->health_log, 0, sizeof(sc->health_log));
454 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
455 }
456 
457 static void
458 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
459 {
460 	DPRINTF(("%s\r\n", __func__));
461 
462 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
463 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
464 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
465 
466 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
467 
468 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
469 
470 	sc->regs.cc = 0;
471 	sc->regs.csts = 0;
472 
473 	sc->num_cqueues = sc->num_squeues = sc->max_queues;
474 	if (sc->submit_queues != NULL) {
475 		for (int i = 0; i < sc->num_squeues + 1; i++) {
476 			/*
477 			 * The Admin Submission Queue is at index 0.
478 			 * It must not be changed at reset otherwise the
479 			 * emulation will be out of sync with the guest.
480 			 */
481 			if (i != 0) {
482 				sc->submit_queues[i].qbase = NULL;
483 				sc->submit_queues[i].size = 0;
484 				sc->submit_queues[i].cqid = 0;
485 			}
486 			sc->submit_queues[i].tail = 0;
487 			sc->submit_queues[i].head = 0;
488 			sc->submit_queues[i].busy = 0;
489 		}
490 	} else
491 		sc->submit_queues = calloc(sc->num_squeues + 1,
492 		                        sizeof(struct nvme_submission_queue));
493 
494 	if (sc->compl_queues != NULL) {
495 		for (int i = 0; i < sc->num_cqueues + 1; i++) {
496 			/* See Admin Submission Queue note above */
497 			if (i != 0) {
498 				sc->compl_queues[i].qbase = NULL;
499 				sc->compl_queues[i].size = 0;
500 			}
501 
502 			sc->compl_queues[i].tail = 0;
503 			sc->compl_queues[i].head = 0;
504 		}
505 	} else {
506 		sc->compl_queues = calloc(sc->num_cqueues + 1,
507 		                        sizeof(struct nvme_completion_queue));
508 
509 		for (int i = 0; i < sc->num_cqueues + 1; i++)
510 			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
511 	}
512 }
513 
514 static void
515 pci_nvme_reset(struct pci_nvme_softc *sc)
516 {
517 	pthread_mutex_lock(&sc->mtx);
518 	pci_nvme_reset_locked(sc);
519 	pthread_mutex_unlock(&sc->mtx);
520 }
521 
522 static void
523 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
524 {
525 	uint16_t acqs, asqs;
526 
527 	DPRINTF(("%s\r\n", __func__));
528 
529 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
530 	sc->submit_queues[0].size = asqs;
531 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
532 	            sizeof(struct nvme_command) * asqs);
533 
534 	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
535 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
536 
537 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
538 	    NVME_AQA_REG_ACQS_MASK) + 1;
539 	sc->compl_queues[0].size = acqs;
540 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
541 	         sizeof(struct nvme_completion) * acqs);
542 	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
543 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
544 }
545 
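/*
 * Copy a buffer into guest memory described by a PRP1/PRP2 pair.  PRP1 may
 * point into the middle of a page while PRP2 names the second, page-aligned
 * page.  For example, with 4 KiB pages, a prp1 with page offset 0x200 and
 * len = 0x1000 copies 0xe00 bytes into the prp1 page and the remaining 0x200
 * bytes to prp2.  Transfers larger than 8 KiB, which would require walking a
 * PRP list, are rejected.
 */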
546 static int
547 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
548 	size_t len)
549 {
550 	uint8_t *dst;
551 	size_t bytes;
552 
553 	if (len > (8 * 1024)) {
554 		return (-1);
555 	}
556 
557 	/* Copy from the start of prp1 to the end of the physical page */
558 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
559 	bytes = MIN(bytes, len);
560 
561 	dst = vm_map_gpa(ctx, prp1, bytes);
562 	if (dst == NULL) {
563 		return (-1);
564 	}
565 
566 	memcpy(dst, src, bytes);
567 
568 	src += bytes;
569 
570 	len -= bytes;
571 	if (len == 0) {
572 		return (0);
573 	}
574 
575 	len = MIN(len, PAGE_SIZE);
576 
577 	dst = vm_map_gpa(ctx, prp2, len);
578 	if (dst == NULL) {
579 		return (-1);
580 	}
581 
582 	memcpy(dst, src, len);
583 
584 	return (0);
585 }
586 
587 static int
588 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
589 	struct nvme_completion* compl)
590 {
591 	uint16_t qid = command->cdw10 & 0xffff;
592 
593 	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
594 	if (qid == 0 || qid > sc->num_squeues) {
595 		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
596 		        __func__, qid, sc->num_squeues));
597 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
598 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
599 		return (1);
600 	}
601 
602 	sc->submit_queues[qid].qbase = NULL;
603 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
604 	return (1);
605 }
606 
607 static int
608 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
609 	struct nvme_completion* compl)
610 {
611 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
612 		uint16_t qid = command->cdw10 & 0xffff;
613 		struct nvme_submission_queue *nsq;
614 
615 		if ((qid == 0) || (qid > sc->num_squeues)) {
616 			WPRINTF(("%s queue index %u > num_squeues %u\r\n",
617 			        __func__, qid, sc->num_squeues));
618 			pci_nvme_status_tc(&compl->status,
619 			    NVME_SCT_COMMAND_SPECIFIC,
620 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
621 			return (1);
622 		}
623 
624 		nsq = &sc->submit_queues[qid];
625 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
626 
627 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
628 		              sizeof(struct nvme_command) * (size_t)nsq->size);
629 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
630 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
631 
632 		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
633 		        qid, nsq->size, nsq->qbase, nsq->cqid));
634 
635 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
636 
637 		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
638 		         __func__, qid));
639 	} else {
640 		/*
641 		 * Guest sent a non-contiguous submission queue request.
642 		 * This setting is not supported by this emulation.
643 		 */
644 		WPRINTF(("%s unsupported non-contig (list-based) "
645 		         "create i/o submission queue\r\n", __func__));
646 
647 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
648 	}
649 	return (1);
650 }
651 
652 static int
653 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
654 	struct nvme_completion* compl)
655 {
656 	uint16_t qid = command->cdw10 & 0xffff;
657 
658 	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
659 	if (qid == 0 || qid > sc->num_cqueues) {
660 		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
661 		        __func__, qid, sc->num_cqueues));
662 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
663 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
664 		return (1);
665 	}
666 
667 	sc->compl_queues[qid].qbase = NULL;
668 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
669 	return (1);
670 }
671 
672 static int
673 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
674 	struct nvme_completion* compl)
675 {
676 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
677 		uint16_t qid = command->cdw10 & 0xffff;
678 		struct nvme_completion_queue *ncq;
679 
680 		if ((qid == 0) || (qid > sc->num_cqueues)) {
681 			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
682 			        __func__, qid, sc->num_cqueues));
683 			pci_nvme_status_tc(&compl->status,
684 			    NVME_SCT_COMMAND_SPECIFIC,
685 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
686 			return (1);
687 		}
688 
689 		ncq = &sc->compl_queues[qid];
690 		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
691 		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
692 		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
693 
694 		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
695 		             command->prp1,
696 		             sizeof(struct nvme_completion) * (size_t)ncq->size);
697 
698 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
699 	} else {
700 		/*
701 		 * Non-contig completion queue unsupported.
702 		 */
703 		WPRINTF(("%s unsupported non-contig (list-based) "
704 		         "create i/o completion queue\r\n",
705 		         __func__));
706 
707 		/* 0x12 = Invalid Use of Controller Memory Buffer */
708 		pci_nvme_status_genc(&compl->status, 0x12);
709 	}
710 
711 	return (1);
712 }
713 
714 static int
715 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
716 	struct nvme_completion* compl)
717 {
718 	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * sizeof(uint32_t);
719 	uint8_t logpage = command->cdw10 & 0xFF;
720 
721 	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
722 
723 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
724 
725 	switch (logpage) {
726 	case NVME_LOG_ERROR:
727 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
728 		    command->prp2, (uint8_t *)&sc->err_log, logsize);
729 		break;
730 	case NVME_LOG_HEALTH_INFORMATION:
731 		/* TODO: present some smart info */
732 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
733 		    command->prp2, (uint8_t *)&sc->health_log, logsize);
734 		break;
735 	case NVME_LOG_FIRMWARE_SLOT:
736 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
737 		    command->prp2, (uint8_t *)&sc->fw_log, logsize);
738 		break;
739 	default:
740 		WPRINTF(("%s get log page %x command not supported\r\n",
741 		        __func__, logpage));
742 
743 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
744 		    NVME_SC_INVALID_LOG_PAGE);
745 	}
746 
747 	return (1);
748 }
749 
750 static int
751 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
752 	struct nvme_completion* compl)
753 {
754 	void *dest;
755 
756 	DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
757 	        command->cdw10 & 0xFF, command->nsid));
758 
759 	switch (command->cdw10 & 0xFF) {
760 	case 0x00: /* return Identify Namespace data structure */
761 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
762 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
763 		break;
764 	case 0x01: /* return Identify Controller data structure */
765 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
766 		    command->prp2, (uint8_t *)&sc->ctrldata,
767 		    sizeof(sc->ctrldata));
768 		break;
769 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
770 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
771 		                  sizeof(uint32_t) * 1024);
772 		((uint32_t *)dest)[0] = 1;
773 		((uint32_t *)dest)[1] = 0;
774 		break;
775 	case 0x11:
776 		pci_nvme_status_genc(&compl->status,
777 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
778 		return (1);
779 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
780 	case 0x10:
781 	case 0x12:
782 	case 0x13:
783 	case 0x14:
784 	case 0x15:
785 	default:
786 		DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
787 		         __func__, command->cdw10 & 0xFF));
788 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
789 		return (1);
790 	}
791 
792 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
793 	return (1);
794 }
795 
796 static int
797 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
798 	struct nvme_completion* compl)
799 {
800 	uint16_t nqr;	/* Number of Queues Requested */
801 
802 	nqr = command->cdw11 & 0xFFFF;
803 	if (nqr == 0xffff) {
804 		WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
805 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
806 		return (-1);
807 	}
808 
809 	sc->num_squeues = ONE_BASED(nqr);
810 	if (sc->num_squeues > sc->max_queues) {
811 		DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
812 					sc->max_queues));
813 		sc->num_squeues = sc->max_queues;
814 	}
815 
816 	nqr = (command->cdw11 >> 16) & 0xFFFF;
817 	if (nqr == 0xffff) {
818 		WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
819 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
820 		return (-1);
821 	}
822 
823 	sc->num_cqueues = ONE_BASED(nqr);
824 	if (sc->num_cqueues > sc->max_queues) {
825 		DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
826 					sc->max_queues));
827 		sc->num_cqueues = sc->max_queues;
828 	}
829 
830 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
831 
832 	return (0);
833 }
834 
835 static int
836 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
837 	struct nvme_completion* compl)
838 {
839 	int feature = command->cdw10 & 0xFF;
840 	uint32_t iv;
841 
842 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
843 	compl->cdw0 = 0;
844 
845 	switch (feature) {
846 	case NVME_FEAT_ARBITRATION:
847 		DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
848 		break;
849 	case NVME_FEAT_POWER_MANAGEMENT:
850 		DPRINTF(("  power management 0x%x\r\n", command->cdw11));
851 		break;
852 	case NVME_FEAT_LBA_RANGE_TYPE:
853 		DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
854 		break;
855 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
856 		DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
857 		break;
858 	case NVME_FEAT_ERROR_RECOVERY:
859 		DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
860 		break;
861 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
862 		DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
863 		break;
864 	case NVME_FEAT_NUMBER_OF_QUEUES:
865 		if (nvme_set_feature_queues(sc, command, compl) != 0)
			return (1);
866 		break;
867 	case NVME_FEAT_INTERRUPT_COALESCING:
868 		DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
869 
870 		/* in uS */
871 		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
872 
873 		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
874 		break;
875 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
876 		iv = command->cdw11 & 0xFFFF;
877 
878 		DPRINTF(("  interrupt vector configuration 0x%x\r\n",
879 		        command->cdw11));
880 
881 		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
882 			if (sc->compl_queues[i].intr_vec == iv) {
883 				if (command->cdw11 & (1 << 16))
884 					sc->compl_queues[i].intr_en |=
885 					                      NVME_CQ_INTCOAL;
886 				else
887 					sc->compl_queues[i].intr_en &=
888 					                     ~NVME_CQ_INTCOAL;
889 			}
890 		}
891 		break;
892 	case NVME_FEAT_WRITE_ATOMICITY:
893 		DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
894 		break;
895 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
896 		DPRINTF(("  async event configuration 0x%x\r\n",
897 		        command->cdw11));
898 		sc->async_ev_config = command->cdw11;
899 		break;
900 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
901 		DPRINTF(("  software progress marker 0x%x\r\n",
902 		        command->cdw11));
903 		break;
904 	case 0x0C:
905 		DPRINTF(("  autonomous power state transition 0x%x\r\n",
906 		        command->cdw11));
907 		break;
908 	default:
909 		WPRINTF(("%s invalid feature\r\n", __func__));
910 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
911 		return (1);
912 	}
913 
914 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
915 	return (1);
916 }
917 
918 static int
919 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
920 	struct nvme_completion* compl)
921 {
922 	int feature = command->cdw10 & 0xFF;
923 
924 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
925 
926 	compl->cdw0 = 0;
927 
928 	switch (feature) {
929 	case NVME_FEAT_ARBITRATION:
930 		DPRINTF(("  arbitration\r\n"));
931 		break;
932 	case NVME_FEAT_POWER_MANAGEMENT:
933 		DPRINTF(("  power management\r\n"));
934 		break;
935 	case NVME_FEAT_LBA_RANGE_TYPE:
936 		DPRINTF(("  lba range\r\n"));
937 		break;
938 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
939 		DPRINTF(("  temperature threshold\r\n"));
940 		switch ((command->cdw11 >> 20) & 0x3) {
941 		case 0:
942 			/* Over temp threshold */
943 			compl->cdw0 = 0xFFFF;
944 			break;
945 		case 1:
946 			/* Under temp threshold */
947 			compl->cdw0 = 0;
948 			break;
949 		default:
950 			WPRINTF(("  invalid threshold type select\r\n"));
951 			pci_nvme_status_genc(&compl->status,
952 			    NVME_SC_INVALID_FIELD);
953 			return (1);
954 		}
955 		break;
956 	case NVME_FEAT_ERROR_RECOVERY:
957 		DPRINTF(("  error recovery\r\n"));
958 		break;
959 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
960 		DPRINTF(("  volatile write cache\r\n"));
961 		break;
962 	case NVME_FEAT_NUMBER_OF_QUEUES:
963 		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
964 
965 		DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
966 		        compl->cdw0 & 0xFFFF,
967 		        (compl->cdw0 >> 16) & 0xFFFF));
968 
969 		break;
970 	case NVME_FEAT_INTERRUPT_COALESCING:
971 		DPRINTF(("  interrupt coalescing\r\n"));
972 		break;
973 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
974 		DPRINTF(("  interrupt vector configuration\r\n"));
975 		break;
976 	case NVME_FEAT_WRITE_ATOMICITY:
977 		DPRINTF(("  write atomicity\r\n"));
978 		break;
979 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
980 		DPRINTF(("  async event configuration\r\n"));
981 		sc->async_ev_config = command->cdw11;
982 		break;
983 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
984 		DPRINTF(("  software progress marker\r\n"));
985 		break;
986 	case 0x0C:
987 		DPRINTF(("  autonomous power state transition\r\n"));
988 		break;
989 	default:
990 		WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
991 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
992 		return (1);
993 	}
994 
995 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
996 	return (1);
997 }
998 
999 static int
1000 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1001 	struct nvme_completion* compl)
1002 {
1003 	DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
1004 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1005 
1006 	/* TODO: search for the command ID and abort it */
1007 
1008 	compl->cdw0 = 1;
1009 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1010 	return (1);
1011 }
1012 
1013 static int
1014 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1015 	struct nvme_command* command, struct nvme_completion* compl)
1016 {
1017 	DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
1018 
1019 	/*
1020 	 * TODO: raise events when they happen based on the Set Features cmd.
1021 	 * These events happen async, so only set completion successful if
1022 	 * there is an event reflective of the request to get event.
1023 	 */
1024 	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1025 	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1026 	return (0);
1027 }
1028 
1029 static void
1030 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1031 {
1032 	struct nvme_completion compl;
1033 	struct nvme_command *cmd;
1034 	struct nvme_submission_queue *sq;
1035 	struct nvme_completion_queue *cq;
1036 	int do_intr = 0;
1037 	uint16_t sqhead;
1038 
1039 	DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
1040 
1041 	sq = &sc->submit_queues[0];
1042 
1043 	sqhead = atomic_load_acq_short(&sq->head);
1044 
1045 	if (atomic_testandset_int(&sq->busy, 1)) {
1046 		DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
1047 		        __func__, sqhead, sq->tail));
1048 		return;
1049 	}
1050 
1051 	DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
1052 
1053 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1054 		cmd = &(sq->qbase)[sqhead];
1055 		compl.cdw0 = 0;
1056 		compl.status = 0;
1057 
1058 		switch (cmd->opc) {
1059 		case NVME_OPC_DELETE_IO_SQ:
1060 			DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
1061 			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
1062 			break;
1063 		case NVME_OPC_CREATE_IO_SQ:
1064 			DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
1065 			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
1066 			break;
1067 		case NVME_OPC_DELETE_IO_CQ:
1068 			DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
1069 			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
1070 			break;
1071 		case NVME_OPC_CREATE_IO_CQ:
1072 			DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
1073 			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
1074 			break;
1075 		case NVME_OPC_GET_LOG_PAGE:
1076 			DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
1077 			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
1078 			break;
1079 		case NVME_OPC_IDENTIFY:
1080 			DPRINTF(("%s command IDENTIFY\r\n", __func__));
1081 			do_intr |= nvme_opc_identify(sc, cmd, &compl);
1082 			break;
1083 		case NVME_OPC_ABORT:
1084 			DPRINTF(("%s command ABORT\r\n", __func__));
1085 			do_intr |= nvme_opc_abort(sc, cmd, &compl);
1086 			break;
1087 		case NVME_OPC_SET_FEATURES:
1088 			DPRINTF(("%s command SET_FEATURES\r\n", __func__));
1089 			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
1090 			break;
1091 		case NVME_OPC_GET_FEATURES:
1092 			DPRINTF(("%s command GET_FEATURES\r\n", __func__));
1093 			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
1094 			break;
1095 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1096 			DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
1097 			/* XXX don't care, unhandled for now
1098 			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
1099 			*/
1100 			compl.status = NVME_NO_STATUS;
1101 			break;
1102 		default:
1103 			WPRINTF(("0x%x command is not implemented\r\n",
1104 			    cmd->opc));
1105 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1106 			do_intr |= 1;
1107 		}
1108 
1109 		if (NVME_COMPLETION_VALID(compl)) {
1110 			struct nvme_completion *cp;
1111 			int phase;
1112 
1113 			cq = &sc->compl_queues[0];
1114 
1115 			cp = &(cq->qbase)[cq->tail];
1116 			cp->cdw0 = compl.cdw0;
1117 			cp->sqid = 0;
1118 			cp->sqhd = sqhead;
1119 			cp->cid = cmd->cid;
1120 
1121 			phase = NVME_STATUS_GET_P(cp->status);
1122 			cp->status = compl.status;
1123 			pci_nvme_toggle_phase(&cp->status, phase);
1124 
1125 			cq->tail = (cq->tail + 1) % cq->size;
1126 		}
1127 		sqhead = (sqhead + 1) % sq->size;
1128 	}
1129 
1130 	DPRINTF(("setting sqhead %u\r\n", sqhead));
1131 	atomic_store_short(&sq->head, sqhead);
1132 	atomic_store_int(&sq->busy, 0);
1133 
1134 	if (do_intr)
1135 		pci_generate_msix(sc->nsc_pi, 0);
1136 
1137 }
1138 
1139 static int
1140 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1141 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1142 {
1143 	int iovidx;
1144 
1145 	if (req != NULL) {
1146 		/* concatenate contig block-iovs to minimize number of iovs */
1147 		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1148 			iovidx = req->io_req.br_iovcnt - 1;
1149 
1150 			req->io_req.br_iov[iovidx].iov_base =
1151 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1152 			                     req->prev_gpaddr, size);
1153 
1154 			req->prev_size += size;
1155 			req->io_req.br_resid += size;
1156 
1157 			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1158 		} else {
1159 			pthread_mutex_lock(&req->mtx);
1160 
1161 			iovidx = req->io_req.br_iovcnt;
1162 			if (iovidx == NVME_MAX_BLOCKIOVS) {
1163 				int err = 0;
1164 
1165 				DPRINTF(("large I/O, doing partial req\r\n"));
1166 
1167 				iovidx = 0;
1168 				req->io_req.br_iovcnt = 0;
1169 
1170 				req->io_req.br_callback = pci_nvme_io_partial;
1171 
1172 				if (!do_write)
1173 					err = blockif_read(sc->nvstore.ctx,
1174 					                   &req->io_req);
1175 				else
1176 					err = blockif_write(sc->nvstore.ctx,
1177 					                    &req->io_req);
1178 
1179 				/* wait until req completes before cont */
1180 				if (err == 0)
1181 					pthread_cond_wait(&req->cv, &req->mtx);
1182 			}
1183 			if (iovidx == 0) {
1184 				req->io_req.br_offset = lba;
1185 				req->io_req.br_resid = 0;
1186 				req->io_req.br_param = req;
1187 			}
1188 
1189 			req->io_req.br_iov[iovidx].iov_base =
1190 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1191 			                     gpaddr, size);
1192 
1193 			req->io_req.br_iov[iovidx].iov_len = size;
1194 
1195 			req->prev_gpaddr = gpaddr;
1196 			req->prev_size = size;
1197 			req->io_req.br_resid += size;
1198 
1199 			req->io_req.br_iovcnt++;
1200 
1201 			pthread_mutex_unlock(&req->mtx);
1202 		}
1203 	} else {
1204 		/* RAM buffer: read/write directly */
1205 		void *p = sc->nvstore.ctx;
1206 		void *gptr;
1207 
1208 		if ((lba + size) > sc->nvstore.size) {
1209 			WPRINTF(("%s write would overflow RAM\r\n", __func__));
1210 			return (-1);
1211 		}
1212 
1213 		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1214 		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1215 		if (do_write)
1216 			memcpy(p, gptr, size);
1217 		else
1218 			memcpy(gptr, p, size);
1219 	}
1220 	return (0);
1221 }
1222 
1223 static void
1224 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1225 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1226 	uint32_t cdw0, uint16_t status, int ignore_busy)
1227 {
1228 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1229 	struct nvme_completion *compl;
1230 	int do_intr = 0;
1231 	int phase;
1232 
1233 	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1234 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1235 		 NVME_STATUS_GET_SC(status)));
1236 
1237 	pthread_mutex_lock(&cq->mtx);
1238 
1239 	assert(cq->qbase != NULL);
1240 
1241 	compl = &cq->qbase[cq->tail];
1242 
1243 	compl->sqhd = atomic_load_acq_short(&sq->head);
1244 	compl->sqid = sqid;
1245 	compl->cid = cid;
1246 
1247 	// toggle phase
1248 	phase = NVME_STATUS_GET_P(compl->status);
1249 	compl->status = status;
1250 	pci_nvme_toggle_phase(&compl->status, phase);
1251 
1252 	cq->tail = (cq->tail + 1) % cq->size;
1253 
1254 	if (cq->intr_en & NVME_CQ_INTEN)
1255 		do_intr = 1;
1256 
1257 	pthread_mutex_unlock(&cq->mtx);
1258 
1259 	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1260 		if (do_intr)
1261 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1262 }
1263 
1264 static void
1265 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1266 {
1267 	req->sc = NULL;
1268 	req->nvme_sq = NULL;
1269 	req->sqid = 0;
1270 
1271 	pthread_mutex_lock(&sc->mtx);
1272 
1273 	req->next = sc->ioreqs_free;
1274 	sc->ioreqs_free = req;
1275 	sc->pending_ios--;
1276 
1277 	/* when no more IO pending, can set to ready if device reset/enabled */
1278 	if (sc->pending_ios == 0 &&
1279 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1280 		sc->regs.csts |= NVME_CSTS_RDY;
1281 
1282 	pthread_mutex_unlock(&sc->mtx);
1283 
1284 	sem_post(&sc->iosemlock);
1285 }
1286 
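/*
 * Reserve a free I/O request slot.  The iosemlock semaphore is initialized to
 * the number of ioslots, so this blocks the submitting thread once all slots
 * are in flight and resumes when pci_nvme_release_ioreq() posts it.
 */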
1287 static struct pci_nvme_ioreq *
1288 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1289 {
1290 	struct pci_nvme_ioreq *req = NULL;
1291 
1292 	sem_wait(&sc->iosemlock);
1293 	pthread_mutex_lock(&sc->mtx);
1294 
1295 	req = sc->ioreqs_free;
1296 	assert(req != NULL);
1297 
1298 	sc->ioreqs_free = req->next;
1299 
1300 	req->next = NULL;
1301 	req->sc = sc;
1302 
1303 	sc->pending_ios++;
1304 
1305 	pthread_mutex_unlock(&sc->mtx);
1306 
1307 	req->io_req.br_iovcnt = 0;
1308 	req->io_req.br_offset = 0;
1309 	req->io_req.br_resid = 0;
1310 	req->io_req.br_param = req;
1311 	req->prev_gpaddr = 0;
1312 	req->prev_size = 0;
1313 
1314 	return req;
1315 }
1316 
1317 static void
1318 pci_nvme_io_done(struct blockif_req *br, int err)
1319 {
1320 	struct pci_nvme_ioreq *req = br->br_param;
1321 	struct nvme_submission_queue *sq = req->nvme_sq;
1322 	uint16_t code, status = 0;
1323 
1324 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1325 
1326 	/* TODO return correct error */
1327 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1328 	pci_nvme_status_genc(&status, code);
1329 
1330 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1331 	pci_nvme_release_ioreq(req->sc, req);
1332 }
1333 
1334 static void
1335 pci_nvme_io_partial(struct blockif_req *br, int err)
1336 {
1337 	struct pci_nvme_ioreq *req = br->br_param;
1338 
1339 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1340 
1341 	pthread_cond_signal(&req->cv);
1342 }
1343 
1344 
1345 static void
1346 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1347 {
1348 	struct nvme_submission_queue *sq;
1349 	uint16_t status = 0;
1350 	uint16_t sqhead;
1351 	int err;
1352 
1353 	/* handle all submissions up to sq->tail index */
1354 	sq = &sc->submit_queues[idx];
1355 
1356 	if (atomic_testandset_int(&sq->busy, 1)) {
1357 		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1358 		return;
1359 	}
1360 
1361 	sqhead = atomic_load_acq_short(&sq->head);
1362 
1363 	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1364 	         idx, sqhead, sq->tail, sq->qbase));
1365 
1366 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1367 		struct nvme_command *cmd;
1368 		struct pci_nvme_ioreq *req = NULL;
1369 		uint64_t lba;
1370 		uint64_t nblocks, bytes, size, cpsz;
1371 
1372 		/* TODO: support scatter gather list handling */
1373 
1374 		cmd = &sq->qbase[sqhead];
1375 		sqhead = (sqhead + 1) % sq->size;
1376 
1377 		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1378 
1379 		if (cmd->opc == NVME_OPC_FLUSH) {
1380 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1381 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1382 			                        status, 1);
1383 
1384 			continue;
1385 		} else if (cmd->opc == 0x08) {
1386 			/* TODO: write zeroes */
1387 			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1388 			        __func__, lba, cmd->cdw12 & 0xFFFF));
1389 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1390 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1391 			                        status, 1);
1392 
1393 			continue;
1394 		}
1395 
1396 		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1397 
1398 		bytes = nblocks * sc->nvstore.sectsz;
1399 
1400 		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1401 			req = pci_nvme_get_ioreq(sc);
1402 			req->nvme_sq = sq;
1403 			req->sqid = idx;
1404 		}
1405 
1406 		/*
1407 		 * If data starts mid-page and flows into the next page, then
1408 		 * increase page count
1409 		 */
1410 
1411 		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1412 		         "(%lu-bytes)\r\n",
1413 		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1414 		         cmd->opc == NVME_OPC_WRITE ?
1415 			     "WRITE" : "READ",
1416 		         lba, nblocks, bytes));
1417 
1418 		cmd->prp1 &= ~(0x03UL);
1419 		cmd->prp2 &= ~(0x03UL);
1420 
1421 		DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1422 
1423 		size = bytes;
1424 		lba *= sc->nvstore.sectsz;
1425 
1426 		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1427 
1428 		if (cpsz > bytes)
1429 			cpsz = bytes;
1430 
1431 		if (req != NULL) {
1432 			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1433 			                        cmd->cdw10;
1434 			req->opc = cmd->opc;
1435 			req->cid = cmd->cid;
1436 			req->nsid = cmd->nsid;
1437 		}
1438 
1439 		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1440 		    cmd->opc == NVME_OPC_WRITE, lba);
1441 		lba += cpsz;
1442 		size -= cpsz;
1443 
1444 		if (size == 0)
1445 			goto iodone;
1446 
1447 		if (size <= PAGE_SIZE) {
1448 			/* prp2 is second (and final) page in transfer */
1449 
1450 			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1451 			    size,
1452 			    cmd->opc == NVME_OPC_WRITE,
1453 			    lba);
1454 		} else {
1455 			uint64_t *prp_list;
1456 			int i;
1457 
1458 			/* prp2 is pointer to a physical region page list */
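			/*
			 * Each list entry names one guest data page; if the
			 * transfer does not fit in a single list of
			 * NVME_PRP2_ITEMS entries, the final entry chains to
			 * the next PRP list page instead of to data.
			 */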
1459 			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1460 			                            cmd->prp2, PAGE_SIZE);
1461 
1462 			i = 0;
1463 			while (size != 0) {
1464 				cpsz = MIN(size, PAGE_SIZE);
1465 
1466 				/*
1467 				 * Move to linked physical region page list
1468 				 * in last item.
1469 				 */
1470 				if (i == (NVME_PRP2_ITEMS-1) &&
1471 				    size > PAGE_SIZE) {
1472 					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1473 					prp_list = paddr_guest2host(
1474 					              sc->nsc_pi->pi_vmctx,
1475 					              prp_list[i], PAGE_SIZE);
1476 					i = 0;
1477 				}
1478 				if (prp_list[i] == 0) {
1479 					WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1480 					err = 1;
1481 					break;
1482 				}
1483 
1484 				err = pci_nvme_append_iov_req(sc, req,
1485 				    prp_list[i], cpsz,
1486 				    cmd->opc == NVME_OPC_WRITE, lba);
1487 				if (err)
1488 					break;
1489 
1490 				lba += cpsz;
1491 				size -= cpsz;
1492 				i++;
1493 			}
1494 		}
1495 
1496 iodone:
1497 		if (sc->nvstore.type == NVME_STOR_RAM) {
1498 			uint16_t code, status = 0;
1499 
1500 			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1501 			    NVME_SC_SUCCESS;
1502 			pci_nvme_status_genc(&status, code);
1503 
1504 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1505 			                        status, 1);
1506 
1507 			continue;
1508 		}
1509 
1510 
1511 		if (err)
1512 			goto do_error;
1513 
1514 		req->io_req.br_callback = pci_nvme_io_done;
1515 
1516 		err = 0;
1517 		switch (cmd->opc) {
1518 		case NVME_OPC_READ:
1519 			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1520 			break;
1521 		case NVME_OPC_WRITE:
1522 			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1523 			break;
1524 		default:
1525 			WPRINTF(("%s unhandled io command 0x%x\r\n",
1526 				 __func__, cmd->opc));
1527 			err = 1;
1528 		}
1529 
1530 do_error:
1531 		if (err) {
1532 			uint16_t status = 0;
1533 
1534 			pci_nvme_status_genc(&status,
1535 			    NVME_SC_DATA_TRANSFER_ERROR);
1536 
1537 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1538 			                        status, 1);
1539 			pci_nvme_release_ioreq(sc, req);
1540 		}
1541 	}
1542 
1543 	atomic_store_short(&sq->head, sqhead);
1544 	atomic_store_int(&sq->busy, 0);
1545 }
1546 
1547 static void
1548 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1549 	uint64_t idx, int is_sq, uint64_t value)
1550 {
1551 	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1552 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1553 
1554 	if (is_sq) {
1555 		atomic_store_short(&sc->submit_queues[idx].tail,
1556 		                   (uint16_t)value);
1557 
1558 		if (idx == 0) {
1559 			pci_nvme_handle_admin_cmd(sc, value);
1560 		} else {
1561 			/* submission queue; handle new entries in SQ */
1562 			if (idx > sc->num_squeues) {
1563 				WPRINTF(("%s SQ index %lu overflow from "
1564 				         "guest (max %u)\r\n",
1565 				         __func__, idx, sc->num_squeues));
1566 				return;
1567 			}
1568 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1569 		}
1570 	} else {
1571 		if (idx > sc->num_cqueues) {
1572 			WPRINTF(("%s queue index %lu overflow from "
1573 			         "guest (max %u)\r\n",
1574 			         __func__, idx, sc->num_cqueues));
1575 			return;
1576 		}
1577 
1578 		sc->compl_queues[idx].head = (uint16_t)value;
1579 	}
1580 }
1581 
1582 static void
1583 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1584 {
1585 	const char *s = iswrite ? "WRITE" : "READ";
1586 
1587 	switch (offset) {
1588 	case NVME_CR_CAP_LOW:
1589 		DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1590 		break;
1591 	case NVME_CR_CAP_HI:
1592 		DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1593 		break;
1594 	case NVME_CR_VS:
1595 		DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1596 		break;
1597 	case NVME_CR_INTMS:
1598 		DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1599 		break;
1600 	case NVME_CR_INTMC:
1601 		DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1602 		break;
1603 	case NVME_CR_CC:
1604 		DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1605 		break;
1606 	case NVME_CR_CSTS:
1607 		DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1608 		break;
1609 	case NVME_CR_NSSR:
1610 		DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1611 		break;
1612 	case NVME_CR_AQA:
1613 		DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1614 		break;
1615 	case NVME_CR_ASQ_LOW:
1616 		DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1617 		break;
1618 	case NVME_CR_ASQ_HI:
1619 		DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1620 		break;
1621 	case NVME_CR_ACQ_LOW:
1622 		DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1623 		break;
1624 	case NVME_CR_ACQ_HI:
1625 		DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1626 		break;
1627 	default:
1628 		DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1629 	}
1630 
1631 }
1632 
1633 static void
1634 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1635 	uint64_t offset, int size, uint64_t value)
1636 {
1637 	uint32_t ccreg;
1638 
1639 	if (offset >= NVME_DOORBELL_OFFSET) {
1640 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1641 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
1642 		int is_sq = (belloffset % 8) < 4;
1643 
1644 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1645 			WPRINTF(("guest attempted an overflow write offset "
1646 			         "0x%lx, val 0x%lx in %s",
1647 			         offset, value, __func__));
1648 			return;
1649 		}
1650 
1651 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1652 		return;
1653 	}
1654 
1655 	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1656 	        offset, size, value));
1657 
1658 	if (size != 4) {
1659 		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1660 		         "val 0x%lx) to bar0 in %s",
1661 		         size, offset, value, __func__));
1662 		/* TODO: shutdown device */
1663 		return;
1664 	}
1665 
1666 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1667 
1668 	pthread_mutex_lock(&sc->mtx);
1669 
1670 	switch (offset) {
1671 	case NVME_CR_CAP_LOW:
1672 	case NVME_CR_CAP_HI:
1673 		/* readonly */
1674 		break;
1675 	case NVME_CR_VS:
1676 		/* readonly */
1677 		break;
1678 	case NVME_CR_INTMS:
1679 		/* MSI-X, so ignore */
1680 		break;
1681 	case NVME_CR_INTMC:
1682 		/* MSI-X, so ignore */
1683 		break;
1684 	case NVME_CR_CC:
1685 		ccreg = (uint32_t)value;
1686 
1687 		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1688 		         "iocqes %u\r\n",
1689 		        __func__,
1690 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1691 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1692 			 NVME_CC_GET_IOCQES(ccreg)));
1693 
1694 		if (NVME_CC_GET_SHN(ccreg)) {
1695 			/* perform shutdown - flush out data to backend */
1696 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1697 			    NVME_CSTS_REG_SHST_SHIFT);
1698 			sc->regs.csts |= NVME_SHST_COMPLETE <<
1699 			    NVME_CSTS_REG_SHST_SHIFT;
1700 		}
1701 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1702 			if (NVME_CC_GET_EN(ccreg) == 0)
1703 				/* transition 1->0 causes controller reset */
1704 				pci_nvme_reset_locked(sc);
1705 			else
1706 				pci_nvme_init_controller(ctx, sc);
1707 		}
1708 
1709 		/* Insert the iocqes, iosqes and en bits from the write */
1710 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1711 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1712 		if (NVME_CC_GET_EN(ccreg) == 0) {
1713 			/* Insert the ams, mps and css bit fields */
1714 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1715 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1716 			sc->regs.csts &= ~NVME_CSTS_RDY;
1717 		} else if (sc->pending_ios == 0) {
1718 			sc->regs.csts |= NVME_CSTS_RDY;
1719 		}
1720 		break;
1721 	case NVME_CR_CSTS:
1722 		break;
1723 	case NVME_CR_NSSR:
1724 		/* ignore writes; don't support subsystem reset */
1725 		break;
1726 	case NVME_CR_AQA:
1727 		sc->regs.aqa = (uint32_t)value;
1728 		break;
1729 	case NVME_CR_ASQ_LOW:
1730 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1731 		               (0xFFFFF000 & value);
1732 		break;
1733 	case NVME_CR_ASQ_HI:
1734 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1735 		               (value << 32);
1736 		break;
1737 	case NVME_CR_ACQ_LOW:
1738 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1739 		               (0xFFFFF000 & value);
1740 		break;
1741 	case NVME_CR_ACQ_HI:
1742 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1743 		               (value << 32);
1744 		break;
1745 	default:
1746 		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1747 		         __func__, offset, value, size));
1748 	}
1749 	pthread_mutex_unlock(&sc->mtx);
1750 }
1751 
1752 static void
1753 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1754                 int baridx, uint64_t offset, int size, uint64_t value)
1755 {
1756 	struct pci_nvme_softc* sc = pi->pi_arg;
1757 
1758 	if (baridx == pci_msix_table_bar(pi) ||
1759 	    baridx == pci_msix_pba_bar(pi)) {
1760 		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1761 		         " value 0x%lx\r\n", baridx, offset, size, value));
1762 
1763 		pci_emul_msix_twrite(pi, offset, size, value);
1764 		return;
1765 	}
1766 
1767 	switch (baridx) {
1768 	case 0:
1769 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1770 		break;
1771 
1772 	default:
1773 		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1774 		         __func__, baridx, value));
1775 	}
1776 }
1777 
1778 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1779 	uint64_t offset, int size)
1780 {
1781 	uint64_t value;
1782 
1783 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1784 
1785 	if (offset < NVME_DOORBELL_OFFSET) {
1786 		void *p = &(sc->regs);
1787 		pthread_mutex_lock(&sc->mtx);
1788 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
1789 		pthread_mutex_unlock(&sc->mtx);
1790 	} else {
1791 		value = 0;
1792                 WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
1793 	}
1794 
1795 	switch (size) {
1796 	case 1:
1797 		value &= 0xFF;
1798 		break;
1799 	case 2:
1800 		value &= 0xFFFF;
1801 		break;
1802 	case 4:
1803 		value &= 0xFFFFFFFF;
1804 		break;
1805 	}
1806 
1807 	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1808 	         offset, size, (uint32_t)value));
1809 
1810 	return (value);
1811 }
1812 
1813 
1814 
1815 static uint64_t
1816 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1817     uint64_t offset, int size)
1818 {
1819 	struct pci_nvme_softc* sc = pi->pi_arg;
1820 
1821 	if (baridx == pci_msix_table_bar(pi) ||
1822 	    baridx == pci_msix_pba_bar(pi)) {
1823 		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1824 		        baridx, offset, size));
1825 
1826 		return pci_emul_msix_tread(pi, offset, size);
1827 	}
1828 
1829 	switch (baridx) {
1830 	case 0:
1831        		return pci_nvme_read_bar_0(sc, offset, size);
1832 
1833 	default:
1834 		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1835 	}
1836 
1837 	return (0);
1838 }
1839 
1840 
1841 static int
1842 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1843 {
1844 	char bident[sizeof("XX:X:X")];
1845 	char	*uopt, *xopts, *config;
1846 	uint32_t sectsz;
1847 	int optidx;
1848 
1849 	sc->max_queues = NVME_QUEUES;
1850 	sc->max_qentries = NVME_MAX_QENTRIES;
1851 	sc->ioslots = NVME_IOSLOTS;
1852 	sc->num_squeues = sc->max_queues;
1853 	sc->num_cqueues = sc->max_queues;
1854 	sectsz = 0;
1855 
1856 	uopt = strdup(opts);
1857 	optidx = 0;
1858 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1859 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1860 	for (xopts = strtok(uopt, ",");
1861 	     xopts != NULL;
1862 	     xopts = strtok(NULL, ",")) {
1863 
1864 		if ((config = strchr(xopts, '=')) != NULL)
1865 			*config++ = '\0';
1866 
1867 		if (!strcmp("maxq", xopts)) {
1868 			sc->max_queues = atoi(config);
1869 		} else if (!strcmp("qsz", xopts)) {
1870 			sc->max_qentries = atoi(config);
1871 		} else if (!strcmp("ioslots", xopts)) {
1872 			sc->ioslots = atoi(config);
1873 		} else if (!strcmp("sectsz", xopts)) {
1874 			sectsz = atoi(config);
1875 		} else if (!strcmp("ser", xopts)) {
1876 			/*
1877 			 * This field indicates the Product Serial Number in
1878 			 * 7-bit ASCII, unused bytes should be space characters.
1879 			 * Ref: NVMe v1.3c.
1880 			 */
1881 			cpywithpad((char *)sc->ctrldata.sn,
1882 			           sizeof(sc->ctrldata.sn), config, ' ');
1883 		} else if (!strcmp("ram", xopts)) {
1884 			uint64_t sz = strtoull(config, NULL, 10);
1885 
1886 			sc->nvstore.type = NVME_STOR_RAM;
1887 			sc->nvstore.size = sz * 1024 * 1024;
1888 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1889 			sc->nvstore.sectsz = 4096;
1890 			sc->nvstore.sectsz_bits = 12;
1891 			if (sc->nvstore.ctx == NULL) {
1892 				perror("Unable to allocate RAM");
1893 				free(uopt);
1894 				return (-1);
1895 			}
1896 		} else if (!strcmp("eui64", xopts)) {
1897 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
1898 		} else if (optidx == 0) {
1899 			snprintf(bident, sizeof(bident), "%d:%d",
1900 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1901 			sc->nvstore.ctx = blockif_open(xopts, bident);
1902 			if (sc->nvstore.ctx == NULL) {
1903 				perror("Could not open backing file");
1904 				free(uopt);
1905 				return (-1);
1906 			}
1907 			sc->nvstore.type = NVME_STOR_BLOCKIF;
1908 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1909 		} else {
1910 			fprintf(stderr, "Invalid option %s\n", xopts);
1911 			free(uopt);
1912 			return (-1);
1913 		}
1914 
1915 		optidx++;
1916 	}
1917 	free(uopt);
1918 
1919 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1920 		fprintf(stderr, "backing store not specified\n");
1921 		return (-1);
1922 	}
1923 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1924 		sc->nvstore.sectsz = sectsz;
1925 	else if (sc->nvstore.type != NVME_STOR_RAM)
1926 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
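	/* Derive log2 of the sector size, e.g. 512 -> 9, 4096 -> 12. */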
1927 	for (sc->nvstore.sectsz_bits = 9;
1928 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1929 	     sc->nvstore.sectsz_bits++);
1930 
1931 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1932 		sc->max_queues = NVME_QUEUES;
1933 
1934 	if (sc->max_qentries <= 0) {
1935 		fprintf(stderr, "Invalid qsz option\n");
1936 		return (-1);
1937 	}
1938 	if (sc->ioslots <= 0) {
1939 		fprintf(stderr, "Invalid ioslots option\n");
1940 		return (-1);
1941 	}
1942 
1943 	return (0);
1944 }
1945 
1946 static int
1947 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1948 {
1949 	struct pci_nvme_softc *sc;
1950 	uint32_t pci_membar_sz;
1951 	int	error;
1952 
1953 	error = 0;
1954 
1955 	sc = calloc(1, sizeof(struct pci_nvme_softc));
1956 	pi->pi_arg = sc;
1957 	sc->nsc_pi = pi;
1958 
1959 	error = pci_nvme_parse_opts(sc, opts);
1960 	if (error < 0)
1961 		goto done;
1962 	else
1963 		error = 0;
1964 
1965 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1966 	for (int i = 0; i < sc->ioslots; i++) {
1967 		if (i < (sc->ioslots-1))
1968 			sc->ioreqs[i].next = &sc->ioreqs[i+1];
1969 		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1970 		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1971 	}
1972 	sc->ioreqs_free = sc->ioreqs;
1973 	sc->intr_coales_aggr_thresh = 1;
1974 
1975 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1976 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1977 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1978 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1979 	pci_set_cfgdata8(pi, PCIR_PROGIF,
1980 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1981 
1982 	/*
1983 	 * Allocate size of NVMe registers + doorbell space for all queues.
1984 	 *
1985 	 * The specification requires a minimum memory I/O window size of 16K.
1986 	 * The Windows driver will refuse to start a device with a smaller
1987 	 * window.
1988 	 */
1989 	pci_membar_sz = sizeof(struct nvme_registers) +
1990 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
1991 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
1992 
1993 	DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1994 
1995 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1996 	if (error) {
1997 		WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1998 		goto done;
1999 	}
2000 
2001 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2002 	if (error) {
2003 		WPRINTF(("%s pci add msixcap failed\r\n", __func__));
2004 		goto done;
2005 	}
2006 
2007 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2008 	if (error) {
2009 		WPRINTF(("%s pci add Express capability failed\r\n", __func__));
2010 		goto done;
2011 	}
2012 
2013 	pthread_mutex_init(&sc->mtx, NULL);
2014 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2015 
2016 	pci_nvme_reset(sc);
2017 	pci_nvme_init_ctrldata(sc);
2018 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2019 	pci_nvme_init_logpages(sc);
2020 
2021 	pci_lintr_request(pi);
2022 
2023 done:
2024 	return (error);
2025 }
2026 
2027 
2028 struct pci_devemu pci_de_nvme = {
2029 	.pe_emu =	"nvme",
2030 	.pe_init =	pci_nvme_init,
2031 	.pe_barwrite =	pci_nvme_write,
2032 	.pe_barread =	pci_nvme_read
2033 };
2034 PCI_EMUL_SET(pci_de_nvme);
2035