xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 0957b409)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * bhyve PCIe-NVMe device emulation.
31  *
32  * options:
33  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
34  *
35  *  accepted devpath:
36  *    /dev/blockdev
37  *    /path/to/image
38  *    ram=size_in_MiB
39  *
40  *  maxq    = max number of queues
41  *  qsz     = max elements in each queue
42  *  ioslots = max number of concurrent io requests
43  *  sectsz  = sector size (defaults to blockif sector size)
44  *  ser     = serial number (20-chars max)
45  *
46  */
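
/*
 * Example invocations (illustrative device paths and option values):
 *
 *  -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=8,qsz=256,ioslots=16,sectsz=512,ser=NVME0001
 *  -s 4,nvme,ram=1024
 */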
47 
48 /* TODO:
49     - generate async events for SMART and log page changes
50     - interrupt coalescing
51  */
52 
53 #include <sys/cdefs.h>
54 __FBSDID("$FreeBSD$");
55 
56 #include <sys/types.h>
57 
58 #include <assert.h>
59 #include <pthread.h>
60 #include <semaphore.h>
61 #include <stdbool.h>
62 #include <stddef.h>
63 #include <stdint.h>
64 #include <stdio.h>
65 #include <stdlib.h>
66 #include <string.h>
67 
68 #include <machine/atomic.h>
69 #include <machine/vmm.h>
70 #include <vmmapi.h>
71 
72 #include <dev/nvme/nvme.h>
73 
74 #include "bhyverun.h"
75 #include "block_if.h"
76 #include "pci_emul.h"
77 
78 
79 static int nvme_debug = 0;
80 #define	DPRINTF(params) if (nvme_debug) printf params
81 #define	WPRINTF(params) printf params
82 
83 /* defaults; can be overridden */
84 #define	NVME_MSIX_BAR		4
85 
86 #define	NVME_IOSLOTS		8
87 
88 #define	NVME_QUEUES		16
89 #define	NVME_MAX_QENTRIES	2048
90 
91 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
92 #define	NVME_MAX_BLOCKIOVS	512
93 
94 /* helpers */
95 
96 /* Convert a zero-based value into a one-based value */
97 #define ONE_BASED(zero)		((zero) + 1)
98 /* Convert a one-based value into a zero-based value */
99 #define ZERO_BASED(one)		((one)  - 1)
100 
101 /* Encode number of SQ's and CQ's for Set/Get Features */
102 #define NVME_FEATURE_NUM_QUEUES(sc) \
103 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
104 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
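/*
 * Illustrative example: with num_squeues = 4 and num_cqueues = 2, the
 * zero-based encoding above yields 0x00010003 (NCQR in bits 31:16,
 * NSQR in bits 15:0).
 */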
105 
106 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
107 
108 enum nvme_controller_register_offsets {
109 	NVME_CR_CAP_LOW = 0x00,
110 	NVME_CR_CAP_HI  = 0x04,
111 	NVME_CR_VS      = 0x08,
112 	NVME_CR_INTMS   = 0x0c,
113 	NVME_CR_INTMC   = 0x10,
114 	NVME_CR_CC      = 0x14,
115 	NVME_CR_CSTS    = 0x1c,
116 	NVME_CR_NSSR    = 0x20,
117 	NVME_CR_AQA     = 0x24,
118 	NVME_CR_ASQ_LOW = 0x28,
119 	NVME_CR_ASQ_HI  = 0x2c,
120 	NVME_CR_ACQ_LOW = 0x30,
121 	NVME_CR_ACQ_HI  = 0x34,
122 };
123 
124 enum nvme_cmd_cdw11 {
125 	NVME_CMD_CDW11_PC  = 0x0001,
126 	NVME_CMD_CDW11_IEN = 0x0002,
127 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
128 };
129 
130 #define	NVME_CQ_INTEN	0x01
131 #define	NVME_CQ_INTCOAL	0x02
132 
133 struct nvme_completion_queue {
134 	struct nvme_completion *qbase;
135 	uint32_t	size;
136 	uint16_t	tail; /* nvme progress */
137 	uint16_t	head; /* guest progress */
138 	uint16_t	intr_vec;
139 	uint32_t	intr_en;
140 	pthread_mutex_t	mtx;
141 };
142 
143 struct nvme_submission_queue {
144 	struct nvme_command *qbase;
145 	uint32_t	size;
146 	uint16_t	head; /* nvme progress */
147 	uint16_t	tail; /* guest progress */
148 	uint16_t	cqid; /* completion queue id */
149 	int		busy; /* queue is being processed */
150 	int		qpriority;
151 };
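
/*
 * Both queue types are simple rings indexed modulo their size: the producer
 * advances tail and the consumer advances head.  For completion queue entries,
 * the phase bit is flipped relative to the value already stored in the slot
 * (see pci_nvme_toggle_phase()) so the guest can detect newly posted entries.
 */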
152 
153 enum nvme_storage_type {
154 	NVME_STOR_BLOCKIF = 0,
155 	NVME_STOR_RAM = 1,
156 };
157 
158 struct pci_nvme_blockstore {
159 	enum nvme_storage_type type;
160 	void		*ctx;
161 	uint64_t	size;
162 	uint32_t	sectsz;
163 	uint32_t	sectsz_bits;
164 };
165 
166 struct pci_nvme_ioreq {
167 	struct pci_nvme_softc *sc;
168 	struct pci_nvme_ioreq *next;
169 	struct nvme_submission_queue *nvme_sq;
170 	uint16_t	sqid;
171 
172 	/* command information */
173 	uint16_t	opc;
174 	uint16_t	cid;
175 	uint32_t	nsid;
176 
177 	uint64_t	prev_gpaddr;
178 	size_t		prev_size;
179 
180 	/*
181 	 * lock/condvar used when all iovs are consumed (big I/O);
182 	 * the partial transaction must complete before continuing
183 	 */
184 	pthread_mutex_t	mtx;
185 	pthread_cond_t	cv;
186 
187 	struct blockif_req io_req;
188 
189 	/* pad to fit up to 512 page descriptors from guest IO request */
190 	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
191 };
192 
193 struct pci_nvme_softc {
194 	struct pci_devinst *nsc_pi;
195 
196 	pthread_mutex_t	mtx;
197 
198 	struct nvme_registers regs;
199 
200 	struct nvme_namespace_data  nsdata;
201 	struct nvme_controller_data ctrldata;
202 
203 	struct pci_nvme_blockstore nvstore;
204 
205 	uint16_t	max_qentries;	/* max entries per queue */
206 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
207 	uint32_t	num_cqueues;
208 	uint32_t	num_squeues;
209 
210 	struct pci_nvme_ioreq *ioreqs;
211 	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
212 	uint32_t	pending_ios;
213 	uint32_t	ioslots;
214 	sem_t		iosemlock;
215 
216 	/*
217 	 * Memory mapped Submission and Completion queues
218 	 * Each array includes both Admin and IO queues
219 	 */
220 	struct nvme_completion_queue *compl_queues;
221 	struct nvme_submission_queue *submit_queues;
222 
223 	/* controller features */
224 	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
225 	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
226 	uint32_t	async_ev_config;         /* 0x0B: async event config */
227 };
228 
229 
230 static void pci_nvme_io_partial(struct blockif_req *br, int err);
231 
232 /* Controller Configuration utils */
233 #define	NVME_CC_GET_EN(cc) \
234 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
235 #define	NVME_CC_GET_CSS(cc) \
236 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
237 #define	NVME_CC_GET_SHN(cc) \
238 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
239 #define	NVME_CC_GET_IOSQES(cc) \
240 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
241 #define	NVME_CC_GET_IOCQES(cc) \
242 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
243 
244 #define	NVME_CC_WRITE_MASK \
245 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
246 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
247 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
248 
249 #define	NVME_CC_NEN_WRITE_MASK \
250 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
251 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
252 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
253 
254 /* Controller Status utils */
255 #define	NVME_CSTS_GET_RDY(sts) \
256 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
257 
258 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
259 
260 /* Completion Queue status word utils */
261 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
262 #define	NVME_STATUS_MASK \
263 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
264 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
265 
266 static __inline void
267 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
268 {
269 	size_t len;
270 
271 	len = strnlen(src, dst_size);
272 	memset(dst, pad, dst_size);
273 	memcpy(dst, src, len);
274 }
275 
276 static __inline void
277 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
278 {
279 
280 	*status &= ~NVME_STATUS_MASK;
281 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
282 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
283 }
284 
285 static __inline void
286 pci_nvme_status_genc(uint16_t *status, uint16_t code)
287 {
288 
289 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
290 }
291 
292 static __inline void
293 pci_nvme_toggle_phase(uint16_t *status, int prev)
294 {
295 
296 	if (prev)
297 		*status &= ~NVME_STATUS_P;
298 	else
299 		*status |= NVME_STATUS_P;
300 }
301 
302 static void
303 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
304 {
305 	struct nvme_controller_data *cd = &sc->ctrldata;
306 
307 	cd->vid = 0xFB5D;
308 	cd->ssvid = 0x0000;
309 
310 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
311 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
312 
313 	/* Recommended Arbitration Burst: burst size of 2^rab commands */
314 	cd->rab   = 4;
315 
316 	/* FreeBSD OUI */
317 	cd->ieee[0] = 0x58;
318 	cd->ieee[1] = 0x9c;
319 	cd->ieee[2] = 0xfc;
320 
321 	cd->mic = 0;
322 
323 	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
324 
325 	cd->ver = 0x00010300;
326 
327 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
328 	cd->acl = 2;
329 	cd->aerl = 4;
330 
331 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
332 	cd->elpe = 0;	/* max error log page entries */
333 	cd->npss = 1;	/* number of power states supported */
334 
335 	/* Warning Composite Temperature Threshold */
336 	cd->wctemp = 0x0157;
337 
338 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
339 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
340 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
341 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
342 	cd->nn = 1;	/* number of namespaces */
343 
344 	cd->fna = 0x03;
345 
346 	cd->power_state[0].mp = 10;
347 }
348 
349 static void
350 pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
351 {
352 	struct nvme_namespace_data *nd;
353 
354 	nd = &sc->nsdata;
355 
356 	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
357 	nd->ncap = nd->nsze;
358 	nd->nuse = nd->nsze;
359 
360 	/* Get LBA format and size information from the backing store */
361 	nd->nlbaf = 1;
362 	/* LBA data-sz = 2^lbads */
363 	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
364 
365 	nd->flbas = 0;
366 }
367 
368 static void
369 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
370 {
371 	DPRINTF(("%s\r\n", __func__));
372 
373 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
374 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
375 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
376 
377 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
378 
379 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
380 
381 	sc->regs.cc = 0;
382 	sc->regs.csts = 0;
383 
384 	sc->num_cqueues = sc->num_squeues = sc->max_queues;
385 	if (sc->submit_queues != NULL) {
386 		for (int i = 0; i < sc->num_squeues + 1; i++) {
387 			/*
388 			 * The Admin Submission Queue is at index 0.
389 			 * It must not be changed at reset otherwise the
390 			 * emulation will be out of sync with the guest.
391 			 */
392 			if (i != 0) {
393 				sc->submit_queues[i].qbase = NULL;
394 				sc->submit_queues[i].size = 0;
395 				sc->submit_queues[i].cqid = 0;
396 			}
397 			sc->submit_queues[i].tail = 0;
398 			sc->submit_queues[i].head = 0;
399 			sc->submit_queues[i].busy = 0;
400 		}
401 	} else
402 		sc->submit_queues = calloc(sc->num_squeues + 1,
403 		                        sizeof(struct nvme_submission_queue));
404 
405 	if (sc->compl_queues != NULL) {
406 		for (int i = 0; i < sc->num_cqueues + 1; i++) {
407 			/* See Admin Submission Queue note above */
408 			if (i != 0) {
409 				sc->compl_queues[i].qbase = NULL;
410 				sc->compl_queues[i].size = 0;
411 			}
412 
413 			sc->compl_queues[i].tail = 0;
414 			sc->compl_queues[i].head = 0;
415 		}
416 	} else {
417 		sc->compl_queues = calloc(sc->num_cqueues + 1,
418 		                        sizeof(struct nvme_completion_queue));
419 
420 		for (int i = 0; i < sc->num_cqueues + 1; i++)
421 			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
422 	}
423 }
424 
425 static void
426 pci_nvme_reset(struct pci_nvme_softc *sc)
427 {
428 	pthread_mutex_lock(&sc->mtx);
429 	pci_nvme_reset_locked(sc);
430 	pthread_mutex_unlock(&sc->mtx);
431 }
432 
433 static void
434 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
435 {
436 	uint16_t acqs, asqs;
437 
438 	DPRINTF(("%s\r\n", __func__));
439 
440 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
441 	sc->submit_queues[0].size = asqs;
442 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
443 	            sizeof(struct nvme_command) * asqs);
444 
445 	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
446 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
447 
448 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
449 	    NVME_AQA_REG_ACQS_MASK) + 1;
450 	sc->compl_queues[0].size = acqs;
451 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
452 	         sizeof(struct nvme_completion) * acqs);
453 	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
454 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
455 }
456 
457 static int
458 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
459 	struct nvme_completion* compl)
460 {
461 	uint16_t qid = command->cdw10 & 0xffff;
462 
463 	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
464 	if (qid == 0 || qid > sc->num_squeues) {
465 		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
466 		        __func__, qid, sc->num_squeues));
467 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
468 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
469 		return (1);
470 	}
471 
472 	sc->submit_queues[qid].qbase = NULL;
473 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
474 	return (1);
475 }
476 
477 static int
478 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
479 	struct nvme_completion* compl)
480 {
481 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
482 		uint16_t qid = command->cdw10 & 0xffff;
483 		struct nvme_submission_queue *nsq;
484 
485 		if ((qid == 0) || (qid > sc->num_squeues)) {
486 			WPRINTF(("%s queue index %u > num_squeues %u\r\n",
487 			        __func__, qid, sc->num_squeues));
488 			pci_nvme_status_tc(&compl->status,
489 			    NVME_SCT_COMMAND_SPECIFIC,
490 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
491 			return (1);
492 		}
493 
494 		nsq = &sc->submit_queues[qid];
495 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
496 
497 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
498 		              sizeof(struct nvme_command) * (size_t)nsq->size);
499 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
500 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
501 
502 		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
503 		        qid, nsq->size, nsq->qbase, nsq->cqid));
504 
505 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
506 
507 		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
508 		         __func__, qid));
509 	} else {
510 		/*
511 		 * Guest sent a non-contiguous submission queue request.
512 		 * This setting is unsupported by this emulation.
513 		 */
514 		WPRINTF(("%s unsupported non-contig (list-based) "
515 		         "create i/o submission queue\r\n", __func__));
516 
517 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
518 	}
519 	return (1);
520 }
521 
522 static int
523 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
524 	struct nvme_completion* compl)
525 {
526 	uint16_t qid = command->cdw10 & 0xffff;
527 
528 	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
529 	if (qid == 0 || qid > sc->num_cqueues) {
530 		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
531 		        __func__, qid, sc->num_cqueues));
532 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
533 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
534 		return (1);
535 	}
536 
537 	sc->compl_queues[qid].qbase = NULL;
538 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
539 	return (1);
540 }
541 
542 static int
543 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
544 	struct nvme_completion* compl)
545 {
546 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
547 		uint16_t qid = command->cdw10 & 0xffff;
548 		struct nvme_completion_queue *ncq;
549 
550 		if ((qid == 0) || (qid > sc->num_cqueues)) {
551 			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
552 			        __func__, qid, sc->num_cqueues));
553 			pci_nvme_status_tc(&compl->status,
554 			    NVME_SCT_COMMAND_SPECIFIC,
555 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
556 			return (1);
557 		}
558 
559 		ncq = &sc->compl_queues[qid];
560 		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
561 		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
562 		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
563 
564 		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
565 		             command->prp1,
566 		             sizeof(struct nvme_completion) * (size_t)ncq->size);
567 
568 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
569 	} else {
570 		/*
571 		 * Non-contig completion queue unsupported.
572 		 */
573 		WPRINTF(("%s unsupported non-contig (list-based) "
574 		         "create i/o completion queue\r\n",
575 		         __func__));
576 
577 		/* 0x12 = Invalid Use of Controller Memory Buffer */
578 		pci_nvme_status_genc(&compl->status, 0x12);
579 	}
580 
581 	return (1);
582 }
583 
584 static int
585 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
586 	struct nvme_completion* compl)
587 {
588 	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
589 	uint8_t logpage = command->cdw10 & 0xFF;
590 	void *data;
591 
592 	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
593 
594 	if (logpage >= 1 && logpage <= 3)
595 		data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
596 		                  PAGE_SIZE);
597 
598 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
599 
600 	switch (logpage) {
601 	case 0x01: /* Error information */
602 		memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
603 		break;
604 	case 0x02: /* SMART/Health information */
605 		/* TODO: present some smart info */
606 		memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
607 		break;
608 	case 0x03: /* Firmware slot information */
609 		memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
610 		break;
611 	default:
612 		WPRINTF(("%s get log page %x command not supported\r\n",
613 		        __func__, logpage));
614 
615 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
616 		    NVME_SC_INVALID_LOG_PAGE);
617 	}
618 
619 	return (1);
620 }
621 
622 static int
623 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
624 	struct nvme_completion* compl)
625 {
626 	void *dest;
627 
628 	DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
629 	        command->cdw10 & 0xFF, command->nsid));
630 
631 	switch (command->cdw10 & 0xFF) {
632 	case 0x00: /* return Identify Namespace data structure */
633 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
634 		                  sizeof(sc->nsdata));
635 		memcpy(dest, &sc->nsdata, sizeof(sc->nsdata));
636 		break;
637 	case 0x01: /* return Identify Controller data structure */
638 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
639 		                  sizeof(sc->ctrldata));
640 		memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata));
641 		break;
642 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
643 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
644 		                  sizeof(uint32_t) * 1024);
645 		((uint32_t *)dest)[0] = 1;
646 		((uint32_t *)dest)[1] = 0;
647 		break;
648 	case 0x11:
649 		pci_nvme_status_genc(&compl->status,
650 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
651 		return (1);
652 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
653 	case 0x10:
654 	case 0x12:
655 	case 0x13:
656 	case 0x14:
657 	case 0x15:
658 	default:
659 		DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
660 		         __func__, command->cdw10 & 0xFF));
661 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
662 		return (1);
663 	}
664 
665 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
666 	return (1);
667 }
668 
669 static int
670 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
671 	struct nvme_completion* compl)
672 {
673 	uint16_t nqr;	/* Number of Queues Requested */
674 
675 	nqr = command->cdw11 & 0xFFFF;
676 	if (nqr == 0xffff) {
677 		WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
678 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
679 		return (-1);
680 	}
681 
682 	sc->num_squeues = ONE_BASED(nqr);
683 	if (sc->num_squeues > sc->max_queues) {
684 		DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
685 					sc->max_queues));
686 		sc->num_squeues = sc->max_queues;
687 	}
688 
689 	nqr = (command->cdw11 >> 16) & 0xFFFF;
690 	if (nqr == 0xffff) {
691 		WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
692 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
693 		return (-1);
694 	}
695 
696 	sc->num_cqueues = ONE_BASED(nqr);
697 	if (sc->num_cqueues > sc->max_queues) {
698 		DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
699 					sc->max_queues));
700 		sc->num_cqueues = sc->max_queues;
701 	}
702 
703 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
704 
705 	return (0);
706 }
707 
708 static int
709 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
710 	struct nvme_completion* compl)
711 {
712 	int feature = command->cdw10 & 0xFF;
713 	uint32_t iv;
714 
715 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
716 	compl->cdw0 = 0;
717 
718 	switch (feature) {
719 	case NVME_FEAT_ARBITRATION:
720 		DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
721 		break;
722 	case NVME_FEAT_POWER_MANAGEMENT:
723 		DPRINTF(("  power management 0x%x\r\n", command->cdw11));
724 		break;
725 	case NVME_FEAT_LBA_RANGE_TYPE:
726 		DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
727 		break;
728 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
729 		DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
730 		break;
731 	case NVME_FEAT_ERROR_RECOVERY:
732 		DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
733 		break;
734 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
735 		DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
736 		break;
737 	case NVME_FEAT_NUMBER_OF_QUEUES:
738 		nvme_set_feature_queues(sc, command, compl);
739 		break;
740 	case NVME_FEAT_INTERRUPT_COALESCING:
741 		DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
742 
743 		/* in uS */
744 		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
745 
746 		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
747 		break;
748 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
749 		iv = command->cdw11 & 0xFFFF;
750 
751 		DPRINTF(("  interrupt vector configuration 0x%x\r\n",
752 		        command->cdw11));
753 
754 		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
755 			if (sc->compl_queues[i].intr_vec == iv) {
756 				if (command->cdw11 & (1 << 16))
757 					sc->compl_queues[i].intr_en |=
758 					                      NVME_CQ_INTCOAL;
759 				else
760 					sc->compl_queues[i].intr_en &=
761 					                     ~NVME_CQ_INTCOAL;
762 			}
763 		}
764 		break;
765 	case NVME_FEAT_WRITE_ATOMICITY:
766 		DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
767 		break;
768 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
769 		DPRINTF(("  async event configuration 0x%x\r\n",
770 		        command->cdw11));
771 		sc->async_ev_config = command->cdw11;
772 		break;
773 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
774 		DPRINTF(("  software progress marker 0x%x\r\n",
775 		        command->cdw11));
776 		break;
777 	case 0x0C:
778 		DPRINTF(("  autonomous power state transition 0x%x\r\n",
779 		        command->cdw11));
780 		break;
781 	default:
782 		WPRINTF(("%s invalid feature\r\n", __func__));
783 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
784 		return (1);
785 	}
786 
787 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
788 	return (1);
789 }
790 
791 static int
792 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
793 	struct nvme_completion* compl)
794 {
795 	int feature = command->cdw10 & 0xFF;
796 
797 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
798 
799 	compl->cdw0 = 0;
800 
801 	switch (feature) {
802 	case NVME_FEAT_ARBITRATION:
803 		DPRINTF(("  arbitration\r\n"));
804 		break;
805 	case NVME_FEAT_POWER_MANAGEMENT:
806 		DPRINTF(("  power management\r\n"));
807 		break;
808 	case NVME_FEAT_LBA_RANGE_TYPE:
809 		DPRINTF(("  lba range\r\n"));
810 		break;
811 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
812 		DPRINTF(("  temperature threshold\r\n"));
813 		switch ((command->cdw11 >> 20) & 0x3) {
814 		case 0:
815 			/* Over temp threshold */
816 			compl->cdw0 = 0xFFFF;
817 			break;
818 		case 1:
819 			/* Under temp threshold */
820 			compl->cdw0 = 0;
821 			break;
822 		default:
823 			WPRINTF(("  invalid threshold type select\r\n"));
824 			pci_nvme_status_genc(&compl->status,
825 			    NVME_SC_INVALID_FIELD);
826 			return (1);
827 		}
828 		break;
829 	case NVME_FEAT_ERROR_RECOVERY:
830 		DPRINTF(("  error recovery\r\n"));
831 		break;
832 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
833 		DPRINTF(("  volatile write cache\r\n"));
834 		break;
835 	case NVME_FEAT_NUMBER_OF_QUEUES:
836 		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
837 
838 		DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
839 		        compl->cdw0 & 0xFFFF,
840 		        (compl->cdw0 >> 16) & 0xFFFF));
841 
842 		break;
843 	case NVME_FEAT_INTERRUPT_COALESCING:
844 		DPRINTF(("  interrupt coalescing\r\n"));
845 		break;
846 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
847 		DPRINTF(("  interrupt vector configuration\r\n"));
848 		break;
849 	case NVME_FEAT_WRITE_ATOMICITY:
850 		DPRINTF(("  write atomicity\r\n"));
851 		break;
852 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
853 		DPRINTF(("  async event configuration\r\n"));
854 		sc->async_ev_config = command->cdw11;
855 		break;
856 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
857 		DPRINTF(("  software progress marker\r\n"));
858 		break;
859 	case 0x0C:
860 		DPRINTF(("  autonomous power state transition\r\n"));
861 		break;
862 	default:
863 		WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
864 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
865 		return (1);
866 	}
867 
868 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
869 	return (1);
870 }
871 
872 static int
873 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
874 	struct nvme_completion* compl)
875 {
876 	DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
877 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
878 
879 	/* TODO: search for the command ID and abort it */
880 
881 	compl->cdw0 = 1;
882 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
883 	return (1);
884 }
885 
886 static int
887 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
888 	struct nvme_command* command, struct nvme_completion* compl)
889 {
890 	DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
891 
892 	/*
893 	 * TODO: raise events when they happen, based on the Set Features cmd.
894 	 * These events occur asynchronously, so only post a successful
895 	 * completion when an event matching the request becomes available.
896 	 */
897 	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
898 	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
899 	return (0);
900 }
901 
902 static void
903 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
904 {
905 	struct nvme_completion compl;
906 	struct nvme_command *cmd;
907 	struct nvme_submission_queue *sq;
908 	struct nvme_completion_queue *cq;
909 	int do_intr = 0;
910 	uint16_t sqhead;
911 
912 	DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
913 
914 	sq = &sc->submit_queues[0];
915 
916 	sqhead = atomic_load_acq_short(&sq->head);
917 
918 	if (atomic_testandset_int(&sq->busy, 1)) {
919 		DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
920 		        __func__, sqhead, sq->tail));
921 		return;
922 	}
923 
924 	DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
925 
926 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
927 		cmd = &(sq->qbase)[sqhead];
928 		compl.status = 0;
929 
930 		switch (cmd->opc) {
931 		case NVME_OPC_DELETE_IO_SQ:
932 			DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
933 			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
934 			break;
935 		case NVME_OPC_CREATE_IO_SQ:
936 			DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
937 			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
938 			break;
939 		case NVME_OPC_DELETE_IO_CQ:
940 			DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
941 			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
942 			break;
943 		case NVME_OPC_CREATE_IO_CQ:
944 			DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
945 			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
946 			break;
947 		case NVME_OPC_GET_LOG_PAGE:
948 			DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
949 			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
950 			break;
951 		case NVME_OPC_IDENTIFY:
952 			DPRINTF(("%s command IDENTIFY\r\n", __func__));
953 			do_intr |= nvme_opc_identify(sc, cmd, &compl);
954 			break;
955 		case NVME_OPC_ABORT:
956 			DPRINTF(("%s command ABORT\r\n", __func__));
957 			do_intr |= nvme_opc_abort(sc, cmd, &compl);
958 			break;
959 		case NVME_OPC_SET_FEATURES:
960 			DPRINTF(("%s command SET_FEATURES\r\n", __func__));
961 			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
962 			break;
963 		case NVME_OPC_GET_FEATURES:
964 			DPRINTF(("%s command GET_FEATURES\r\n", __func__));
965 			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
966 			break;
967 		case NVME_OPC_ASYNC_EVENT_REQUEST:
968 			DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
969 			/* XXX don't care, unhandled for now
970 			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
971 			*/
972 			break;
973 		default:
974 			WPRINTF(("0x%x command is not implemented\r\n",
975 			    cmd->opc));
976 		}
977 
978 		/* for now skip async event generation */
979 		if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
980 			struct nvme_completion *cp;
981 			int phase;
982 
983 			cq = &sc->compl_queues[0];
984 
985 			cp = &(cq->qbase)[cq->tail];
986 			cp->cdw0 = compl.cdw0;
987 			cp->sqid = 0;
988 			cp->sqhd = sqhead;
989 			cp->cid = cmd->cid;
990 
991 			phase = NVME_STATUS_GET_P(cp->status);
992 			cp->status = compl.status;
993 			pci_nvme_toggle_phase(&cp->status, phase);
994 
995 			cq->tail = (cq->tail + 1) % cq->size;
996 		}
997 		sqhead = (sqhead + 1) % sq->size;
998 	}
999 
1000 	DPRINTF(("setting sqhead %u\r\n", sqhead));
1001 	atomic_store_short(&sq->head, sqhead);
1002 	atomic_store_int(&sq->busy, 0);
1003 
1004 	if (do_intr)
1005 		pci_generate_msix(sc->nsc_pi, 0);
1006 
1007 }
1008 
1009 static int
1010 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1011 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1012 {
1013 	int iovidx;
1014 
1015 	if (req != NULL) {
1016 		/* concatenate contiguous block-iovs to minimize the number of iovs */
1017 		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1018 			iovidx = req->io_req.br_iovcnt - 1;
1019 
1020 			req->io_req.br_iov[iovidx].iov_base =
1021 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1022 			                     req->prev_gpaddr, size);
1023 
1024 			req->prev_size += size;
1025 			req->io_req.br_resid += size;
1026 
1027 			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1028 		} else {
1029 			pthread_mutex_lock(&req->mtx);
1030 
1031 			iovidx = req->io_req.br_iovcnt;
1032 			if (iovidx == NVME_MAX_BLOCKIOVS) {
1033 				int err = 0;
1034 
1035 				DPRINTF(("large I/O, doing partial req\r\n"));
1036 
1037 				iovidx = 0;
1038 				req->io_req.br_iovcnt = 0;
1039 
1040 				req->io_req.br_callback = pci_nvme_io_partial;
1041 
1042 				if (!do_write)
1043 					err = blockif_read(sc->nvstore.ctx,
1044 					                   &req->io_req);
1045 				else
1046 					err = blockif_write(sc->nvstore.ctx,
1047 					                    &req->io_req);
1048 
1049 				/* wait until the request completes before continuing */
1050 				if (err == 0)
1051 					pthread_cond_wait(&req->cv, &req->mtx);
1052 			}
1053 			if (iovidx == 0) {
1054 				req->io_req.br_offset = lba;
1055 				req->io_req.br_resid = 0;
1056 				req->io_req.br_param = req;
1057 			}
1058 
1059 			req->io_req.br_iov[iovidx].iov_base =
1060 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1061 			                     gpaddr, size);
1062 
1063 			req->io_req.br_iov[iovidx].iov_len = size;
1064 
1065 			req->prev_gpaddr = gpaddr;
1066 			req->prev_size = size;
1067 			req->io_req.br_resid += size;
1068 
1069 			req->io_req.br_iovcnt++;
1070 
1071 			pthread_mutex_unlock(&req->mtx);
1072 		}
1073 	} else {
1074 		/* RAM buffer: read/write directly */
1075 		void *p = sc->nvstore.ctx;
1076 		void *gptr;
1077 
1078 		if ((lba + size) > sc->nvstore.size) {
1079 			WPRINTF(("%s write would overflow RAM\r\n", __func__));
1080 			return (-1);
1081 		}
1082 
1083 		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1084 		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1085 		if (do_write)
1086 			memcpy(p, gptr, size);
1087 		else
1088 			memcpy(gptr, p, size);
1089 	}
1090 	return (0);
1091 }
1092 
1093 static void
1094 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1095 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1096 	uint32_t cdw0, uint16_t status, int ignore_busy)
1097 {
1098 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1099 	struct nvme_completion *compl;
1100 	int do_intr = 0;
1101 	int phase;
1102 
1103 	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1104 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1105 		 NVME_STATUS_GET_SC(status)));
1106 
1107 	pthread_mutex_lock(&cq->mtx);
1108 
1109 	assert(cq->qbase != NULL);
1110 
1111 	compl = &cq->qbase[cq->tail];
1112 
1113 	compl->sqhd = atomic_load_acq_short(&sq->head);
1114 	compl->sqid = sqid;
1115 	compl->cid = cid;
1116 
1117 	/* toggle phase */
1118 	phase = NVME_STATUS_GET_P(compl->status);
1119 	compl->status = status;
1120 	pci_nvme_toggle_phase(&compl->status, phase);
1121 
1122 	cq->tail = (cq->tail + 1) % cq->size;
1123 
1124 	if (cq->intr_en & NVME_CQ_INTEN)
1125 		do_intr = 1;
1126 
1127 	pthread_mutex_unlock(&cq->mtx);
1128 
1129 	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1130 		if (do_intr)
1131 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1132 }
1133 
1134 static void
1135 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1136 {
1137 	req->sc = NULL;
1138 	req->nvme_sq = NULL;
1139 	req->sqid = 0;
1140 
1141 	pthread_mutex_lock(&sc->mtx);
1142 
1143 	req->next = sc->ioreqs_free;
1144 	sc->ioreqs_free = req;
1145 	sc->pending_ios--;
1146 
1147 	/* when no more I/O is pending, set ready if the device has been reset/enabled */
1148 	if (sc->pending_ios == 0 &&
1149 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1150 		sc->regs.csts |= NVME_CSTS_RDY;
1151 
1152 	pthread_mutex_unlock(&sc->mtx);
1153 
1154 	sem_post(&sc->iosemlock);
1155 }
1156 
1157 static struct pci_nvme_ioreq *
1158 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1159 {
1160 	struct pci_nvme_ioreq *req = NULL;
1161 
1162 	sem_wait(&sc->iosemlock);
1163 	pthread_mutex_lock(&sc->mtx);
1164 
1165 	req = sc->ioreqs_free;
1166 	assert(req != NULL);
1167 
1168 	sc->ioreqs_free = req->next;
1169 
1170 	req->next = NULL;
1171 	req->sc = sc;
1172 
1173 	sc->pending_ios++;
1174 
1175 	pthread_mutex_unlock(&sc->mtx);
1176 
1177 	req->io_req.br_iovcnt = 0;
1178 	req->io_req.br_offset = 0;
1179 	req->io_req.br_resid = 0;
1180 	req->io_req.br_param = req;
1181 	req->prev_gpaddr = 0;
1182 	req->prev_size = 0;
1183 
1184 	return req;
1185 }
1186 
1187 static void
1188 pci_nvme_io_done(struct blockif_req *br, int err)
1189 {
1190 	struct pci_nvme_ioreq *req = br->br_param;
1191 	struct nvme_submission_queue *sq = req->nvme_sq;
1192 	uint16_t code, status;
1193 
1194 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1195 
1196 	/* TODO return correct error */
1197 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1198 	pci_nvme_status_genc(&status, code);
1199 
1200 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1201 	pci_nvme_release_ioreq(req->sc, req);
1202 }
1203 
1204 static void
1205 pci_nvme_io_partial(struct blockif_req *br, int err)
1206 {
1207 	struct pci_nvme_ioreq *req = br->br_param;
1208 
1209 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1210 
1211 	pthread_cond_signal(&req->cv);
1212 }
1213 
1214 
1215 static void
1216 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1217 {
1218 	struct nvme_submission_queue *sq;
1219 	uint16_t status;
1220 	uint16_t sqhead;
1221 	int err;
1222 
1223 	/* handle all submissions up to sq->tail index */
1224 	sq = &sc->submit_queues[idx];
1225 
1226 	if (atomic_testandset_int(&sq->busy, 1)) {
1227 		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1228 		return;
1229 	}
1230 
1231 	sqhead = atomic_load_acq_short(&sq->head);
1232 
1233 	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1234 	         idx, sqhead, sq->tail, sq->qbase));
1235 
1236 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1237 		struct nvme_command *cmd;
1238 		struct pci_nvme_ioreq *req = NULL;
1239 		uint64_t lba;
1240 		uint64_t nblocks, bytes, size, cpsz;
1241 
1242 		/* TODO: support scatter gather list handling */
1243 
1244 		cmd = &sq->qbase[sqhead];
1245 		sqhead = (sqhead + 1) % sq->size;
1246 
1247 		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1248 
1249 		if (cmd->opc == NVME_OPC_FLUSH) {
1250 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1251 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1252 			                        status, 1);
1253 
1254 			continue;
1255 		} else if (cmd->opc == 0x08) {
1256 			/* TODO: write zeroes */
1257 			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1258 			        __func__, lba, cmd->cdw12 & 0xFFFF));
1259 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1260 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1261 			                        status, 1);
1262 
1263 			continue;
1264 		}
1265 
1266 		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1267 
1268 		bytes = nblocks * sc->nvstore.sectsz;
1269 
1270 		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1271 			req = pci_nvme_get_ioreq(sc);
1272 			req->nvme_sq = sq;
1273 			req->sqid = idx;
1274 		}
1275 
1276 		/*
1277 		 * If data starts mid-page and flows into the next page, then
1278 		 * increase page count
1279 		 */
1280 
1281 		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1282 		         "(%lu-bytes)\r\n",
1283 		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1284 		         cmd->opc == NVME_OPC_WRITE ?
1285 			     "WRITE" : "READ",
1286 		         lba, nblocks, bytes));
1287 
1288 		cmd->prp1 &= ~(0x03UL);
1289 		cmd->prp2 &= ~(0x03UL);
1290 
1291 		DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
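
		/*
		 * PRP handling: PRP1 covers the transfer from its page offset
		 * to the end of that page.  If the remainder fits in a single
		 * additional page, PRP2 is that page; otherwise PRP2 points to
		 * a list of page addresses, chained through its last entry when
		 * more than NVME_PRP2_ITEMS pages are needed.
		 */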
1292 
1293 		size = bytes;
1294 		lba *= sc->nvstore.sectsz;
1295 
1296 		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1297 
1298 		if (cpsz > bytes)
1299 			cpsz = bytes;
1300 
1301 		if (req != NULL) {
1302 			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1303 			                        cmd->cdw10;
1304 			req->opc = cmd->opc;
1305 			req->cid = cmd->cid;
1306 			req->nsid = cmd->nsid;
1307 		}
1308 
1309 		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1310 		    cmd->opc == NVME_OPC_WRITE, lba);
1311 		lba += cpsz;
1312 		size -= cpsz;
1313 
1314 		if (size == 0)
1315 			goto iodone;
1316 
1317 		if (size <= PAGE_SIZE) {
1318 			/* prp2 is second (and final) page in transfer */
1319 
1320 			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1321 			    size,
1322 			    cmd->opc == NVME_OPC_WRITE,
1323 			    lba);
1324 		} else {
1325 			uint64_t *prp_list;
1326 			int i;
1327 
1328 			/* prp2 is pointer to a physical region page list */
1329 			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1330 			                            cmd->prp2, PAGE_SIZE);
1331 
1332 			i = 0;
1333 			while (size != 0) {
1334 				cpsz = MIN(size, PAGE_SIZE);
1335 
1336 				/*
1337 				 * Move to linked physical region page list
1338 				 * in last item.
1339 				 */
1340 				if (i == (NVME_PRP2_ITEMS-1) &&
1341 				    size > PAGE_SIZE) {
1342 					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1343 					prp_list = paddr_guest2host(
1344 					              sc->nsc_pi->pi_vmctx,
1345 					              prp_list[i], PAGE_SIZE);
1346 					i = 0;
1347 				}
1348 				if (prp_list[i] == 0) {
1349 					WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1350 					err = 1;
1351 					break;
1352 				}
1353 
1354 				err = pci_nvme_append_iov_req(sc, req,
1355 				    prp_list[i], cpsz,
1356 				    cmd->opc == NVME_OPC_WRITE, lba);
1357 				if (err)
1358 					break;
1359 
1360 				lba += cpsz;
1361 				size -= cpsz;
1362 				i++;
1363 			}
1364 		}
1365 
1366 iodone:
1367 		if (sc->nvstore.type == NVME_STOR_RAM) {
1368 			uint16_t code, status;
1369 
1370 			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1371 			    NVME_SC_SUCCESS;
1372 			pci_nvme_status_genc(&status, code);
1373 
1374 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1375 			                        status, 1);
1376 
1377 			continue;
1378 		}
1379 
1380 
1381 		if (err)
1382 			goto do_error;
1383 
1384 		req->io_req.br_callback = pci_nvme_io_done;
1385 
1386 		err = 0;
1387 		switch (cmd->opc) {
1388 		case NVME_OPC_READ:
1389 			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1390 			break;
1391 		case NVME_OPC_WRITE:
1392 			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1393 			break;
1394 		default:
1395 			WPRINTF(("%s unhandled io command 0x%x\r\n",
1396 				 __func__, cmd->opc));
1397 			err = 1;
1398 		}
1399 
1400 do_error:
1401 		if (err) {
1402 			uint16_t status;
1403 
1404 			pci_nvme_status_genc(&status,
1405 			    NVME_SC_DATA_TRANSFER_ERROR);
1406 
1407 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1408 			                        status, 1);
1409 			pci_nvme_release_ioreq(sc, req);
1410 		}
1411 	}
1412 
1413 	atomic_store_short(&sq->head, sqhead);
1414 	atomic_store_int(&sq->busy, 0);
1415 }
1416 
1417 static void
1418 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1419 	uint64_t idx, int is_sq, uint64_t value)
1420 {
1421 	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1422 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1423 
1424 	if (is_sq) {
1425 		atomic_store_short(&sc->submit_queues[idx].tail,
1426 		                   (uint16_t)value);
1427 
1428 		if (idx == 0) {
1429 			pci_nvme_handle_admin_cmd(sc, value);
1430 		} else {
1431 			/* submission queue; handle new entries in SQ */
1432 			if (idx > sc->num_squeues) {
1433 				WPRINTF(("%s SQ index %lu overflow from "
1434 				         "guest (max %u)\r\n",
1435 				         __func__, idx, sc->num_squeues));
1436 				return;
1437 			}
1438 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1439 		}
1440 	} else {
1441 		if (idx > sc->num_cqueues) {
1442 			WPRINTF(("%s queue index %lu overflow from "
1443 			         "guest (max %u)\r\n",
1444 			         __func__, idx, sc->num_cqueues));
1445 			return;
1446 		}
1447 
1448 		sc->compl_queues[idx].head = (uint16_t)value;
1449 	}
1450 }
1451 
1452 static void
1453 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1454 {
1455 	const char *s = iswrite ? "WRITE" : "READ";
1456 
1457 	switch (offset) {
1458 	case NVME_CR_CAP_LOW:
1459 		DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1460 		break;
1461 	case NVME_CR_CAP_HI:
1462 		DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1463 		break;
1464 	case NVME_CR_VS:
1465 		DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1466 		break;
1467 	case NVME_CR_INTMS:
1468 		DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1469 		break;
1470 	case NVME_CR_INTMC:
1471 		DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1472 		break;
1473 	case NVME_CR_CC:
1474 		DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1475 		break;
1476 	case NVME_CR_CSTS:
1477 		DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1478 		break;
1479 	case NVME_CR_NSSR:
1480 		DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1481 		break;
1482 	case NVME_CR_AQA:
1483 		DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1484 		break;
1485 	case NVME_CR_ASQ_LOW:
1486 		DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1487 		break;
1488 	case NVME_CR_ASQ_HI:
1489 		DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1490 		break;
1491 	case NVME_CR_ACQ_LOW:
1492 		DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1493 		break;
1494 	case NVME_CR_ACQ_HI:
1495 		DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1496 		break;
1497 	default:
1498 		DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1499 	}
1500 
1501 }
1502 
1503 static void
1504 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1505 	uint64_t offset, int size, uint64_t value)
1506 {
1507 	uint32_t ccreg;
1508 
1509 	if (offset >= NVME_DOORBELL_OFFSET) {
1510 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1511 		uint64_t idx = belloffset / 8; /* two 32-bit doorbells per queue */
1512 		int is_sq = (belloffset % 8) < 4;
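
		/*
		 * Illustrative layout (4-byte stride): admin SQ tail doorbell
		 * at NVME_DOORBELL_OFFSET + 0, admin CQ head at +4, I/O queue 1
		 * SQ tail at +8 and CQ head at +12, and so on.
		 */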
1513 
1514 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1515 			WPRINTF(("guest attempted an overflow write offset "
1516 			         "0x%lx, val 0x%lx in %s\r\n",
1517 			         offset, value, __func__));
1518 			return;
1519 		}
1520 
1521 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1522 		return;
1523 	}
1524 
1525 	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1526 	        offset, size, value));
1527 
1528 	if (size != 4) {
1529 		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1530 		         "val 0x%lx) to bar0 in %s\r\n",
1531 		         size, offset, value, __func__));
1532 		/* TODO: shutdown device */
1533 		return;
1534 	}
1535 
1536 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1537 
1538 	pthread_mutex_lock(&sc->mtx);
1539 
1540 	switch (offset) {
1541 	case NVME_CR_CAP_LOW:
1542 	case NVME_CR_CAP_HI:
1543 		/* readonly */
1544 		break;
1545 	case NVME_CR_VS:
1546 		/* readonly */
1547 		break;
1548 	case NVME_CR_INTMS:
1549 		/* MSI-X, so ignore */
1550 		break;
1551 	case NVME_CR_INTMC:
1552 		/* MSI-X, so ignore */
1553 		break;
1554 	case NVME_CR_CC:
1555 		ccreg = (uint32_t)value;
1556 
1557 		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1558 		         "iocqes %u\r\n",
1559 		        __func__,
1560 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1561 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1562 			 NVME_CC_GET_IOCQES(ccreg)));
1563 
1564 		if (NVME_CC_GET_SHN(ccreg)) {
1565 			/* perform shutdown - flush out data to backend */
1566 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1567 			    NVME_CSTS_REG_SHST_SHIFT);
1568 			sc->regs.csts |= NVME_SHST_COMPLETE <<
1569 			    NVME_CSTS_REG_SHST_SHIFT;
1570 		}
1571 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1572 			if (NVME_CC_GET_EN(ccreg) == 0)
1573 				/* transition 1->0 causes controller reset */
1574 				pci_nvme_reset_locked(sc);
1575 			else
1576 				pci_nvme_init_controller(ctx, sc);
1577 		}
1578 
1579 		/* Insert the iocqes, iosqes and en bits from the write */
1580 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1581 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1582 		if (NVME_CC_GET_EN(ccreg) == 0) {
1583 			/* Insert the ams, mps and css bit fields */
1584 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1585 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1586 			sc->regs.csts &= ~NVME_CSTS_RDY;
1587 		} else if (sc->pending_ios == 0) {
1588 			sc->regs.csts |= NVME_CSTS_RDY;
1589 		}
1590 		break;
1591 	case NVME_CR_CSTS:
1592 		break;
1593 	case NVME_CR_NSSR:
1594 		/* ignore writes; don't support subsystem reset */
1595 		break;
1596 	case NVME_CR_AQA:
1597 		sc->regs.aqa = (uint32_t)value;
1598 		break;
1599 	case NVME_CR_ASQ_LOW:
1600 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1601 		               (0xFFFFF000 & value);
1602 		break;
1603 	case NVME_CR_ASQ_HI:
1604 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1605 		               (value << 32);
1606 		break;
1607 	case NVME_CR_ACQ_LOW:
1608 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1609 		               (0xFFFFF000 & value);
1610 		break;
1611 	case NVME_CR_ACQ_HI:
1612 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1613 		               (value << 32);
1614 		break;
1615 	default:
1616 		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1617 		         __func__, offset, value, size));
1618 	}
1619 	pthread_mutex_unlock(&sc->mtx);
1620 }
1621 
1622 static void
1623 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1624                 int baridx, uint64_t offset, int size, uint64_t value)
1625 {
1626 	struct pci_nvme_softc* sc = pi->pi_arg;
1627 
1628 	if (baridx == pci_msix_table_bar(pi) ||
1629 	    baridx == pci_msix_pba_bar(pi)) {
1630 		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1631 		         " value 0x%lx\r\n", baridx, offset, size, value));
1632 
1633 		pci_emul_msix_twrite(pi, offset, size, value);
1634 		return;
1635 	}
1636 
1637 	switch (baridx) {
1638 	case 0:
1639 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1640 		break;
1641 
1642 	default:
1643 		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1644 		         __func__, baridx, value));
1645 	}
1646 }
1647 
1648 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1649 	uint64_t offset, int size)
1650 {
1651 	uint64_t value;
1652 
1653 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1654 
1655 	if (offset < NVME_DOORBELL_OFFSET) {
1656 		void *p = &(sc->regs);
1657 		pthread_mutex_lock(&sc->mtx);
1658 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
1659 		pthread_mutex_unlock(&sc->mtx);
1660 	} else {
1661 		value = 0;
1662 		WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
1663 	}
1664 
1665 	switch (size) {
1666 	case 1:
1667 		value &= 0xFF;
1668 		break;
1669 	case 2:
1670 		value &= 0xFFFF;
1671 		break;
1672 	case 4:
1673 		value &= 0xFFFFFFFF;
1674 		break;
1675 	}
1676 
1677 	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1678 	         offset, size, (uint32_t)value));
1679 
1680 	return (value);
1681 }
1682 
1683 
1684 
1685 static uint64_t
1686 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1687     uint64_t offset, int size)
1688 {
1689 	struct pci_nvme_softc* sc = pi->pi_arg;
1690 
1691 	if (baridx == pci_msix_table_bar(pi) ||
1692 	    baridx == pci_msix_pba_bar(pi)) {
1693 		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1694 		        baridx, offset, size));
1695 
1696 		return pci_emul_msix_tread(pi, offset, size);
1697 	}
1698 
1699 	switch (baridx) {
1700 	case 0:
1701 		return pci_nvme_read_bar_0(sc, offset, size);
1702 
1703 	default:
1704 		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1705 	}
1706 
1707 	return (0);
1708 }
1709 
1710 
1711 static int
1712 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1713 {
1714 	char bident[sizeof("XX:X:X")];
1715 	char	*uopt, *xopts, *config;
1716 	uint32_t sectsz;
1717 	int optidx;
1718 
1719 	sc->max_queues = NVME_QUEUES;
1720 	sc->max_qentries = NVME_MAX_QENTRIES;
1721 	sc->ioslots = NVME_IOSLOTS;
1722 	sc->num_squeues = sc->max_queues;
1723 	sc->num_cqueues = sc->max_queues;
1724 	sectsz = 0;
1725 
1726 	uopt = strdup(opts);
1727 	optidx = 0;
1728 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1729 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1730 	for (xopts = strtok(uopt, ",");
1731 	     xopts != NULL;
1732 	     xopts = strtok(NULL, ",")) {
1733 
1734 		if ((config = strchr(xopts, '=')) != NULL)
1735 			*config++ = '\0';
1736 
1737 		if (!strcmp("maxq", xopts)) {
1738 			sc->max_queues = atoi(config);
1739 		} else if (!strcmp("qsz", xopts)) {
1740 			sc->max_qentries = atoi(config);
1741 		} else if (!strcmp("ioslots", xopts)) {
1742 			sc->ioslots = atoi(config);
1743 		} else if (!strcmp("sectsz", xopts)) {
1744 			sectsz = atoi(config);
1745 		} else if (!strcmp("ser", xopts)) {
1746 			/*
1747 			 * This field indicates the Product Serial Number in
1748 			 * 7-bit ASCII; unused bytes should be space characters.
1749 			 * Ref: NVMe v1.3c.
1750 			 */
1751 			cpywithpad((char *)sc->ctrldata.sn,
1752 			           sizeof(sc->ctrldata.sn), config, ' ');
1753 		} else if (!strcmp("ram", xopts)) {
1754 			uint64_t sz = strtoull(&xopts[4], NULL, 10);
1755 
1756 			sc->nvstore.type = NVME_STOR_RAM;
1757 			sc->nvstore.size = sz * 1024 * 1024;
1758 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1759 			sc->nvstore.sectsz = 4096;
1760 			sc->nvstore.sectsz_bits = 12;
1761 			if (sc->nvstore.ctx == NULL) {
1762 				perror("Unable to allocate RAM");
1763 				free(uopt);
1764 				return (-1);
1765 			}
1766 		} else if (optidx == 0) {
1767 			snprintf(bident, sizeof(bident), "%d:%d",
1768 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1769 			sc->nvstore.ctx = blockif_open(xopts, bident);
1770 			if (sc->nvstore.ctx == NULL) {
1771 				perror("Could not open backing file");
1772 				free(uopt);
1773 				return (-1);
1774 			}
1775 			sc->nvstore.type = NVME_STOR_BLOCKIF;
1776 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1777 		} else {
1778 			fprintf(stderr, "Invalid option %s\n", xopts);
1779 			free(uopt);
1780 			return (-1);
1781 		}
1782 
1783 		optidx++;
1784 	}
1785 	free(uopt);
1786 
1787 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1788 		fprintf(stderr, "backing store not specified\n");
1789 		return (-1);
1790 	}
1791 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1792 		sc->nvstore.sectsz = sectsz;
1793 	else if (sc->nvstore.type != NVME_STOR_RAM)
1794 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
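	/* compute sectsz_bits = log2(sectsz), e.g. 512 -> 9, 4096 -> 12 */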
1795 	for (sc->nvstore.sectsz_bits = 9;
1796 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1797 	     sc->nvstore.sectsz_bits++);
1798 
1799 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1800 		sc->max_queues = NVME_QUEUES;
1801 
1802 	if (sc->max_qentries <= 0) {
1803 		fprintf(stderr, "Invalid qsz option\n");
1804 		return (-1);
1805 	}
1806 	if (sc->ioslots <= 0) {
1807 		fprintf(stderr, "Invalid ioslots option\n");
1808 		return (-1);
1809 	}
1810 
1811 	return (0);
1812 }
1813 
1814 static int
1815 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1816 {
1817 	struct pci_nvme_softc *sc;
1818 	uint32_t pci_membar_sz;
1819 	int	error;
1820 
1821 	error = 0;
1822 
1823 	sc = calloc(1, sizeof(struct pci_nvme_softc));
1824 	pi->pi_arg = sc;
1825 	sc->nsc_pi = pi;
1826 
1827 	error = pci_nvme_parse_opts(sc, opts);
1828 	if (error < 0)
1829 		goto done;
1830 	else
1831 		error = 0;
1832 
1833 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1834 	for (int i = 0; i < sc->ioslots; i++) {
1835 		if (i < (sc->ioslots-1))
1836 			sc->ioreqs[i].next = &sc->ioreqs[i+1];
1837 		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1838 		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1839 	}
1840 	sc->ioreqs_free = sc->ioreqs;
1841 	sc->intr_coales_aggr_thresh = 1;
1842 
1843 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1844 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1845 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1846 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1847 	pci_set_cfgdata8(pi, PCIR_PROGIF,
1848 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1849 
1850 	/* allocate size of nvme registers + doorbell space for all queues */
1851 	pci_membar_sz = sizeof(struct nvme_registers) +
1852 	                2*sizeof(uint32_t)*(sc->max_queues + 1);
1853 
1854 	DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1855 
1856 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1857 	if (error) {
1858 		WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1859 		goto done;
1860 	}
1861 
1862 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
1863 	if (error) {
1864 		WPRINTF(("%s pci add msixcap failed\r\n", __func__));
1865 		goto done;
1866 	}
1867 
1868 	pthread_mutex_init(&sc->mtx, NULL);
1869 	sem_init(&sc->iosemlock, 0, sc->ioslots);
1870 
1871 	pci_nvme_reset(sc);
1872 	pci_nvme_init_ctrldata(sc);
1873 	pci_nvme_init_nsdata(sc);
1874 
1875 	pci_lintr_request(pi);
1876 
1877 done:
1878 	return (error);
1879 }
1880 
1881 
1882 struct pci_devemu pci_de_nvme = {
1883 	.pe_emu =	"nvme",
1884 	.pe_init =	pci_nvme_init,
1885 	.pe_barwrite =	pci_nvme_write,
1886 	.pe_barread =	pci_nvme_read
1887 };
1888 PCI_EMUL_SET(pci_de_nvme);
1889