1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 #ifndef __FreeBSD__ 66 #include <endian.h> 67 #endif 68 69 #include <assert.h> 70 #include <pthread.h> 71 #include <pthread_np.h> 72 #include <semaphore.h> 73 #include <stdbool.h> 74 #include <stddef.h> 75 #include <stdint.h> 76 #include <stdio.h> 77 #include <stdlib.h> 78 #include <string.h> 79 80 #include <machine/atomic.h> 81 #include <machine/vmm.h> 82 #include <vmmapi.h> 83 84 #include <dev/nvme/nvme.h> 85 86 #include "bhyverun.h" 87 #include "block_if.h" 88 #include "config.h" 89 #include "debug.h" 90 #include "pci_emul.h" 91 92 93 static int nvme_debug = 0; 94 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 95 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) 96 97 /* defaults; can be overridden */ 98 #define NVME_MSIX_BAR 4 99 100 #define NVME_IOSLOTS 8 101 102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 103 #define NVME_MMIO_SPACE_MIN (1 << 14) 104 105 #define NVME_QUEUES 16 106 #define NVME_MAX_QENTRIES 2048 107 /* Memory Page size Minimum reported in CAP register */ 108 #define NVME_MPSMIN 0 109 /* MPSMIN converted to bytes */ 110 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 111 112 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 113 #define NVME_MDTS 9 114 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 115 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 116 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 117 118 /* This is a synthetic status code to indicate there is no status */ 119 #define NVME_NO_STATUS 0xffff 120 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 121 122 /* helpers */ 123 124 /* Convert a zero-based value into a one-based value */ 125 #define ONE_BASED(zero) ((zero) + 1) 126 /* Convert a one-based value into a zero-based value */ 127 #define ZERO_BASED(one) ((one) - 1) 128 129 /* Encode number of SQ's and CQ's for Set/Get Features */ 130 #define NVME_FEATURE_NUM_QUEUES(sc) \ 131 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 132 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 133 134 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 135 136 enum nvme_controller_register_offsets { 137 NVME_CR_CAP_LOW = 0x00, 138 NVME_CR_CAP_HI = 0x04, 139 NVME_CR_VS = 0x08, 140 NVME_CR_INTMS = 0x0c, 141 NVME_CR_INTMC = 0x10, 142 NVME_CR_CC = 0x14, 143 NVME_CR_CSTS = 0x1c, 144 NVME_CR_NSSR = 0x20, 145 NVME_CR_AQA = 0x24, 146 NVME_CR_ASQ_LOW = 0x28, 147 NVME_CR_ASQ_HI = 0x2c, 148 NVME_CR_ACQ_LOW = 0x30, 149 NVME_CR_ACQ_HI = 0x34, 150 }; 151 152 enum nvme_cmd_cdw11 { 153 NVME_CMD_CDW11_PC = 0x0001, 154 NVME_CMD_CDW11_IEN = 0x0002, 155 NVME_CMD_CDW11_IV = 0xFFFF0000, 156 }; 157 158 enum nvme_copy_dir { 159 NVME_COPY_TO_PRP, 160 NVME_COPY_FROM_PRP, 161 }; 162 163 #define NVME_CQ_INTEN 0x01 164 #define NVME_CQ_INTCOAL 0x02 165 166 struct nvme_completion_queue { 167 struct nvme_completion *qbase; 168 pthread_mutex_t mtx; 169 uint32_t size; 170 uint16_t tail; /* nvme progress */ 171 uint16_t head; /* guest progress */ 172 uint16_t intr_vec; 173 uint32_t intr_en; 174 }; 175 176 struct nvme_submission_queue { 177 struct nvme_command *qbase; 178 pthread_mutex_t mtx; 179 uint32_t size; 180 uint16_t head; /* nvme progress */ 181 uint16_t tail; /* guest progress */ 182 uint16_t cqid; /* completion queue id */ 183 int qpriority; 184 }; 185 186 enum nvme_storage_type { 187 NVME_STOR_BLOCKIF = 0, 188 NVME_STOR_RAM = 1, 189 }; 190 191 struct pci_nvme_blockstore { 192 enum nvme_storage_type type; 193 void *ctx; 194 uint64_t size; 195 uint32_t sectsz; 196 uint32_t sectsz_bits; 197 uint64_t eui64; 198 uint32_t deallocate:1; 199 }; 200 201 /* 202 * Calculate the number of additional page descriptors for guest IO requests 203 * based on the advertised Max Data Transfer (MDTS) and given the number of 204 * default iovec's in a struct blockif_req. 205 */ 206 #define MDTS_PAD_SIZE \ 207 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ 208 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 209 0 ) 210 211 struct pci_nvme_ioreq { 212 struct pci_nvme_softc *sc; 213 STAILQ_ENTRY(pci_nvme_ioreq) link; 214 struct nvme_submission_queue *nvme_sq; 215 uint16_t sqid; 216 217 /* command information */ 218 uint16_t opc; 219 uint16_t cid; 220 uint32_t nsid; 221 222 uint64_t prev_gpaddr; 223 size_t prev_size; 224 size_t bytes; 225 226 struct blockif_req io_req; 227 228 struct iovec iovpadding[MDTS_PAD_SIZE]; 229 }; 230 231 enum nvme_dsm_type { 232 /* Dataset Management bit in ONCS reflects backing storage capability */ 233 NVME_DATASET_MANAGEMENT_AUTO, 234 /* Unconditionally set Dataset Management bit in ONCS */ 235 NVME_DATASET_MANAGEMENT_ENABLE, 236 /* Unconditionally clear Dataset Management bit in ONCS */ 237 NVME_DATASET_MANAGEMENT_DISABLE, 238 }; 239 240 struct pci_nvme_softc; 241 struct nvme_feature_obj; 242 243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 244 struct nvme_feature_obj *, 245 struct nvme_command *, 246 struct nvme_completion *); 247 248 struct nvme_feature_obj { 249 uint32_t cdw11; 250 nvme_feature_cb set; 251 nvme_feature_cb get; 252 bool namespace_specific; 253 }; 254 255 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 256 257 typedef enum { 258 PCI_NVME_AE_TYPE_ERROR = 0, 259 PCI_NVME_AE_TYPE_SMART, 260 PCI_NVME_AE_TYPE_NOTICE, 261 PCI_NVME_AE_TYPE_IO_CMD = 6, 262 PCI_NVME_AE_TYPE_VENDOR = 7, 263 PCI_NVME_AE_TYPE_MAX /* Must be last */ 264 } pci_nvme_async_type; 265 266 /* Asynchronous Event Requests */ 267 struct pci_nvme_aer { 268 STAILQ_ENTRY(pci_nvme_aer) link; 269 uint16_t cid; /* Command ID of the submitted AER */ 270 }; 271 272 typedef enum { 273 PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0, 274 PCI_NVME_AE_INFO_FW_ACTIVATION, 275 PCI_NVME_AE_INFO_TELEMETRY_CHANGE, 276 PCI_NVME_AE_INFO_ANA_CHANGE, 277 PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE, 278 PCI_NVME_AE_INFO_LBA_STATUS_ALERT, 279 PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE, 280 PCI_NVME_AE_INFO_MAX, 281 } pci_nvme_async_info; 282 283 /* Asynchronous Event Notifications */ 284 struct pci_nvme_aen { 285 pci_nvme_async_type atype; 286 uint32_t event_data; 287 bool posted; 288 }; 289 290 struct pci_nvme_softc { 291 struct pci_devinst *nsc_pi; 292 293 pthread_mutex_t mtx; 294 295 struct nvme_registers regs; 296 297 struct nvme_namespace_data nsdata; 298 struct nvme_controller_data ctrldata; 299 struct nvme_error_information_entry err_log; 300 struct nvme_health_information_page health_log; 301 struct nvme_firmware_page fw_log; 302 struct nvme_ns_list ns_log; 303 304 struct pci_nvme_blockstore nvstore; 305 306 uint16_t max_qentries; /* max entries per queue */ 307 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 308 uint32_t num_cqueues; 309 uint32_t num_squeues; 310 bool num_q_is_set; /* Has host set Number of Queues */ 311 312 struct pci_nvme_ioreq *ioreqs; 313 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 314 uint32_t pending_ios; 315 uint32_t ioslots; 316 sem_t iosemlock; 317 318 /* 319 * Memory mapped Submission and Completion queues 320 * Each array includes both Admin and IO queues 321 */ 322 struct nvme_completion_queue *compl_queues; 323 struct nvme_submission_queue *submit_queues; 324 325 struct nvme_feature_obj feat[NVME_FID_MAX]; 326 327 enum nvme_dsm_type dataset_management; 328 329 /* Accounting for SMART data */ 330 __uint128_t read_data_units; 331 __uint128_t write_data_units; 332 __uint128_t read_commands; 333 __uint128_t write_commands; 334 uint32_t read_dunits_remainder; 335 uint32_t write_dunits_remainder; 336 337 STAILQ_HEAD(, pci_nvme_aer) aer_list; 338 pthread_mutex_t aer_mtx; 339 uint32_t aer_count; 340 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 341 pthread_t aen_tid; 342 pthread_mutex_t aen_mtx; 343 pthread_cond_t aen_cond; 344 }; 345 346 347 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 348 struct nvme_completion_queue *cq, 349 uint32_t cdw0, 350 uint16_t cid, 351 uint16_t sqid, 352 uint16_t status); 353 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 354 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 355 static void pci_nvme_io_done(struct blockif_req *, int); 356 357 /* Controller Configuration utils */ 358 #define NVME_CC_GET_EN(cc) \ 359 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 360 #define NVME_CC_GET_CSS(cc) \ 361 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 362 #define NVME_CC_GET_SHN(cc) \ 363 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 364 #define NVME_CC_GET_IOSQES(cc) \ 365 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 366 #define NVME_CC_GET_IOCQES(cc) \ 367 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 368 369 #define NVME_CC_WRITE_MASK \ 370 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 371 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 372 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 373 374 #define NVME_CC_NEN_WRITE_MASK \ 375 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 376 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 377 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 378 379 /* Controller Status utils */ 380 #define NVME_CSTS_GET_RDY(sts) \ 381 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 382 383 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 384 385 /* Completion Queue status word utils */ 386 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 387 #define NVME_STATUS_MASK \ 388 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 389 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 390 391 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 392 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 393 394 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 395 struct nvme_feature_obj *, 396 struct nvme_command *, 397 struct nvme_completion *); 398 static void nvme_feature_num_queues(struct pci_nvme_softc *, 399 struct nvme_feature_obj *, 400 struct nvme_command *, 401 struct nvme_completion *); 402 static void nvme_feature_iv_config(struct pci_nvme_softc *, 403 struct nvme_feature_obj *, 404 struct nvme_command *, 405 struct nvme_completion *); 406 407 static void *aen_thr(void *arg); 408 409 static __inline void 410 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 411 { 412 size_t len; 413 414 len = strnlen(src, dst_size); 415 memset(dst, pad, dst_size); 416 memcpy(dst, src, len); 417 } 418 419 static __inline void 420 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 421 { 422 423 *status &= ~NVME_STATUS_MASK; 424 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 425 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 426 } 427 428 static __inline void 429 pci_nvme_status_genc(uint16_t *status, uint16_t code) 430 { 431 432 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 433 } 434 435 /* 436 * Initialize the requested number or IO Submission and Completion Queues. 437 * Admin queues are allocated implicitly. 438 */ 439 static void 440 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 441 { 442 uint32_t i; 443 444 /* 445 * Allocate and initialize the Submission Queues 446 */ 447 if (nsq > NVME_QUEUES) { 448 WPRINTF("%s: clamping number of SQ from %u to %u", 449 __func__, nsq, NVME_QUEUES); 450 nsq = NVME_QUEUES; 451 } 452 453 sc->num_squeues = nsq; 454 455 sc->submit_queues = calloc(sc->num_squeues + 1, 456 sizeof(struct nvme_submission_queue)); 457 if (sc->submit_queues == NULL) { 458 WPRINTF("%s: SQ allocation failed", __func__); 459 sc->num_squeues = 0; 460 } else { 461 struct nvme_submission_queue *sq = sc->submit_queues; 462 463 for (i = 0; i < sc->num_squeues; i++) 464 pthread_mutex_init(&sq[i].mtx, NULL); 465 } 466 467 /* 468 * Allocate and initialize the Completion Queues 469 */ 470 if (ncq > NVME_QUEUES) { 471 WPRINTF("%s: clamping number of CQ from %u to %u", 472 __func__, ncq, NVME_QUEUES); 473 ncq = NVME_QUEUES; 474 } 475 476 sc->num_cqueues = ncq; 477 478 sc->compl_queues = calloc(sc->num_cqueues + 1, 479 sizeof(struct nvme_completion_queue)); 480 if (sc->compl_queues == NULL) { 481 WPRINTF("%s: CQ allocation failed", __func__); 482 sc->num_cqueues = 0; 483 } else { 484 struct nvme_completion_queue *cq = sc->compl_queues; 485 486 for (i = 0; i < sc->num_cqueues; i++) 487 pthread_mutex_init(&cq[i].mtx, NULL); 488 } 489 } 490 491 static void 492 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 493 { 494 struct nvme_controller_data *cd = &sc->ctrldata; 495 496 cd->vid = 0xFB5D; 497 cd->ssvid = 0x0000; 498 499 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 500 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 501 502 /* Num of submission commands that we can handle at a time (2^rab) */ 503 cd->rab = 4; 504 505 /* FreeBSD OUI */ 506 cd->ieee[0] = 0x58; 507 cd->ieee[1] = 0x9c; 508 cd->ieee[2] = 0xfc; 509 510 cd->mic = 0; 511 512 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 513 514 cd->ver = 0x00010300; 515 516 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 517 #ifndef __FreeBSD__ 518 /* 519 * Reported upstream against https://reviews.freebsd.org/D32953 520 * which introduced support for the namespace attribute changed AEN 521 * and the corresponding changed namespace log page, without setting 522 * the bit in oaes. A future sync will likely include this 523 * definition in usr/src/contrib/bhyve/dev/nvme/nvme.h once it's 524 * fixed there. 525 */ 526 #define NVME_CTRLR_DATA_OAES_NSCHANGE_SHIFT (8) 527 cd->oaes = 1 << NVME_CTRLR_DATA_OAES_NSCHANGE_SHIFT; 528 #endif 529 cd->acl = 2; 530 cd->aerl = 4; 531 532 /* Advertise 1, Read-only firmware slot */ 533 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 534 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 535 cd->lpa = 0; /* TODO: support some simple things like SMART */ 536 cd->elpe = 0; /* max error log page entries */ 537 cd->npss = 1; /* number of power states support */ 538 539 /* Warning Composite Temperature Threshold */ 540 cd->wctemp = 0x0157; 541 542 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 543 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 544 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 545 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 546 cd->nn = 1; /* number of namespaces */ 547 548 cd->oncs = 0; 549 switch (sc->dataset_management) { 550 case NVME_DATASET_MANAGEMENT_AUTO: 551 if (sc->nvstore.deallocate) 552 cd->oncs |= NVME_ONCS_DSM; 553 break; 554 case NVME_DATASET_MANAGEMENT_ENABLE: 555 cd->oncs |= NVME_ONCS_DSM; 556 break; 557 default: 558 break; 559 } 560 561 cd->fna = 0x03; 562 563 cd->power_state[0].mp = 10; 564 } 565 566 /* 567 * Calculate the CRC-16 of the given buffer 568 * See copyright attribution at top of file 569 */ 570 static uint16_t 571 crc16(uint16_t crc, const void *buffer, unsigned int len) 572 { 573 const unsigned char *cp = buffer; 574 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 575 static uint16_t const crc16_table[256] = { 576 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 577 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 578 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 579 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 580 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 581 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 582 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 583 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 584 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 585 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 586 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 587 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 588 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 589 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 590 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 591 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 592 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 593 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 594 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 595 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 596 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 597 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 598 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 599 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 600 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 601 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 602 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 603 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 604 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 605 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 606 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 607 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 608 }; 609 610 while (len--) 611 crc = (((crc >> 8) & 0xffU) ^ 612 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 613 return crc; 614 } 615 616 static void 617 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 618 struct nvme_namespace_data *nd) 619 { 620 621 /* Get capacity and block size information from backing store */ 622 nd->nsze = nvstore->size / nvstore->sectsz; 623 nd->ncap = nd->nsze; 624 nd->nuse = nd->nsze; 625 } 626 627 static void 628 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 629 struct nvme_namespace_data *nd, uint32_t nsid, 630 struct pci_nvme_blockstore *nvstore) 631 { 632 633 pci_nvme_init_nsdata_size(nvstore, nd); 634 635 if (nvstore->type == NVME_STOR_BLOCKIF) 636 nvstore->deallocate = blockif_candelete(nvstore->ctx); 637 638 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 639 nd->flbas = 0; 640 641 /* Create an EUI-64 if user did not provide one */ 642 if (nvstore->eui64 == 0) { 643 char *data = NULL; 644 uint64_t eui64 = nvstore->eui64; 645 646 asprintf(&data, "%s%u%u%u", get_config_value("name"), 647 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 648 sc->nsc_pi->pi_func); 649 650 if (data != NULL) { 651 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 652 free(data); 653 } 654 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 655 } 656 be64enc(nd->eui64, nvstore->eui64); 657 658 /* LBA data-sz = 2^lbads */ 659 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 660 } 661 662 static void 663 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 664 { 665 666 memset(&sc->err_log, 0, sizeof(sc->err_log)); 667 memset(&sc->health_log, 0, sizeof(sc->health_log)); 668 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 669 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 670 671 /* Set read/write remainder to round up according to spec */ 672 sc->read_dunits_remainder = 999; 673 sc->write_dunits_remainder = 999; 674 675 /* Set nominal Health values checked by implementations */ 676 sc->health_log.temperature = 310; 677 sc->health_log.available_spare = 100; 678 sc->health_log.available_spare_threshold = 10; 679 } 680 681 static void 682 pci_nvme_init_features(struct pci_nvme_softc *sc) 683 { 684 685 sc->feat[0].set = nvme_feature_invalid_cb; 686 sc->feat[0].get = nvme_feature_invalid_cb; 687 688 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true; 689 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true; 690 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues; 691 sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set = 692 nvme_feature_iv_config; 693 /* Enable all AENs by default */ 694 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f; 695 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get = 696 nvme_feature_invalid_cb; 697 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get = 698 nvme_feature_invalid_cb; 699 } 700 701 static void 702 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 703 { 704 705 STAILQ_INIT(&sc->aer_list); 706 sc->aer_count = 0; 707 } 708 709 static void 710 pci_nvme_aer_init(struct pci_nvme_softc *sc) 711 { 712 713 pthread_mutex_init(&sc->aer_mtx, NULL); 714 pci_nvme_aer_reset(sc); 715 } 716 717 static void 718 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 719 { 720 struct pci_nvme_aer *aer = NULL; 721 722 pthread_mutex_lock(&sc->aer_mtx); 723 while (!STAILQ_EMPTY(&sc->aer_list)) { 724 aer = STAILQ_FIRST(&sc->aer_list); 725 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 726 free(aer); 727 } 728 pthread_mutex_unlock(&sc->aer_mtx); 729 730 pci_nvme_aer_reset(sc); 731 } 732 733 static bool 734 pci_nvme_aer_available(struct pci_nvme_softc *sc) 735 { 736 737 return (sc->aer_count != 0); 738 } 739 740 static bool 741 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 742 { 743 struct nvme_controller_data *cd = &sc->ctrldata; 744 745 /* AERL is a zero based value while aer_count is one's based */ 746 return (sc->aer_count == (cd->aerl + 1)); 747 } 748 749 /* 750 * Add an Async Event Request 751 * 752 * Stores an AER to be returned later if the Controller needs to notify the 753 * host of an event. 754 * Note that while the NVMe spec doesn't require Controllers to return AER's 755 * in order, this implementation does preserve the order. 756 */ 757 static int 758 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 759 { 760 struct pci_nvme_aer *aer = NULL; 761 762 if (pci_nvme_aer_limit_reached(sc)) 763 return (-1); 764 765 aer = calloc(1, sizeof(struct pci_nvme_aer)); 766 if (aer == NULL) 767 return (-1); 768 769 /* Save the Command ID for use in the completion message */ 770 aer->cid = cid; 771 772 pthread_mutex_lock(&sc->aer_mtx); 773 sc->aer_count++; 774 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 775 pthread_mutex_unlock(&sc->aer_mtx); 776 777 return (0); 778 } 779 780 /* 781 * Get an Async Event Request structure 782 * 783 * Returns a pointer to an AER previously submitted by the host or NULL if 784 * no AER's exist. Caller is responsible for freeing the returned struct. 785 */ 786 static struct pci_nvme_aer * 787 pci_nvme_aer_get(struct pci_nvme_softc *sc) 788 { 789 struct pci_nvme_aer *aer = NULL; 790 791 pthread_mutex_lock(&sc->aer_mtx); 792 aer = STAILQ_FIRST(&sc->aer_list); 793 if (aer != NULL) { 794 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 795 sc->aer_count--; 796 } 797 pthread_mutex_unlock(&sc->aer_mtx); 798 799 return (aer); 800 } 801 802 static void 803 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 804 { 805 uint32_t atype; 806 807 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 808 809 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 810 sc->aen[atype].atype = atype; 811 } 812 } 813 814 static void 815 pci_nvme_aen_init(struct pci_nvme_softc *sc) 816 { 817 char nstr[80]; 818 819 pci_nvme_aen_reset(sc); 820 821 pthread_mutex_init(&sc->aen_mtx, NULL); 822 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 823 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 824 sc->nsc_pi->pi_func); 825 pthread_set_name_np(sc->aen_tid, nstr); 826 } 827 828 static void 829 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 830 { 831 832 pci_nvme_aen_reset(sc); 833 } 834 835 /* Notify the AEN thread of pending work */ 836 static void 837 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 838 { 839 840 pthread_cond_signal(&sc->aen_cond); 841 } 842 843 /* 844 * Post an Asynchronous Event Notification 845 */ 846 static int32_t 847 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 848 uint32_t event_data) 849 { 850 struct pci_nvme_aen *aen; 851 852 if (atype >= PCI_NVME_AE_TYPE_MAX) { 853 return(EINVAL); 854 } 855 856 pthread_mutex_lock(&sc->aen_mtx); 857 aen = &sc->aen[atype]; 858 859 /* Has the controller already posted an event of this type? */ 860 if (aen->posted) { 861 pthread_mutex_unlock(&sc->aen_mtx); 862 return(EALREADY); 863 } 864 865 aen->event_data = event_data; 866 aen->posted = true; 867 pthread_mutex_unlock(&sc->aen_mtx); 868 869 pci_nvme_aen_notify(sc); 870 871 return(0); 872 } 873 874 static void 875 pci_nvme_aen_process(struct pci_nvme_softc *sc) 876 { 877 struct pci_nvme_aer *aer; 878 struct pci_nvme_aen *aen; 879 pci_nvme_async_type atype; 880 uint32_t mask; 881 uint16_t status; 882 uint8_t lid; 883 884 #ifndef __FreeBSD__ 885 lid = 0; 886 #endif 887 888 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 889 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 890 aen = &sc->aen[atype]; 891 /* Previous iterations may have depleted the available AER's */ 892 if (!pci_nvme_aer_available(sc)) { 893 DPRINTF("%s: no AER", __func__); 894 break; 895 } 896 897 if (!aen->posted) { 898 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 899 continue; 900 } 901 902 status = NVME_SC_SUCCESS; 903 904 /* Is the event masked? */ 905 mask = 906 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 907 908 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 909 switch (atype) { 910 case PCI_NVME_AE_TYPE_ERROR: 911 lid = NVME_LOG_ERROR; 912 break; 913 case PCI_NVME_AE_TYPE_SMART: 914 mask &= 0xff; 915 if ((mask & aen->event_data) == 0) 916 continue; 917 lid = NVME_LOG_HEALTH_INFORMATION; 918 break; 919 case PCI_NVME_AE_TYPE_NOTICE: 920 if (aen->event_data >= PCI_NVME_AE_INFO_MAX) { 921 EPRINTLN("%s unknown AEN notice type %u", 922 __func__, aen->event_data); 923 status = NVME_SC_INTERNAL_DEVICE_ERROR; 924 break; 925 } 926 mask >>= 8; 927 if (((1 << aen->event_data) & mask) == 0) 928 continue; 929 switch (aen->event_data) { 930 case PCI_NVME_AE_INFO_NS_ATTR_CHANGED: 931 lid = NVME_LOG_CHANGED_NAMESPACE; 932 break; 933 case PCI_NVME_AE_INFO_FW_ACTIVATION: 934 lid = NVME_LOG_FIRMWARE_SLOT; 935 break; 936 case PCI_NVME_AE_INFO_TELEMETRY_CHANGE: 937 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 938 break; 939 case PCI_NVME_AE_INFO_ANA_CHANGE: 940 lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling 941 break; 942 case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE: 943 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 944 break; 945 case PCI_NVME_AE_INFO_LBA_STATUS_ALERT: 946 lid = NVME_LOG_LBA_STATUS_INFORMATION; 947 break; 948 case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE: 949 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 950 break; 951 default: 952 lid = 0; 953 } 954 break; 955 default: 956 /* bad type?!? */ 957 EPRINTLN("%s unknown AEN type %u", __func__, atype); 958 status = NVME_SC_INTERNAL_DEVICE_ERROR; 959 break; 960 } 961 962 aer = pci_nvme_aer_get(sc); 963 assert(aer != NULL); 964 965 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 966 pci_nvme_cq_update(sc, &sc->compl_queues[0], 967 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 968 aer->cid, 969 0, /* SQID */ 970 status); 971 972 aen->event_data = 0; 973 aen->posted = false; 974 975 pci_generate_msix(sc->nsc_pi, 0); 976 } 977 } 978 979 static void * 980 aen_thr(void *arg) 981 { 982 struct pci_nvme_softc *sc; 983 984 sc = arg; 985 986 pthread_mutex_lock(&sc->aen_mtx); 987 for (;;) { 988 pci_nvme_aen_process(sc); 989 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 990 } 991 #ifdef __FreeBSD__ 992 pthread_mutex_unlock(&sc->aen_mtx); 993 994 pthread_exit(NULL); 995 #endif 996 return (NULL); 997 } 998 999 static void 1000 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1001 { 1002 uint32_t i; 1003 1004 DPRINTF("%s", __func__); 1005 1006 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1007 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1008 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1009 1010 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1011 1012 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 1013 1014 sc->regs.cc = 0; 1015 sc->regs.csts = 0; 1016 1017 assert(sc->submit_queues != NULL); 1018 1019 for (i = 0; i < sc->num_squeues + 1; i++) { 1020 sc->submit_queues[i].qbase = NULL; 1021 sc->submit_queues[i].size = 0; 1022 sc->submit_queues[i].cqid = 0; 1023 sc->submit_queues[i].tail = 0; 1024 sc->submit_queues[i].head = 0; 1025 } 1026 1027 assert(sc->compl_queues != NULL); 1028 1029 for (i = 0; i < sc->num_cqueues + 1; i++) { 1030 sc->compl_queues[i].qbase = NULL; 1031 sc->compl_queues[i].size = 0; 1032 sc->compl_queues[i].tail = 0; 1033 sc->compl_queues[i].head = 0; 1034 } 1035 1036 sc->num_q_is_set = false; 1037 1038 pci_nvme_aer_destroy(sc); 1039 pci_nvme_aen_destroy(sc); 1040 } 1041 1042 static void 1043 pci_nvme_reset(struct pci_nvme_softc *sc) 1044 { 1045 pthread_mutex_lock(&sc->mtx); 1046 pci_nvme_reset_locked(sc); 1047 pthread_mutex_unlock(&sc->mtx); 1048 } 1049 1050 static void 1051 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1052 { 1053 uint16_t acqs, asqs; 1054 1055 DPRINTF("%s", __func__); 1056 1057 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 1058 sc->submit_queues[0].size = asqs; 1059 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1060 sizeof(struct nvme_command) * asqs); 1061 1062 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1063 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1064 1065 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1066 NVME_AQA_REG_ACQS_MASK) + 1; 1067 sc->compl_queues[0].size = acqs; 1068 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1069 sizeof(struct nvme_completion) * acqs); 1070 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1071 1072 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1073 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1074 } 1075 1076 static int 1077 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1078 size_t len, enum nvme_copy_dir dir) 1079 { 1080 uint8_t *p; 1081 size_t bytes; 1082 1083 if (len > (8 * 1024)) { 1084 return (-1); 1085 } 1086 1087 /* Copy from the start of prp1 to the end of the physical page */ 1088 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1089 bytes = MIN(bytes, len); 1090 1091 p = vm_map_gpa(ctx, prp1, bytes); 1092 if (p == NULL) { 1093 return (-1); 1094 } 1095 1096 if (dir == NVME_COPY_TO_PRP) 1097 memcpy(p, b, bytes); 1098 else 1099 memcpy(b, p, bytes); 1100 1101 b += bytes; 1102 1103 len -= bytes; 1104 if (len == 0) { 1105 return (0); 1106 } 1107 1108 len = MIN(len, PAGE_SIZE); 1109 1110 p = vm_map_gpa(ctx, prp2, len); 1111 if (p == NULL) { 1112 return (-1); 1113 } 1114 1115 if (dir == NVME_COPY_TO_PRP) 1116 memcpy(p, b, len); 1117 else 1118 memcpy(b, p, len); 1119 1120 return (0); 1121 } 1122 1123 /* 1124 * Write a Completion Queue Entry update 1125 * 1126 * Write the completion and update the doorbell value 1127 */ 1128 static void 1129 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1130 struct nvme_completion_queue *cq, 1131 uint32_t cdw0, 1132 uint16_t cid, 1133 uint16_t sqid, 1134 uint16_t status) 1135 { 1136 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1137 struct nvme_completion *cqe; 1138 1139 assert(cq->qbase != NULL); 1140 1141 pthread_mutex_lock(&cq->mtx); 1142 1143 cqe = &cq->qbase[cq->tail]; 1144 1145 /* Flip the phase bit */ 1146 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1147 1148 cqe->cdw0 = cdw0; 1149 cqe->sqhd = sq->head; 1150 cqe->sqid = sqid; 1151 cqe->cid = cid; 1152 cqe->status = status; 1153 1154 cq->tail++; 1155 if (cq->tail >= cq->size) { 1156 cq->tail = 0; 1157 } 1158 1159 pthread_mutex_unlock(&cq->mtx); 1160 } 1161 1162 static int 1163 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1164 struct nvme_completion* compl) 1165 { 1166 uint16_t qid = command->cdw10 & 0xffff; 1167 1168 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1169 if (qid == 0 || qid > sc->num_squeues || 1170 (sc->submit_queues[qid].qbase == NULL)) { 1171 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1172 __func__, qid, sc->num_squeues); 1173 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1174 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1175 return (1); 1176 } 1177 1178 sc->submit_queues[qid].qbase = NULL; 1179 sc->submit_queues[qid].cqid = 0; 1180 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1181 return (1); 1182 } 1183 1184 static int 1185 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1186 struct nvme_completion* compl) 1187 { 1188 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1189 uint16_t qid = command->cdw10 & 0xffff; 1190 struct nvme_submission_queue *nsq; 1191 1192 if ((qid == 0) || (qid > sc->num_squeues) || 1193 (sc->submit_queues[qid].qbase != NULL)) { 1194 WPRINTF("%s queue index %u > num_squeues %u", 1195 __func__, qid, sc->num_squeues); 1196 pci_nvme_status_tc(&compl->status, 1197 NVME_SCT_COMMAND_SPECIFIC, 1198 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1199 return (1); 1200 } 1201 1202 nsq = &sc->submit_queues[qid]; 1203 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1204 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1205 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1206 /* 1207 * Queues must specify at least two entries 1208 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1209 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1210 */ 1211 pci_nvme_status_tc(&compl->status, 1212 NVME_SCT_COMMAND_SPECIFIC, 1213 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1214 return (1); 1215 } 1216 nsq->head = nsq->tail = 0; 1217 1218 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1219 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1220 pci_nvme_status_tc(&compl->status, 1221 NVME_SCT_COMMAND_SPECIFIC, 1222 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1223 return (1); 1224 } 1225 1226 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1227 pci_nvme_status_tc(&compl->status, 1228 NVME_SCT_COMMAND_SPECIFIC, 1229 NVME_SC_COMPLETION_QUEUE_INVALID); 1230 return (1); 1231 } 1232 1233 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1234 1235 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1236 sizeof(struct nvme_command) * (size_t)nsq->size); 1237 1238 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1239 qid, nsq->size, nsq->qbase, nsq->cqid); 1240 1241 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1242 1243 DPRINTF("%s completed creating IOSQ qid %u", 1244 __func__, qid); 1245 } else { 1246 /* 1247 * Guest sent non-cont submission queue request. 1248 * This setting is unsupported by this emulation. 1249 */ 1250 WPRINTF("%s unsupported non-contig (list-based) " 1251 "create i/o submission queue", __func__); 1252 1253 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1254 } 1255 return (1); 1256 } 1257 1258 static int 1259 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1260 struct nvme_completion* compl) 1261 { 1262 uint16_t qid = command->cdw10 & 0xffff; 1263 uint16_t sqid; 1264 1265 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1266 if (qid == 0 || qid > sc->num_cqueues || 1267 (sc->compl_queues[qid].qbase == NULL)) { 1268 WPRINTF("%s queue index %u / num_cqueues %u", 1269 __func__, qid, sc->num_cqueues); 1270 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1271 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1272 return (1); 1273 } 1274 1275 /* Deleting an Active CQ is an error */ 1276 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1277 if (sc->submit_queues[sqid].cqid == qid) { 1278 pci_nvme_status_tc(&compl->status, 1279 NVME_SCT_COMMAND_SPECIFIC, 1280 NVME_SC_INVALID_QUEUE_DELETION); 1281 return (1); 1282 } 1283 1284 sc->compl_queues[qid].qbase = NULL; 1285 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1286 return (1); 1287 } 1288 1289 static int 1290 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1291 struct nvme_completion* compl) 1292 { 1293 struct nvme_completion_queue *ncq; 1294 uint16_t qid = command->cdw10 & 0xffff; 1295 1296 /* Only support Physically Contiguous queues */ 1297 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1298 WPRINTF("%s unsupported non-contig (list-based) " 1299 "create i/o completion queue", 1300 __func__); 1301 1302 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1303 return (1); 1304 } 1305 1306 if ((qid == 0) || (qid > sc->num_cqueues) || 1307 (sc->compl_queues[qid].qbase != NULL)) { 1308 WPRINTF("%s queue index %u > num_cqueues %u", 1309 __func__, qid, sc->num_cqueues); 1310 pci_nvme_status_tc(&compl->status, 1311 NVME_SCT_COMMAND_SPECIFIC, 1312 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1313 return (1); 1314 } 1315 1316 ncq = &sc->compl_queues[qid]; 1317 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1318 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1319 if (ncq->intr_vec > (sc->max_queues + 1)) { 1320 pci_nvme_status_tc(&compl->status, 1321 NVME_SCT_COMMAND_SPECIFIC, 1322 NVME_SC_INVALID_INTERRUPT_VECTOR); 1323 return (1); 1324 } 1325 1326 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1327 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1328 /* 1329 * Queues must specify at least two entries 1330 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1331 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1332 */ 1333 pci_nvme_status_tc(&compl->status, 1334 NVME_SCT_COMMAND_SPECIFIC, 1335 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1336 return (1); 1337 } 1338 ncq->head = ncq->tail = 0; 1339 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1340 command->prp1, 1341 sizeof(struct nvme_command) * (size_t)ncq->size); 1342 1343 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1344 1345 1346 return (1); 1347 } 1348 1349 static int 1350 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1351 struct nvme_completion* compl) 1352 { 1353 uint32_t logsize; 1354 uint8_t logpage = command->cdw10 & 0xFF; 1355 1356 #ifndef __FreeBSD__ 1357 logsize = 0; 1358 #endif 1359 1360 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1361 1362 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1363 1364 /* 1365 * Command specifies the number of dwords to return in fields NUMDU 1366 * and NUMDL. This is a zero-based value. 1367 */ 1368 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1369 logsize *= sizeof(uint32_t); 1370 1371 switch (logpage) { 1372 case NVME_LOG_ERROR: 1373 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1374 command->prp2, (uint8_t *)&sc->err_log, 1375 MIN(logsize, sizeof(sc->err_log)), 1376 NVME_COPY_TO_PRP); 1377 break; 1378 case NVME_LOG_HEALTH_INFORMATION: 1379 pthread_mutex_lock(&sc->mtx); 1380 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1381 sizeof(sc->health_log.data_units_read)); 1382 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1383 sizeof(sc->health_log.data_units_written)); 1384 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1385 sizeof(sc->health_log.host_read_commands)); 1386 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1387 sizeof(sc->health_log.host_write_commands)); 1388 pthread_mutex_unlock(&sc->mtx); 1389 1390 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1391 command->prp2, (uint8_t *)&sc->health_log, 1392 MIN(logsize, sizeof(sc->health_log)), 1393 NVME_COPY_TO_PRP); 1394 break; 1395 case NVME_LOG_FIRMWARE_SLOT: 1396 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1397 command->prp2, (uint8_t *)&sc->fw_log, 1398 MIN(logsize, sizeof(sc->fw_log)), 1399 NVME_COPY_TO_PRP); 1400 break; 1401 case NVME_LOG_CHANGED_NAMESPACE: 1402 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1403 command->prp2, (uint8_t *)&sc->ns_log, 1404 MIN(logsize, sizeof(sc->ns_log)), 1405 NVME_COPY_TO_PRP); 1406 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1407 break; 1408 default: 1409 DPRINTF("%s get log page %x command not supported", 1410 __func__, logpage); 1411 1412 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1413 NVME_SC_INVALID_LOG_PAGE); 1414 } 1415 1416 return (1); 1417 } 1418 1419 static int 1420 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1421 struct nvme_completion* compl) 1422 { 1423 void *dest; 1424 uint16_t status; 1425 1426 #ifndef __FreeBSD__ 1427 status = 0; 1428 #endif 1429 1430 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1431 command->cdw10 & 0xFF, command->nsid); 1432 1433 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1434 1435 switch (command->cdw10 & 0xFF) { 1436 case 0x00: /* return Identify Namespace data structure */ 1437 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1438 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1439 NVME_COPY_TO_PRP); 1440 break; 1441 case 0x01: /* return Identify Controller data structure */ 1442 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1443 command->prp2, (uint8_t *)&sc->ctrldata, 1444 sizeof(sc->ctrldata), 1445 NVME_COPY_TO_PRP); 1446 break; 1447 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1448 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1449 sizeof(uint32_t) * 1024); 1450 /* All unused entries shall be zero */ 1451 bzero(dest, sizeof(uint32_t) * 1024); 1452 ((uint32_t *)dest)[0] = 1; 1453 break; 1454 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1455 if (command->nsid != 1) { 1456 pci_nvme_status_genc(&status, 1457 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1458 break; 1459 } 1460 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1461 sizeof(uint32_t) * 1024); 1462 /* All bytes after the descriptor shall be zero */ 1463 bzero(dest, sizeof(uint32_t) * 1024); 1464 1465 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1466 ((uint8_t *)dest)[0] = 1; 1467 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1468 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1469 break; 1470 default: 1471 DPRINTF("%s unsupported identify command requested 0x%x", 1472 __func__, command->cdw10 & 0xFF); 1473 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1474 break; 1475 } 1476 1477 compl->status = status; 1478 return (1); 1479 } 1480 1481 static const char * 1482 nvme_fid_to_name(uint8_t fid) 1483 { 1484 const char *name; 1485 1486 switch (fid) { 1487 case NVME_FEAT_ARBITRATION: 1488 name = "Arbitration"; 1489 break; 1490 case NVME_FEAT_POWER_MANAGEMENT: 1491 name = "Power Management"; 1492 break; 1493 case NVME_FEAT_LBA_RANGE_TYPE: 1494 name = "LBA Range Type"; 1495 break; 1496 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1497 name = "Temperature Threshold"; 1498 break; 1499 case NVME_FEAT_ERROR_RECOVERY: 1500 name = "Error Recovery"; 1501 break; 1502 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1503 name = "Volatile Write Cache"; 1504 break; 1505 case NVME_FEAT_NUMBER_OF_QUEUES: 1506 name = "Number of Queues"; 1507 break; 1508 case NVME_FEAT_INTERRUPT_COALESCING: 1509 name = "Interrupt Coalescing"; 1510 break; 1511 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1512 name = "Interrupt Vector Configuration"; 1513 break; 1514 case NVME_FEAT_WRITE_ATOMICITY: 1515 name = "Write Atomicity Normal"; 1516 break; 1517 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1518 name = "Asynchronous Event Configuration"; 1519 break; 1520 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1521 name = "Autonomous Power State Transition"; 1522 break; 1523 case NVME_FEAT_HOST_MEMORY_BUFFER: 1524 name = "Host Memory Buffer"; 1525 break; 1526 case NVME_FEAT_TIMESTAMP: 1527 name = "Timestamp"; 1528 break; 1529 case NVME_FEAT_KEEP_ALIVE_TIMER: 1530 name = "Keep Alive Timer"; 1531 break; 1532 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1533 name = "Host Controlled Thermal Management"; 1534 break; 1535 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1536 name = "Non-Operation Power State Config"; 1537 break; 1538 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1539 name = "Read Recovery Level Config"; 1540 break; 1541 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1542 name = "Predictable Latency Mode Config"; 1543 break; 1544 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1545 name = "Predictable Latency Mode Window"; 1546 break; 1547 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1548 name = "LBA Status Information Report Interval"; 1549 break; 1550 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1551 name = "Host Behavior Support"; 1552 break; 1553 case NVME_FEAT_SANITIZE_CONFIG: 1554 name = "Sanitize Config"; 1555 break; 1556 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1557 name = "Endurance Group Event Configuration"; 1558 break; 1559 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1560 name = "Software Progress Marker"; 1561 break; 1562 case NVME_FEAT_HOST_IDENTIFIER: 1563 name = "Host Identifier"; 1564 break; 1565 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1566 name = "Reservation Notification Mask"; 1567 break; 1568 case NVME_FEAT_RESERVATION_PERSISTENCE: 1569 name = "Reservation Persistence"; 1570 break; 1571 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1572 name = "Namespace Write Protection Config"; 1573 break; 1574 default: 1575 name = "Unknown"; 1576 break; 1577 } 1578 1579 return (name); 1580 } 1581 1582 static void 1583 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1584 struct nvme_feature_obj *feat, 1585 struct nvme_command *command, 1586 struct nvme_completion *compl) 1587 { 1588 1589 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1590 } 1591 1592 static void 1593 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1594 struct nvme_feature_obj *feat, 1595 struct nvme_command *command, 1596 struct nvme_completion *compl) 1597 { 1598 uint32_t i; 1599 uint32_t cdw11 = command->cdw11; 1600 uint16_t iv; 1601 bool cd; 1602 1603 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1604 1605 iv = cdw11 & 0xffff; 1606 cd = cdw11 & (1 << 16); 1607 1608 if (iv > (sc->max_queues + 1)) { 1609 return; 1610 } 1611 1612 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1613 if ((iv == 0) && !cd) 1614 return; 1615 1616 /* Requested Interrupt Vector must be used by a CQ */ 1617 for (i = 0; i < sc->num_cqueues + 1; i++) { 1618 if (sc->compl_queues[i].intr_vec == iv) { 1619 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1620 } 1621 } 1622 1623 } 1624 1625 static void 1626 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1627 struct nvme_feature_obj *feat, 1628 struct nvme_command *command, 1629 struct nvme_completion *compl) 1630 { 1631 uint16_t nqr; /* Number of Queues Requested */ 1632 1633 if (sc->num_q_is_set) { 1634 WPRINTF("%s: Number of Queues already set", __func__); 1635 pci_nvme_status_genc(&compl->status, 1636 NVME_SC_COMMAND_SEQUENCE_ERROR); 1637 return; 1638 } 1639 1640 nqr = command->cdw11 & 0xFFFF; 1641 if (nqr == 0xffff) { 1642 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1643 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1644 return; 1645 } 1646 1647 sc->num_squeues = ONE_BASED(nqr); 1648 if (sc->num_squeues > sc->max_queues) { 1649 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1650 sc->max_queues); 1651 sc->num_squeues = sc->max_queues; 1652 } 1653 1654 nqr = (command->cdw11 >> 16) & 0xFFFF; 1655 if (nqr == 0xffff) { 1656 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1657 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1658 return; 1659 } 1660 1661 sc->num_cqueues = ONE_BASED(nqr); 1662 if (sc->num_cqueues > sc->max_queues) { 1663 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1664 sc->max_queues); 1665 sc->num_cqueues = sc->max_queues; 1666 } 1667 1668 /* Patch the command value which will be saved on callback's return */ 1669 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1670 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1671 1672 sc->num_q_is_set = true; 1673 } 1674 1675 static int 1676 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1677 struct nvme_completion *compl) 1678 { 1679 struct nvme_feature_obj *feat; 1680 uint32_t nsid = command->nsid; 1681 uint8_t fid = command->cdw10 & 0xFF; 1682 1683 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1684 1685 if (fid >= NVME_FID_MAX) { 1686 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1687 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1688 return (1); 1689 } 1690 feat = &sc->feat[fid]; 1691 1692 if (!feat->namespace_specific && 1693 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1694 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1695 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1696 return (1); 1697 } 1698 1699 compl->cdw0 = 0; 1700 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1701 1702 if (feat->set) 1703 feat->set(sc, feat, command, compl); 1704 1705 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1706 if (compl->status == NVME_SC_SUCCESS) { 1707 feat->cdw11 = command->cdw11; 1708 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1709 (command->cdw11 != 0)) 1710 pci_nvme_aen_notify(sc); 1711 } 1712 1713 return (0); 1714 } 1715 1716 static int 1717 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1718 struct nvme_completion* compl) 1719 { 1720 struct nvme_feature_obj *feat; 1721 uint8_t fid = command->cdw10 & 0xFF; 1722 1723 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1724 1725 if (fid >= NVME_FID_MAX) { 1726 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1727 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1728 return (1); 1729 } 1730 1731 compl->cdw0 = 0; 1732 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1733 1734 feat = &sc->feat[fid]; 1735 if (feat->get) { 1736 feat->get(sc, feat, command, compl); 1737 } 1738 1739 if (compl->status == NVME_SC_SUCCESS) { 1740 compl->cdw0 = feat->cdw11; 1741 } 1742 1743 return (0); 1744 } 1745 1746 static int 1747 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1748 struct nvme_completion* compl) 1749 { 1750 uint8_t ses, lbaf, pi; 1751 1752 /* Only supports Secure Erase Setting - User Data Erase */ 1753 ses = (command->cdw10 >> 9) & 0x7; 1754 if (ses > 0x1) { 1755 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1756 return (1); 1757 } 1758 1759 /* Only supports a single LBA Format */ 1760 lbaf = command->cdw10 & 0xf; 1761 if (lbaf != 0) { 1762 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1763 NVME_SC_INVALID_FORMAT); 1764 return (1); 1765 } 1766 1767 /* Doesn't support Protection Infomation */ 1768 pi = (command->cdw10 >> 5) & 0x7; 1769 if (pi != 0) { 1770 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1771 return (1); 1772 } 1773 1774 if (sc->nvstore.type == NVME_STOR_RAM) { 1775 if (sc->nvstore.ctx) 1776 free(sc->nvstore.ctx); 1777 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1778 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1779 } else { 1780 struct pci_nvme_ioreq *req; 1781 int err; 1782 1783 req = pci_nvme_get_ioreq(sc); 1784 if (req == NULL) { 1785 pci_nvme_status_genc(&compl->status, 1786 NVME_SC_INTERNAL_DEVICE_ERROR); 1787 WPRINTF("%s: unable to allocate IO req", __func__); 1788 return (1); 1789 } 1790 req->nvme_sq = &sc->submit_queues[0]; 1791 req->sqid = 0; 1792 req->opc = command->opc; 1793 req->cid = command->cid; 1794 req->nsid = command->nsid; 1795 1796 req->io_req.br_offset = 0; 1797 req->io_req.br_resid = sc->nvstore.size; 1798 req->io_req.br_callback = pci_nvme_io_done; 1799 1800 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1801 if (err) { 1802 pci_nvme_status_genc(&compl->status, 1803 NVME_SC_INTERNAL_DEVICE_ERROR); 1804 pci_nvme_release_ioreq(sc, req); 1805 } 1806 } 1807 1808 return (1); 1809 } 1810 1811 static int 1812 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1813 struct nvme_completion* compl) 1814 { 1815 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1816 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1817 1818 /* TODO: search for the command ID and abort it */ 1819 1820 compl->cdw0 = 1; 1821 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1822 return (1); 1823 } 1824 1825 static int 1826 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1827 struct nvme_command* command, struct nvme_completion* compl) 1828 { 1829 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 1830 sc->aer_count, sc->ctrldata.aerl, command->cid); 1831 1832 /* Don't exceed the Async Event Request Limit (AERL). */ 1833 if (pci_nvme_aer_limit_reached(sc)) { 1834 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1835 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1836 return (1); 1837 } 1838 1839 if (pci_nvme_aer_add(sc, command->cid)) { 1840 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1841 NVME_SC_INTERNAL_DEVICE_ERROR); 1842 return (1); 1843 } 1844 1845 /* 1846 * Raise events when they happen based on the Set Features cmd. 1847 * These events happen async, so only set completion successful if 1848 * there is an event reflective of the request to get event. 1849 */ 1850 compl->status = NVME_NO_STATUS; 1851 pci_nvme_aen_notify(sc); 1852 1853 return (0); 1854 } 1855 1856 static void 1857 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1858 { 1859 struct nvme_completion compl; 1860 struct nvme_command *cmd; 1861 struct nvme_submission_queue *sq; 1862 struct nvme_completion_queue *cq; 1863 uint16_t sqhead; 1864 1865 DPRINTF("%s index %u", __func__, (uint32_t)value); 1866 1867 sq = &sc->submit_queues[0]; 1868 cq = &sc->compl_queues[0]; 1869 1870 pthread_mutex_lock(&sq->mtx); 1871 1872 sqhead = sq->head; 1873 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 1874 1875 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1876 cmd = &(sq->qbase)[sqhead]; 1877 compl.cdw0 = 0; 1878 compl.status = 0; 1879 1880 switch (cmd->opc) { 1881 case NVME_OPC_DELETE_IO_SQ: 1882 DPRINTF("%s command DELETE_IO_SQ", __func__); 1883 nvme_opc_delete_io_sq(sc, cmd, &compl); 1884 break; 1885 case NVME_OPC_CREATE_IO_SQ: 1886 DPRINTF("%s command CREATE_IO_SQ", __func__); 1887 nvme_opc_create_io_sq(sc, cmd, &compl); 1888 break; 1889 case NVME_OPC_DELETE_IO_CQ: 1890 DPRINTF("%s command DELETE_IO_CQ", __func__); 1891 nvme_opc_delete_io_cq(sc, cmd, &compl); 1892 break; 1893 case NVME_OPC_CREATE_IO_CQ: 1894 DPRINTF("%s command CREATE_IO_CQ", __func__); 1895 nvme_opc_create_io_cq(sc, cmd, &compl); 1896 break; 1897 case NVME_OPC_GET_LOG_PAGE: 1898 DPRINTF("%s command GET_LOG_PAGE", __func__); 1899 nvme_opc_get_log_page(sc, cmd, &compl); 1900 break; 1901 case NVME_OPC_IDENTIFY: 1902 DPRINTF("%s command IDENTIFY", __func__); 1903 nvme_opc_identify(sc, cmd, &compl); 1904 break; 1905 case NVME_OPC_ABORT: 1906 DPRINTF("%s command ABORT", __func__); 1907 nvme_opc_abort(sc, cmd, &compl); 1908 break; 1909 case NVME_OPC_SET_FEATURES: 1910 DPRINTF("%s command SET_FEATURES", __func__); 1911 nvme_opc_set_features(sc, cmd, &compl); 1912 break; 1913 case NVME_OPC_GET_FEATURES: 1914 DPRINTF("%s command GET_FEATURES", __func__); 1915 nvme_opc_get_features(sc, cmd, &compl); 1916 break; 1917 case NVME_OPC_FIRMWARE_ACTIVATE: 1918 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 1919 pci_nvme_status_tc(&compl.status, 1920 NVME_SCT_COMMAND_SPECIFIC, 1921 NVME_SC_INVALID_FIRMWARE_SLOT); 1922 break; 1923 case NVME_OPC_ASYNC_EVENT_REQUEST: 1924 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 1925 nvme_opc_async_event_req(sc, cmd, &compl); 1926 break; 1927 case NVME_OPC_FORMAT_NVM: 1928 DPRINTF("%s command FORMAT_NVM", __func__); 1929 if ((sc->ctrldata.oacs & 1930 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 1931 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1932 } 1933 compl.status = NVME_NO_STATUS; 1934 nvme_opc_format_nvm(sc, cmd, &compl); 1935 break; 1936 default: 1937 DPRINTF("0x%x command is not implemented", 1938 cmd->opc); 1939 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1940 } 1941 sqhead = (sqhead + 1) % sq->size; 1942 1943 if (NVME_COMPLETION_VALID(compl)) { 1944 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1945 compl.cdw0, 1946 cmd->cid, 1947 0, /* SQID */ 1948 compl.status); 1949 } 1950 } 1951 1952 DPRINTF("setting sqhead %u", sqhead); 1953 sq->head = sqhead; 1954 1955 if (cq->head != cq->tail) 1956 pci_generate_msix(sc->nsc_pi, 0); 1957 1958 pthread_mutex_unlock(&sq->mtx); 1959 } 1960 1961 /* 1962 * Update the Write and Read statistics reported in SMART data 1963 * 1964 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 1965 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 1966 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 1967 */ 1968 static void 1969 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 1970 size_t bytes, uint16_t status) 1971 { 1972 1973 pthread_mutex_lock(&sc->mtx); 1974 switch (opc) { 1975 case NVME_OPC_WRITE: 1976 sc->write_commands++; 1977 if (status != NVME_SC_SUCCESS) 1978 break; 1979 sc->write_dunits_remainder += (bytes / 512); 1980 while (sc->write_dunits_remainder >= 1000) { 1981 sc->write_data_units++; 1982 sc->write_dunits_remainder -= 1000; 1983 } 1984 break; 1985 case NVME_OPC_READ: 1986 sc->read_commands++; 1987 if (status != NVME_SC_SUCCESS) 1988 break; 1989 sc->read_dunits_remainder += (bytes / 512); 1990 while (sc->read_dunits_remainder >= 1000) { 1991 sc->read_data_units++; 1992 sc->read_dunits_remainder -= 1000; 1993 } 1994 break; 1995 default: 1996 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 1997 break; 1998 } 1999 pthread_mutex_unlock(&sc->mtx); 2000 } 2001 2002 /* 2003 * Check if the combination of Starting LBA (slba) and Number of Logical 2004 * Blocks (nlb) exceeds the range of the underlying storage. 2005 * 2006 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2007 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2008 * overflow. 2009 */ 2010 static bool 2011 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2012 uint32_t nlb) 2013 { 2014 size_t offset, bytes; 2015 2016 /* Overflow check of multiplying Starting LBA by the sector size */ 2017 if (slba >> (64 - nvstore->sectsz_bits)) 2018 return (true); 2019 2020 offset = slba << nvstore->sectsz_bits; 2021 bytes = nlb << nvstore->sectsz_bits; 2022 2023 /* Overflow check of Number of Logical Blocks */ 2024 if ((nvstore->size - offset) < bytes) 2025 return (true); 2026 2027 return (false); 2028 } 2029 2030 static int 2031 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 2032 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 2033 { 2034 int iovidx; 2035 2036 if (req == NULL) 2037 return (-1); 2038 2039 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2040 return (-1); 2041 } 2042 2043 /* concatenate contig block-iovs to minimize number of iovs */ 2044 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 2045 iovidx = req->io_req.br_iovcnt - 1; 2046 2047 req->io_req.br_iov[iovidx].iov_base = 2048 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2049 req->prev_gpaddr, size); 2050 2051 req->prev_size += size; 2052 req->io_req.br_resid += size; 2053 2054 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2055 } else { 2056 iovidx = req->io_req.br_iovcnt; 2057 if (iovidx == 0) { 2058 req->io_req.br_offset = lba; 2059 req->io_req.br_resid = 0; 2060 req->io_req.br_param = req; 2061 } 2062 2063 req->io_req.br_iov[iovidx].iov_base = 2064 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2065 gpaddr, size); 2066 2067 req->io_req.br_iov[iovidx].iov_len = size; 2068 2069 req->prev_gpaddr = gpaddr; 2070 req->prev_size = size; 2071 req->io_req.br_resid += size; 2072 2073 req->io_req.br_iovcnt++; 2074 } 2075 2076 return (0); 2077 } 2078 2079 static void 2080 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2081 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 2082 uint32_t cdw0, uint16_t status) 2083 { 2084 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2085 2086 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2087 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2088 NVME_STATUS_GET_SC(status)); 2089 2090 pci_nvme_cq_update(sc, cq, 2091 0, /* CDW0 */ 2092 cid, 2093 sqid, 2094 status); 2095 2096 if (cq->head != cq->tail) { 2097 if (cq->intr_en & NVME_CQ_INTEN) { 2098 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2099 } else { 2100 DPRINTF("%s: CQ%u interrupt disabled", 2101 __func__, sq->cqid); 2102 } 2103 } 2104 } 2105 2106 static void 2107 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2108 { 2109 req->sc = NULL; 2110 req->nvme_sq = NULL; 2111 req->sqid = 0; 2112 2113 pthread_mutex_lock(&sc->mtx); 2114 2115 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2116 sc->pending_ios--; 2117 2118 /* when no more IO pending, can set to ready if device reset/enabled */ 2119 if (sc->pending_ios == 0 && 2120 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2121 sc->regs.csts |= NVME_CSTS_RDY; 2122 2123 pthread_mutex_unlock(&sc->mtx); 2124 2125 sem_post(&sc->iosemlock); 2126 } 2127 2128 static struct pci_nvme_ioreq * 2129 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2130 { 2131 struct pci_nvme_ioreq *req = NULL; 2132 2133 sem_wait(&sc->iosemlock); 2134 pthread_mutex_lock(&sc->mtx); 2135 2136 req = STAILQ_FIRST(&sc->ioreqs_free); 2137 assert(req != NULL); 2138 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2139 2140 req->sc = sc; 2141 2142 sc->pending_ios++; 2143 2144 pthread_mutex_unlock(&sc->mtx); 2145 2146 req->io_req.br_iovcnt = 0; 2147 req->io_req.br_offset = 0; 2148 req->io_req.br_resid = 0; 2149 req->io_req.br_param = req; 2150 req->prev_gpaddr = 0; 2151 req->prev_size = 0; 2152 2153 return req; 2154 } 2155 2156 static void 2157 pci_nvme_io_done(struct blockif_req *br, int err) 2158 { 2159 struct pci_nvme_ioreq *req = br->br_param; 2160 struct nvme_submission_queue *sq = req->nvme_sq; 2161 uint16_t code, status; 2162 2163 #ifndef __FreeBSD__ 2164 status = 0; 2165 #endif 2166 2167 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2168 2169 /* TODO return correct error */ 2170 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2171 pci_nvme_status_genc(&status, code); 2172 2173 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 2174 pci_nvme_stats_write_read_update(req->sc, req->opc, 2175 req->bytes, status); 2176 pci_nvme_release_ioreq(req->sc, req); 2177 } 2178 2179 /* 2180 * Implements the Flush command. The specification states: 2181 * If a volatile write cache is not present, Flush commands complete 2182 * successfully and have no effect 2183 * in the description of the Volatile Write Cache (VWC) field of the Identify 2184 * Controller data. Therefore, set status to Success if the command is 2185 * not supported (i.e. RAM or as indicated by the blockif). 2186 */ 2187 static bool 2188 nvme_opc_flush(struct pci_nvme_softc *sc, 2189 struct nvme_command *cmd, 2190 struct pci_nvme_blockstore *nvstore, 2191 struct pci_nvme_ioreq *req, 2192 uint16_t *status) 2193 { 2194 bool pending = false; 2195 2196 if (nvstore->type == NVME_STOR_RAM) { 2197 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2198 } else { 2199 int err; 2200 2201 req->io_req.br_callback = pci_nvme_io_done; 2202 2203 err = blockif_flush(nvstore->ctx, &req->io_req); 2204 switch (err) { 2205 case 0: 2206 pending = true; 2207 break; 2208 case EOPNOTSUPP: 2209 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2210 break; 2211 default: 2212 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2213 } 2214 } 2215 2216 return (pending); 2217 } 2218 2219 static uint16_t 2220 nvme_write_read_ram(struct pci_nvme_softc *sc, 2221 struct pci_nvme_blockstore *nvstore, 2222 uint64_t prp1, uint64_t prp2, 2223 size_t offset, uint64_t bytes, 2224 bool is_write) 2225 { 2226 uint8_t *buf = nvstore->ctx; 2227 enum nvme_copy_dir dir; 2228 uint16_t status; 2229 2230 #ifndef __FreeBSD__ 2231 status = 0; 2232 #endif 2233 2234 if (is_write) 2235 dir = NVME_COPY_TO_PRP; 2236 else 2237 dir = NVME_COPY_FROM_PRP; 2238 2239 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2240 buf + offset, bytes, dir)) 2241 pci_nvme_status_genc(&status, 2242 NVME_SC_DATA_TRANSFER_ERROR); 2243 else 2244 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2245 2246 return (status); 2247 } 2248 2249 static uint16_t 2250 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2251 struct pci_nvme_blockstore *nvstore, 2252 struct pci_nvme_ioreq *req, 2253 uint64_t prp1, uint64_t prp2, 2254 size_t offset, uint64_t bytes, 2255 bool is_write) 2256 { 2257 uint64_t size; 2258 int err; 2259 uint16_t status = NVME_NO_STATUS; 2260 2261 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2262 if (pci_nvme_append_iov_req(sc, req, prp1, 2263 size, is_write, offset)) { 2264 pci_nvme_status_genc(&status, 2265 NVME_SC_DATA_TRANSFER_ERROR); 2266 goto out; 2267 } 2268 2269 offset += size; 2270 bytes -= size; 2271 2272 if (bytes == 0) { 2273 ; 2274 } else if (bytes <= PAGE_SIZE) { 2275 size = bytes; 2276 if (pci_nvme_append_iov_req(sc, req, prp2, 2277 size, is_write, offset)) { 2278 pci_nvme_status_genc(&status, 2279 NVME_SC_DATA_TRANSFER_ERROR); 2280 goto out; 2281 } 2282 } else { 2283 void *vmctx = sc->nsc_pi->pi_vmctx; 2284 uint64_t *prp_list = &prp2; 2285 uint64_t *last = prp_list; 2286 2287 /* PRP2 is pointer to a physical region page list */ 2288 while (bytes) { 2289 /* Last entry in list points to the next list */ 2290 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2291 uint64_t prp = *prp_list; 2292 2293 prp_list = paddr_guest2host(vmctx, prp, 2294 PAGE_SIZE - (prp % PAGE_SIZE)); 2295 last = prp_list + (NVME_PRP2_ITEMS - 1); 2296 } 2297 2298 size = MIN(bytes, PAGE_SIZE); 2299 2300 if (pci_nvme_append_iov_req(sc, req, *prp_list, 2301 size, is_write, offset)) { 2302 pci_nvme_status_genc(&status, 2303 NVME_SC_DATA_TRANSFER_ERROR); 2304 goto out; 2305 } 2306 2307 offset += size; 2308 bytes -= size; 2309 2310 prp_list++; 2311 } 2312 } 2313 req->io_req.br_callback = pci_nvme_io_done; 2314 if (is_write) 2315 err = blockif_write(nvstore->ctx, &req->io_req); 2316 else 2317 err = blockif_read(nvstore->ctx, &req->io_req); 2318 2319 if (err) 2320 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2321 out: 2322 return (status); 2323 } 2324 2325 static bool 2326 nvme_opc_write_read(struct pci_nvme_softc *sc, 2327 struct nvme_command *cmd, 2328 struct pci_nvme_blockstore *nvstore, 2329 struct pci_nvme_ioreq *req, 2330 uint16_t *status) 2331 { 2332 uint64_t lba, nblocks, bytes; 2333 size_t offset; 2334 bool is_write = cmd->opc == NVME_OPC_WRITE; 2335 bool pending = false; 2336 2337 #ifndef __FreeBSD__ 2338 bytes = 0; 2339 #endif 2340 2341 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2342 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2343 2344 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2345 WPRINTF("%s command would exceed LBA range", __func__); 2346 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2347 goto out; 2348 } 2349 2350 bytes = nblocks << nvstore->sectsz_bits; 2351 if (bytes > NVME_MAX_DATA_SIZE) { 2352 WPRINTF("%s command would exceed MDTS", __func__); 2353 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2354 goto out; 2355 } 2356 2357 offset = lba << nvstore->sectsz_bits; 2358 2359 req->bytes = bytes; 2360 req->io_req.br_offset = lba; 2361 2362 /* PRP bits 1:0 must be zero */ 2363 cmd->prp1 &= ~0x3UL; 2364 cmd->prp2 &= ~0x3UL; 2365 2366 if (nvstore->type == NVME_STOR_RAM) { 2367 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2368 cmd->prp2, offset, bytes, is_write); 2369 } else { 2370 *status = nvme_write_read_blockif(sc, nvstore, req, 2371 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2372 2373 if (*status == NVME_NO_STATUS) 2374 pending = true; 2375 } 2376 out: 2377 if (!pending) 2378 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2379 2380 return (pending); 2381 } 2382 2383 static void 2384 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2385 { 2386 struct pci_nvme_ioreq *req = br->br_param; 2387 struct pci_nvme_softc *sc = req->sc; 2388 bool done = true; 2389 uint16_t status; 2390 2391 #ifndef __FreeBSD__ 2392 status = 0; 2393 #endif 2394 2395 if (err) { 2396 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2397 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2398 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2399 } else { 2400 struct iovec *iov = req->io_req.br_iov; 2401 2402 req->prev_gpaddr++; 2403 iov += req->prev_gpaddr; 2404 2405 /* The iov_* values already include the sector size */ 2406 req->io_req.br_offset = (off_t)iov->iov_base; 2407 req->io_req.br_resid = iov->iov_len; 2408 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2409 pci_nvme_status_genc(&status, 2410 NVME_SC_INTERNAL_DEVICE_ERROR); 2411 } else 2412 done = false; 2413 } 2414 2415 if (done) { 2416 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 2417 req->cid, 0, status); 2418 pci_nvme_release_ioreq(sc, req); 2419 } 2420 } 2421 2422 static bool 2423 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2424 struct nvme_command *cmd, 2425 struct pci_nvme_blockstore *nvstore, 2426 struct pci_nvme_ioreq *req, 2427 uint16_t *status) 2428 { 2429 struct nvme_dsm_range *range; 2430 uint32_t nr, r, non_zero, dr; 2431 int err; 2432 bool pending = false; 2433 2434 #ifndef __FreeBSD__ 2435 range = NULL; 2436 #endif 2437 2438 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2439 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2440 goto out; 2441 } 2442 2443 nr = cmd->cdw10 & 0xff; 2444 2445 /* copy locally because a range entry could straddle PRPs */ 2446 range = calloc(1, NVME_MAX_DSM_TRIM); 2447 if (range == NULL) { 2448 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2449 goto out; 2450 } 2451 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2452 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2453 2454 /* Check for invalid ranges and the number of non-zero lengths */ 2455 non_zero = 0; 2456 for (r = 0; r <= nr; r++) { 2457 if (pci_nvme_out_of_range(nvstore, 2458 range[r].starting_lba, range[r].length)) { 2459 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2460 goto out; 2461 } 2462 if (range[r].length != 0) 2463 non_zero++; 2464 } 2465 2466 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2467 size_t offset, bytes; 2468 int sectsz_bits = sc->nvstore.sectsz_bits; 2469 2470 /* 2471 * DSM calls are advisory only, and compliant controllers 2472 * may choose to take no actions (i.e. return Success). 2473 */ 2474 if (!nvstore->deallocate) { 2475 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2476 goto out; 2477 } 2478 2479 /* If all ranges have a zero length, return Success */ 2480 if (non_zero == 0) { 2481 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2482 goto out; 2483 } 2484 2485 if (req == NULL) { 2486 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2487 goto out; 2488 } 2489 2490 offset = range[0].starting_lba << sectsz_bits; 2491 bytes = range[0].length << sectsz_bits; 2492 2493 /* 2494 * If the request is for more than a single range, store 2495 * the ranges in the br_iov. Optimize for the common case 2496 * of a single range. 2497 * 2498 * Note that NVMe Number of Ranges is a zero based value 2499 */ 2500 req->io_req.br_iovcnt = 0; 2501 req->io_req.br_offset = offset; 2502 req->io_req.br_resid = bytes; 2503 2504 if (nr == 0) { 2505 req->io_req.br_callback = pci_nvme_io_done; 2506 } else { 2507 struct iovec *iov = req->io_req.br_iov; 2508 2509 for (r = 0, dr = 0; r <= nr; r++) { 2510 offset = range[r].starting_lba << sectsz_bits; 2511 bytes = range[r].length << sectsz_bits; 2512 if (bytes == 0) 2513 continue; 2514 2515 if ((nvstore->size - offset) < bytes) { 2516 pci_nvme_status_genc(status, 2517 NVME_SC_LBA_OUT_OF_RANGE); 2518 goto out; 2519 } 2520 iov[dr].iov_base = (void *)offset; 2521 iov[dr].iov_len = bytes; 2522 dr++; 2523 } 2524 req->io_req.br_callback = pci_nvme_dealloc_sm; 2525 2526 /* 2527 * Use prev_gpaddr to track the current entry and 2528 * prev_size to track the number of entries 2529 */ 2530 req->prev_gpaddr = 0; 2531 req->prev_size = dr; 2532 } 2533 2534 err = blockif_delete(nvstore->ctx, &req->io_req); 2535 if (err) 2536 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2537 else 2538 pending = true; 2539 } 2540 out: 2541 free(range); 2542 return (pending); 2543 } 2544 2545 static void 2546 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2547 { 2548 struct nvme_submission_queue *sq; 2549 uint16_t status; 2550 uint16_t sqhead; 2551 2552 #ifndef __FreeBSD__ 2553 status = 0; 2554 #endif 2555 2556 /* handle all submissions up to sq->tail index */ 2557 sq = &sc->submit_queues[idx]; 2558 2559 pthread_mutex_lock(&sq->mtx); 2560 2561 sqhead = sq->head; 2562 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2563 idx, sqhead, sq->tail, sq->qbase); 2564 2565 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2566 struct nvme_command *cmd; 2567 struct pci_nvme_ioreq *req; 2568 uint32_t nsid; 2569 bool pending; 2570 2571 pending = false; 2572 req = NULL; 2573 status = 0; 2574 2575 cmd = &sq->qbase[sqhead]; 2576 sqhead = (sqhead + 1) % sq->size; 2577 2578 nsid = le32toh(cmd->nsid); 2579 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2580 pci_nvme_status_genc(&status, 2581 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2582 status |= 2583 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2584 goto complete; 2585 } 2586 2587 req = pci_nvme_get_ioreq(sc); 2588 if (req == NULL) { 2589 pci_nvme_status_genc(&status, 2590 NVME_SC_INTERNAL_DEVICE_ERROR); 2591 WPRINTF("%s: unable to allocate IO req", __func__); 2592 goto complete; 2593 } 2594 req->nvme_sq = sq; 2595 req->sqid = idx; 2596 req->opc = cmd->opc; 2597 req->cid = cmd->cid; 2598 req->nsid = cmd->nsid; 2599 2600 switch (cmd->opc) { 2601 case NVME_OPC_FLUSH: 2602 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2603 req, &status); 2604 break; 2605 case NVME_OPC_WRITE: 2606 case NVME_OPC_READ: 2607 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2608 req, &status); 2609 break; 2610 case NVME_OPC_WRITE_ZEROES: 2611 /* TODO: write zeroes 2612 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2613 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2614 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2615 break; 2616 case NVME_OPC_DATASET_MANAGEMENT: 2617 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2618 req, &status); 2619 break; 2620 default: 2621 WPRINTF("%s unhandled io command 0x%x", 2622 __func__, cmd->opc); 2623 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2624 } 2625 complete: 2626 if (!pending) { 2627 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2628 status); 2629 if (req != NULL) 2630 pci_nvme_release_ioreq(sc, req); 2631 } 2632 } 2633 2634 sq->head = sqhead; 2635 2636 pthread_mutex_unlock(&sq->mtx); 2637 } 2638 2639 static void 2640 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2641 uint64_t idx, int is_sq, uint64_t value) 2642 { 2643 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2644 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2645 2646 if (is_sq) { 2647 if (idx > sc->num_squeues) { 2648 WPRINTF("%s queue index %lu overflow from " 2649 "guest (max %u)", 2650 __func__, idx, sc->num_squeues); 2651 return; 2652 } 2653 2654 atomic_store_short(&sc->submit_queues[idx].tail, 2655 (uint16_t)value); 2656 2657 if (idx == 0) { 2658 pci_nvme_handle_admin_cmd(sc, value); 2659 } else { 2660 /* submission queue; handle new entries in SQ */ 2661 if (idx > sc->num_squeues) { 2662 WPRINTF("%s SQ index %lu overflow from " 2663 "guest (max %u)", 2664 __func__, idx, sc->num_squeues); 2665 return; 2666 } 2667 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2668 } 2669 } else { 2670 if (idx > sc->num_cqueues) { 2671 WPRINTF("%s queue index %lu overflow from " 2672 "guest (max %u)", 2673 __func__, idx, sc->num_cqueues); 2674 return; 2675 } 2676 2677 atomic_store_short(&sc->compl_queues[idx].head, 2678 (uint16_t)value); 2679 } 2680 } 2681 2682 static void 2683 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2684 { 2685 const char *s = iswrite ? "WRITE" : "READ"; 2686 2687 switch (offset) { 2688 case NVME_CR_CAP_LOW: 2689 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2690 break; 2691 case NVME_CR_CAP_HI: 2692 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2693 break; 2694 case NVME_CR_VS: 2695 DPRINTF("%s %s NVME_CR_VS", func, s); 2696 break; 2697 case NVME_CR_INTMS: 2698 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2699 break; 2700 case NVME_CR_INTMC: 2701 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2702 break; 2703 case NVME_CR_CC: 2704 DPRINTF("%s %s NVME_CR_CC", func, s); 2705 break; 2706 case NVME_CR_CSTS: 2707 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2708 break; 2709 case NVME_CR_NSSR: 2710 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2711 break; 2712 case NVME_CR_AQA: 2713 DPRINTF("%s %s NVME_CR_AQA", func, s); 2714 break; 2715 case NVME_CR_ASQ_LOW: 2716 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2717 break; 2718 case NVME_CR_ASQ_HI: 2719 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2720 break; 2721 case NVME_CR_ACQ_LOW: 2722 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2723 break; 2724 case NVME_CR_ACQ_HI: 2725 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2726 break; 2727 default: 2728 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2729 } 2730 2731 } 2732 2733 static void 2734 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2735 uint64_t offset, int size, uint64_t value) 2736 { 2737 uint32_t ccreg; 2738 2739 if (offset >= NVME_DOORBELL_OFFSET) { 2740 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2741 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2742 int is_sq = (belloffset % 8) < 4; 2743 2744 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2745 WPRINTF("guest attempted an overflow write offset " 2746 "0x%lx, val 0x%lx in %s", 2747 offset, value, __func__); 2748 return; 2749 } 2750 2751 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2752 return; 2753 } 2754 2755 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2756 offset, size, value); 2757 2758 if (size != 4) { 2759 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2760 "val 0x%lx) to bar0 in %s", 2761 size, offset, value, __func__); 2762 /* TODO: shutdown device */ 2763 return; 2764 } 2765 2766 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2767 2768 pthread_mutex_lock(&sc->mtx); 2769 2770 switch (offset) { 2771 case NVME_CR_CAP_LOW: 2772 case NVME_CR_CAP_HI: 2773 /* readonly */ 2774 break; 2775 case NVME_CR_VS: 2776 /* readonly */ 2777 break; 2778 case NVME_CR_INTMS: 2779 /* MSI-X, so ignore */ 2780 break; 2781 case NVME_CR_INTMC: 2782 /* MSI-X, so ignore */ 2783 break; 2784 case NVME_CR_CC: 2785 ccreg = (uint32_t)value; 2786 2787 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2788 "iocqes %u", 2789 __func__, 2790 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2791 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2792 NVME_CC_GET_IOCQES(ccreg)); 2793 2794 if (NVME_CC_GET_SHN(ccreg)) { 2795 /* perform shutdown - flush out data to backend */ 2796 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2797 NVME_CSTS_REG_SHST_SHIFT); 2798 sc->regs.csts |= NVME_SHST_COMPLETE << 2799 NVME_CSTS_REG_SHST_SHIFT; 2800 } 2801 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2802 if (NVME_CC_GET_EN(ccreg) == 0) 2803 /* transition 1-> causes controller reset */ 2804 pci_nvme_reset_locked(sc); 2805 else 2806 pci_nvme_init_controller(ctx, sc); 2807 } 2808 2809 /* Insert the iocqes, iosqes and en bits from the write */ 2810 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2811 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2812 if (NVME_CC_GET_EN(ccreg) == 0) { 2813 /* Insert the ams, mps and css bit fields */ 2814 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2815 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2816 sc->regs.csts &= ~NVME_CSTS_RDY; 2817 } else if (sc->pending_ios == 0) { 2818 sc->regs.csts |= NVME_CSTS_RDY; 2819 } 2820 break; 2821 case NVME_CR_CSTS: 2822 break; 2823 case NVME_CR_NSSR: 2824 /* ignore writes; don't support subsystem reset */ 2825 break; 2826 case NVME_CR_AQA: 2827 sc->regs.aqa = (uint32_t)value; 2828 break; 2829 case NVME_CR_ASQ_LOW: 2830 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2831 (0xFFFFF000 & value); 2832 break; 2833 case NVME_CR_ASQ_HI: 2834 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2835 (value << 32); 2836 break; 2837 case NVME_CR_ACQ_LOW: 2838 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2839 (0xFFFFF000 & value); 2840 break; 2841 case NVME_CR_ACQ_HI: 2842 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 2843 (value << 32); 2844 break; 2845 default: 2846 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 2847 __func__, offset, value, size); 2848 } 2849 pthread_mutex_unlock(&sc->mtx); 2850 } 2851 2852 static void 2853 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 2854 int baridx, uint64_t offset, int size, uint64_t value) 2855 { 2856 struct pci_nvme_softc* sc = pi->pi_arg; 2857 2858 if (baridx == pci_msix_table_bar(pi) || 2859 baridx == pci_msix_pba_bar(pi)) { 2860 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 2861 " value 0x%lx", baridx, offset, size, value); 2862 2863 pci_emul_msix_twrite(pi, offset, size, value); 2864 return; 2865 } 2866 2867 switch (baridx) { 2868 case 0: 2869 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 2870 break; 2871 2872 default: 2873 DPRINTF("%s unknown baridx %d, val 0x%lx", 2874 __func__, baridx, value); 2875 } 2876 } 2877 2878 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 2879 uint64_t offset, int size) 2880 { 2881 uint64_t value; 2882 2883 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 2884 2885 if (offset < NVME_DOORBELL_OFFSET) { 2886 void *p = &(sc->regs); 2887 pthread_mutex_lock(&sc->mtx); 2888 memcpy(&value, (void *)((uintptr_t)p + offset), size); 2889 pthread_mutex_unlock(&sc->mtx); 2890 } else { 2891 value = 0; 2892 WPRINTF("pci_nvme: read invalid offset %ld", offset); 2893 } 2894 2895 switch (size) { 2896 case 1: 2897 value &= 0xFF; 2898 break; 2899 case 2: 2900 value &= 0xFFFF; 2901 break; 2902 case 4: 2903 value &= 0xFFFFFFFF; 2904 break; 2905 } 2906 2907 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 2908 offset, size, (uint32_t)value); 2909 2910 return (value); 2911 } 2912 2913 2914 2915 static uint64_t 2916 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 2917 uint64_t offset, int size) 2918 { 2919 struct pci_nvme_softc* sc = pi->pi_arg; 2920 2921 if (baridx == pci_msix_table_bar(pi) || 2922 baridx == pci_msix_pba_bar(pi)) { 2923 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 2924 baridx, offset, size); 2925 2926 return pci_emul_msix_tread(pi, offset, size); 2927 } 2928 2929 switch (baridx) { 2930 case 0: 2931 return pci_nvme_read_bar_0(sc, offset, size); 2932 2933 default: 2934 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 2935 } 2936 2937 return (0); 2938 } 2939 2940 static int 2941 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 2942 { 2943 char bident[sizeof("XX:X:X")]; 2944 const char *value; 2945 uint32_t sectsz; 2946 2947 sc->max_queues = NVME_QUEUES; 2948 sc->max_qentries = NVME_MAX_QENTRIES; 2949 sc->ioslots = NVME_IOSLOTS; 2950 sc->num_squeues = sc->max_queues; 2951 sc->num_cqueues = sc->max_queues; 2952 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2953 sectsz = 0; 2954 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 2955 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2956 2957 value = get_config_value_node(nvl, "maxq"); 2958 if (value != NULL) 2959 sc->max_queues = atoi(value); 2960 value = get_config_value_node(nvl, "qsz"); 2961 if (value != NULL) { 2962 sc->max_qentries = atoi(value); 2963 if (sc->max_qentries <= 0) { 2964 EPRINTLN("nvme: Invalid qsz option %d", 2965 sc->max_qentries); 2966 return (-1); 2967 } 2968 } 2969 value = get_config_value_node(nvl, "ioslots"); 2970 if (value != NULL) { 2971 sc->ioslots = atoi(value); 2972 if (sc->ioslots <= 0) { 2973 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 2974 return (-1); 2975 } 2976 } 2977 value = get_config_value_node(nvl, "sectsz"); 2978 if (value != NULL) 2979 sectsz = atoi(value); 2980 value = get_config_value_node(nvl, "ser"); 2981 if (value != NULL) { 2982 /* 2983 * This field indicates the Product Serial Number in 2984 * 7-bit ASCII, unused bytes should be space characters. 2985 * Ref: NVMe v1.3c. 2986 */ 2987 cpywithpad((char *)sc->ctrldata.sn, 2988 sizeof(sc->ctrldata.sn), value, ' '); 2989 } 2990 value = get_config_value_node(nvl, "eui64"); 2991 if (value != NULL) 2992 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 2993 value = get_config_value_node(nvl, "dsm"); 2994 if (value != NULL) { 2995 if (strcmp(value, "auto") == 0) 2996 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2997 else if (strcmp(value, "enable") == 0) 2998 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 2999 else if (strcmp(value, "disable") == 0) 3000 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3001 } 3002 3003 value = get_config_value_node(nvl, "ram"); 3004 if (value != NULL) { 3005 uint64_t sz = strtoull(value, NULL, 10); 3006 3007 sc->nvstore.type = NVME_STOR_RAM; 3008 sc->nvstore.size = sz * 1024 * 1024; 3009 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3010 sc->nvstore.sectsz = 4096; 3011 sc->nvstore.sectsz_bits = 12; 3012 if (sc->nvstore.ctx == NULL) { 3013 EPRINTLN("nvme: Unable to allocate RAM"); 3014 return (-1); 3015 } 3016 } else { 3017 snprintf(bident, sizeof(bident), "%d:%d", 3018 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3019 sc->nvstore.ctx = blockif_open(nvl, bident); 3020 if (sc->nvstore.ctx == NULL) { 3021 EPRINTLN("nvme: Could not open backing file: %s", 3022 strerror(errno)); 3023 return (-1); 3024 } 3025 sc->nvstore.type = NVME_STOR_BLOCKIF; 3026 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3027 } 3028 3029 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3030 sc->nvstore.sectsz = sectsz; 3031 else if (sc->nvstore.type != NVME_STOR_RAM) 3032 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3033 for (sc->nvstore.sectsz_bits = 9; 3034 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3035 sc->nvstore.sectsz_bits++); 3036 3037 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3038 sc->max_queues = NVME_QUEUES; 3039 3040 return (0); 3041 } 3042 3043 static void 3044 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size) 3045 { 3046 struct pci_nvme_softc *sc; 3047 struct pci_nvme_blockstore *nvstore; 3048 struct nvme_namespace_data *nd; 3049 3050 sc = arg; 3051 nvstore = &sc->nvstore; 3052 nd = &sc->nsdata; 3053 3054 nvstore->size = new_size; 3055 pci_nvme_init_nsdata_size(nvstore, nd); 3056 3057 /* Add changed NSID to list */ 3058 sc->ns_log.ns[0] = 1; 3059 sc->ns_log.ns[1] = 0; 3060 3061 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3062 PCI_NVME_AE_INFO_NS_ATTR_CHANGED); 3063 } 3064 3065 static int 3066 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) 3067 { 3068 struct pci_nvme_softc *sc; 3069 uint32_t pci_membar_sz; 3070 int error; 3071 3072 error = 0; 3073 3074 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3075 pi->pi_arg = sc; 3076 sc->nsc_pi = pi; 3077 3078 error = pci_nvme_parse_config(sc, nvl); 3079 if (error < 0) 3080 goto done; 3081 else 3082 error = 0; 3083 3084 STAILQ_INIT(&sc->ioreqs_free); 3085 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3086 for (int i = 0; i < sc->ioslots; i++) { 3087 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3088 } 3089 3090 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3091 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3092 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3093 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3094 pci_set_cfgdata8(pi, PCIR_PROGIF, 3095 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3096 3097 /* 3098 * Allocate size of NVMe registers + doorbell space for all queues. 3099 * 3100 * The specification requires a minimum memory I/O window size of 16K. 3101 * The Windows driver will refuse to start a device with a smaller 3102 * window. 3103 */ 3104 pci_membar_sz = sizeof(struct nvme_registers) + 3105 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3106 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3107 3108 DPRINTF("nvme membar size: %u", pci_membar_sz); 3109 3110 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3111 if (error) { 3112 WPRINTF("%s pci alloc mem bar failed", __func__); 3113 goto done; 3114 } 3115 3116 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3117 if (error) { 3118 WPRINTF("%s pci add msixcap failed", __func__); 3119 goto done; 3120 } 3121 3122 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3123 if (error) { 3124 WPRINTF("%s pci add Express capability failed", __func__); 3125 goto done; 3126 } 3127 3128 pthread_mutex_init(&sc->mtx, NULL); 3129 sem_init(&sc->iosemlock, 0, sc->ioslots); 3130 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3131 3132 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3133 /* 3134 * Controller data depends on Namespace data so initialize Namespace 3135 * data first. 3136 */ 3137 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3138 pci_nvme_init_ctrldata(sc); 3139 pci_nvme_init_logpages(sc); 3140 pci_nvme_init_features(sc); 3141 3142 pci_nvme_aer_init(sc); 3143 pci_nvme_aen_init(sc); 3144 3145 pci_nvme_reset(sc); 3146 3147 pci_lintr_request(pi); 3148 3149 done: 3150 return (error); 3151 } 3152 3153 static int 3154 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3155 { 3156 char *cp, *ram; 3157 3158 if (opts == NULL) 3159 return (0); 3160 3161 if (strncmp(opts, "ram=", 4) == 0) { 3162 cp = strchr(opts, ','); 3163 if (cp == NULL) { 3164 set_config_value_node(nvl, "ram", opts + 4); 3165 return (0); 3166 } 3167 ram = strndup(opts + 4, cp - opts - 4); 3168 set_config_value_node(nvl, "ram", ram); 3169 free(ram); 3170 return (pci_parse_legacy_config(nvl, cp + 1)); 3171 } else 3172 return (blockif_legacy_config(nvl, opts)); 3173 } 3174 3175 struct pci_devemu pci_de_nvme = { 3176 .pe_emu = "nvme", 3177 .pe_init = pci_nvme_init, 3178 .pe_legacy_config = pci_nvme_legacy_config, 3179 .pe_barwrite = pci_nvme_write, 3180 .pe_barread = pci_nvme_read 3181 }; 3182 PCI_EMUL_SET(pci_de_nvme); 3183