1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <pthread_np.h> 69 #include <semaphore.h> 70 #include <stdbool.h> 71 #include <stddef.h> 72 #include <stdint.h> 73 #include <stdio.h> 74 #include <stdlib.h> 75 #include <string.h> 76 77 #include <machine/atomic.h> 78 #include <machine/vmm.h> 79 #include <vmmapi.h> 80 81 #include <dev/nvme/nvme.h> 82 83 #include "bhyverun.h" 84 #include "block_if.h" 85 #include "config.h" 86 #include "debug.h" 87 #include "pci_emul.h" 88 89 90 static int nvme_debug = 0; 91 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 92 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) 93 94 /* defaults; can be overridden */ 95 #define NVME_MSIX_BAR 4 96 97 #define NVME_IOSLOTS 8 98 99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 100 #define NVME_MMIO_SPACE_MIN (1 << 14) 101 102 #define NVME_QUEUES 16 103 #define NVME_MAX_QENTRIES 2048 104 /* Memory Page size Minimum reported in CAP register */ 105 #define NVME_MPSMIN 0 106 /* MPSMIN converted to bytes */ 107 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 108 109 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 110 #define NVME_MDTS 9 111 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 112 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 113 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 114 115 /* This is a synthetic status code to indicate there is no status */ 116 #define NVME_NO_STATUS 0xffff 117 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 118 119 /* Reported temperature in Kelvin (i.e. room temperature) */ 120 #define NVME_TEMPERATURE 296 121 122 /* helpers */ 123 124 /* Convert a zero-based value into a one-based value */ 125 #define ONE_BASED(zero) ((zero) + 1) 126 /* Convert a one-based value into a zero-based value */ 127 #define ZERO_BASED(one) ((one) - 1) 128 129 /* Encode number of SQ's and CQ's for Set/Get Features */ 130 #define NVME_FEATURE_NUM_QUEUES(sc) \ 131 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 132 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 133 134 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 135 136 enum nvme_controller_register_offsets { 137 NVME_CR_CAP_LOW = 0x00, 138 NVME_CR_CAP_HI = 0x04, 139 NVME_CR_VS = 0x08, 140 NVME_CR_INTMS = 0x0c, 141 NVME_CR_INTMC = 0x10, 142 NVME_CR_CC = 0x14, 143 NVME_CR_CSTS = 0x1c, 144 NVME_CR_NSSR = 0x20, 145 NVME_CR_AQA = 0x24, 146 NVME_CR_ASQ_LOW = 0x28, 147 NVME_CR_ASQ_HI = 0x2c, 148 NVME_CR_ACQ_LOW = 0x30, 149 NVME_CR_ACQ_HI = 0x34, 150 }; 151 152 enum nvme_cmd_cdw11 { 153 NVME_CMD_CDW11_PC = 0x0001, 154 NVME_CMD_CDW11_IEN = 0x0002, 155 NVME_CMD_CDW11_IV = 0xFFFF0000, 156 }; 157 158 enum nvme_copy_dir { 159 NVME_COPY_TO_PRP, 160 NVME_COPY_FROM_PRP, 161 }; 162 163 #define NVME_CQ_INTEN 0x01 164 #define NVME_CQ_INTCOAL 0x02 165 166 struct nvme_completion_queue { 167 struct nvme_completion *qbase; 168 pthread_mutex_t mtx; 169 uint32_t size; 170 uint16_t tail; /* nvme progress */ 171 uint16_t head; /* guest progress */ 172 uint16_t intr_vec; 173 uint32_t intr_en; 174 }; 175 176 struct nvme_submission_queue { 177 struct nvme_command *qbase; 178 pthread_mutex_t mtx; 179 uint32_t size; 180 uint16_t head; /* nvme progress */ 181 uint16_t tail; /* guest progress */ 182 uint16_t cqid; /* completion queue id */ 183 int qpriority; 184 }; 185 186 enum nvme_storage_type { 187 NVME_STOR_BLOCKIF = 0, 188 NVME_STOR_RAM = 1, 189 }; 190 191 struct pci_nvme_blockstore { 192 enum nvme_storage_type type; 193 void *ctx; 194 uint64_t size; 195 uint32_t sectsz; 196 uint32_t sectsz_bits; 197 uint64_t eui64; 198 uint32_t deallocate:1; 199 }; 200 201 /* 202 * Calculate the number of additional page descriptors for guest IO requests 203 * based on the advertised Max Data Transfer (MDTS) and given the number of 204 * default iovec's in a struct blockif_req. 205 */ 206 #define MDTS_PAD_SIZE \ 207 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ 208 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 209 0 ) 210 211 struct pci_nvme_ioreq { 212 struct pci_nvme_softc *sc; 213 STAILQ_ENTRY(pci_nvme_ioreq) link; 214 struct nvme_submission_queue *nvme_sq; 215 uint16_t sqid; 216 217 /* command information */ 218 uint16_t opc; 219 uint16_t cid; 220 uint32_t nsid; 221 222 uint64_t prev_gpaddr; 223 size_t prev_size; 224 size_t bytes; 225 226 struct blockif_req io_req; 227 228 struct iovec iovpadding[MDTS_PAD_SIZE]; 229 }; 230 231 enum nvme_dsm_type { 232 /* Dataset Management bit in ONCS reflects backing storage capability */ 233 NVME_DATASET_MANAGEMENT_AUTO, 234 /* Unconditionally set Dataset Management bit in ONCS */ 235 NVME_DATASET_MANAGEMENT_ENABLE, 236 /* Unconditionally clear Dataset Management bit in ONCS */ 237 NVME_DATASET_MANAGEMENT_DISABLE, 238 }; 239 240 struct pci_nvme_softc; 241 struct nvme_feature_obj; 242 243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 244 struct nvme_feature_obj *, 245 struct nvme_command *, 246 struct nvme_completion *); 247 248 struct nvme_feature_obj { 249 uint32_t cdw11; 250 nvme_feature_cb set; 251 nvme_feature_cb get; 252 bool namespace_specific; 253 }; 254 255 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 256 257 typedef enum { 258 PCI_NVME_AE_TYPE_ERROR = 0, 259 PCI_NVME_AE_TYPE_SMART, 260 PCI_NVME_AE_TYPE_NOTICE, 261 PCI_NVME_AE_TYPE_IO_CMD = 6, 262 PCI_NVME_AE_TYPE_VENDOR = 7, 263 PCI_NVME_AE_TYPE_MAX /* Must be last */ 264 } pci_nvme_async_type; 265 266 /* Asynchronous Event Requests */ 267 struct pci_nvme_aer { 268 STAILQ_ENTRY(pci_nvme_aer) link; 269 uint16_t cid; /* Command ID of the submitted AER */ 270 }; 271 272 /** Asynchronous Event Information - Notice */ 273 typedef enum { 274 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, 275 PCI_NVME_AEI_NOTICE_FW_ACTIVATION, 276 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, 277 PCI_NVME_AEI_NOTICE_ANA_CHANGE, 278 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, 279 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, 280 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, 281 PCI_NVME_AEI_NOTICE_MAX, 282 } pci_nvme_async_event_info_notice; 283 284 #define PCI_NVME_AEI_NOTICE_SHIFT 8 285 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) 286 287 /* Asynchronous Event Notifications */ 288 struct pci_nvme_aen { 289 pci_nvme_async_type atype; 290 uint32_t event_data; 291 bool posted; 292 }; 293 294 /* 295 * By default, enable all Asynchrnous Event Notifications: 296 * SMART / Health Critical Warnings 297 * Namespace Attribute Notices 298 */ 299 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 300 301 typedef enum { 302 NVME_CNTRLTYPE_IO = 1, 303 NVME_CNTRLTYPE_DISCOVERY = 2, 304 NVME_CNTRLTYPE_ADMIN = 3, 305 } pci_nvme_cntrl_type; 306 307 struct pci_nvme_softc { 308 struct pci_devinst *nsc_pi; 309 310 pthread_mutex_t mtx; 311 312 struct nvme_registers regs; 313 314 struct nvme_namespace_data nsdata; 315 struct nvme_controller_data ctrldata; 316 struct nvme_error_information_entry err_log; 317 struct nvme_health_information_page health_log; 318 struct nvme_firmware_page fw_log; 319 struct nvme_ns_list ns_log; 320 321 struct pci_nvme_blockstore nvstore; 322 323 uint16_t max_qentries; /* max entries per queue */ 324 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 325 uint32_t num_cqueues; 326 uint32_t num_squeues; 327 bool num_q_is_set; /* Has host set Number of Queues */ 328 329 struct pci_nvme_ioreq *ioreqs; 330 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 331 uint32_t pending_ios; 332 uint32_t ioslots; 333 sem_t iosemlock; 334 335 /* 336 * Memory mapped Submission and Completion queues 337 * Each array includes both Admin and IO queues 338 */ 339 struct nvme_completion_queue *compl_queues; 340 struct nvme_submission_queue *submit_queues; 341 342 struct nvme_feature_obj feat[NVME_FID_MAX]; 343 344 enum nvme_dsm_type dataset_management; 345 346 /* Accounting for SMART data */ 347 __uint128_t read_data_units; 348 __uint128_t write_data_units; 349 __uint128_t read_commands; 350 __uint128_t write_commands; 351 uint32_t read_dunits_remainder; 352 uint32_t write_dunits_remainder; 353 354 STAILQ_HEAD(, pci_nvme_aer) aer_list; 355 pthread_mutex_t aer_mtx; 356 uint32_t aer_count; 357 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 358 pthread_t aen_tid; 359 pthread_mutex_t aen_mtx; 360 pthread_cond_t aen_cond; 361 }; 362 363 364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 365 struct nvme_completion_queue *cq, 366 uint32_t cdw0, 367 uint16_t cid, 368 uint16_t sqid, 369 uint16_t status); 370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 372 static void pci_nvme_io_done(struct blockif_req *, int); 373 374 /* Controller Configuration utils */ 375 #define NVME_CC_GET_EN(cc) \ 376 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 377 #define NVME_CC_GET_CSS(cc) \ 378 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 379 #define NVME_CC_GET_SHN(cc) \ 380 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 381 #define NVME_CC_GET_IOSQES(cc) \ 382 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 383 #define NVME_CC_GET_IOCQES(cc) \ 384 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 385 386 #define NVME_CC_WRITE_MASK \ 387 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 388 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 389 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 390 391 #define NVME_CC_NEN_WRITE_MASK \ 392 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 393 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 394 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 395 396 /* Controller Status utils */ 397 #define NVME_CSTS_GET_RDY(sts) \ 398 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 399 400 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 401 #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) 402 403 /* Completion Queue status word utils */ 404 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 405 #define NVME_STATUS_MASK \ 406 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 407 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 408 409 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 410 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 411 412 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 413 struct nvme_feature_obj *, 414 struct nvme_command *, 415 struct nvme_completion *); 416 static void nvme_feature_temperature(struct pci_nvme_softc *, 417 struct nvme_feature_obj *, 418 struct nvme_command *, 419 struct nvme_completion *); 420 static void nvme_feature_num_queues(struct pci_nvme_softc *, 421 struct nvme_feature_obj *, 422 struct nvme_command *, 423 struct nvme_completion *); 424 static void nvme_feature_iv_config(struct pci_nvme_softc *, 425 struct nvme_feature_obj *, 426 struct nvme_command *, 427 struct nvme_completion *); 428 static void nvme_feature_async_event(struct pci_nvme_softc *, 429 struct nvme_feature_obj *, 430 struct nvme_command *, 431 struct nvme_completion *); 432 433 static void *aen_thr(void *arg); 434 435 static __inline void 436 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 437 { 438 size_t len; 439 440 len = strnlen(src, dst_size); 441 memset(dst, pad, dst_size); 442 memcpy(dst, src, len); 443 } 444 445 static __inline void 446 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 447 { 448 449 *status &= ~NVME_STATUS_MASK; 450 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 451 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 452 } 453 454 static __inline void 455 pci_nvme_status_genc(uint16_t *status, uint16_t code) 456 { 457 458 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 459 } 460 461 /* 462 * Initialize the requested number or IO Submission and Completion Queues. 463 * Admin queues are allocated implicitly. 464 */ 465 static void 466 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 467 { 468 uint32_t i; 469 470 /* 471 * Allocate and initialize the Submission Queues 472 */ 473 if (nsq > NVME_QUEUES) { 474 WPRINTF("%s: clamping number of SQ from %u to %u", 475 __func__, nsq, NVME_QUEUES); 476 nsq = NVME_QUEUES; 477 } 478 479 sc->num_squeues = nsq; 480 481 sc->submit_queues = calloc(sc->num_squeues + 1, 482 sizeof(struct nvme_submission_queue)); 483 if (sc->submit_queues == NULL) { 484 WPRINTF("%s: SQ allocation failed", __func__); 485 sc->num_squeues = 0; 486 } else { 487 struct nvme_submission_queue *sq = sc->submit_queues; 488 489 for (i = 0; i < sc->num_squeues + 1; i++) 490 pthread_mutex_init(&sq[i].mtx, NULL); 491 } 492 493 /* 494 * Allocate and initialize the Completion Queues 495 */ 496 if (ncq > NVME_QUEUES) { 497 WPRINTF("%s: clamping number of CQ from %u to %u", 498 __func__, ncq, NVME_QUEUES); 499 ncq = NVME_QUEUES; 500 } 501 502 sc->num_cqueues = ncq; 503 504 sc->compl_queues = calloc(sc->num_cqueues + 1, 505 sizeof(struct nvme_completion_queue)); 506 if (sc->compl_queues == NULL) { 507 WPRINTF("%s: CQ allocation failed", __func__); 508 sc->num_cqueues = 0; 509 } else { 510 struct nvme_completion_queue *cq = sc->compl_queues; 511 512 for (i = 0; i < sc->num_cqueues + 1; i++) 513 pthread_mutex_init(&cq[i].mtx, NULL); 514 } 515 } 516 517 static void 518 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 519 { 520 struct nvme_controller_data *cd = &sc->ctrldata; 521 522 cd->vid = 0xFB5D; 523 cd->ssvid = 0x0000; 524 525 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 526 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 527 528 /* Num of submission commands that we can handle at a time (2^rab) */ 529 cd->rab = 4; 530 531 /* FreeBSD OUI */ 532 cd->ieee[0] = 0x58; 533 cd->ieee[1] = 0x9c; 534 cd->ieee[2] = 0xfc; 535 536 cd->mic = 0; 537 538 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 539 540 cd->ver = NVME_REV(1,4); 541 542 cd->cntrltype = NVME_CNTRLTYPE_IO; 543 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 544 cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); 545 cd->acl = 2; 546 cd->aerl = 4; 547 548 /* Advertise 1, Read-only firmware slot */ 549 cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | 550 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 551 cd->lpa = 0; /* TODO: support some simple things like SMART */ 552 cd->elpe = 0; /* max error log page entries */ 553 /* 554 * Report a single power state (zero-based value) 555 * power_state[] values are left as zero to indicate "Not reported" 556 */ 557 cd->npss = 0; 558 559 /* Warning Composite Temperature Threshold */ 560 cd->wctemp = 0x0157; 561 cd->cctemp = 0x0157; 562 563 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ 564 cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << 565 NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); 566 567 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 568 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 569 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 570 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 571 cd->nn = 1; /* number of namespaces */ 572 573 cd->oncs = 0; 574 switch (sc->dataset_management) { 575 case NVME_DATASET_MANAGEMENT_AUTO: 576 if (sc->nvstore.deallocate) 577 cd->oncs |= NVME_ONCS_DSM; 578 break; 579 case NVME_DATASET_MANAGEMENT_ENABLE: 580 cd->oncs |= NVME_ONCS_DSM; 581 break; 582 default: 583 break; 584 } 585 586 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << 587 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; 588 589 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; 590 } 591 592 /* 593 * Calculate the CRC-16 of the given buffer 594 * See copyright attribution at top of file 595 */ 596 static uint16_t 597 crc16(uint16_t crc, const void *buffer, unsigned int len) 598 { 599 const unsigned char *cp = buffer; 600 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 601 static uint16_t const crc16_table[256] = { 602 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 603 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 604 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 605 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 606 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 607 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 608 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 609 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 610 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 611 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 612 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 613 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 614 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 615 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 616 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 617 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 618 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 619 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 620 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 621 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 622 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 623 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 624 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 625 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 626 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 627 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 628 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 629 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 630 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 631 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 632 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 633 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 634 }; 635 636 while (len--) 637 crc = (((crc >> 8) & 0xffU) ^ 638 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 639 return crc; 640 } 641 642 static void 643 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 644 struct nvme_namespace_data *nd) 645 { 646 647 /* Get capacity and block size information from backing store */ 648 nd->nsze = nvstore->size / nvstore->sectsz; 649 nd->ncap = nd->nsze; 650 nd->nuse = nd->nsze; 651 } 652 653 static void 654 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 655 struct nvme_namespace_data *nd, uint32_t nsid, 656 struct pci_nvme_blockstore *nvstore) 657 { 658 659 pci_nvme_init_nsdata_size(nvstore, nd); 660 661 if (nvstore->type == NVME_STOR_BLOCKIF) 662 nvstore->deallocate = blockif_candelete(nvstore->ctx); 663 664 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 665 nd->flbas = 0; 666 667 /* Create an EUI-64 if user did not provide one */ 668 if (nvstore->eui64 == 0) { 669 char *data = NULL; 670 uint64_t eui64 = nvstore->eui64; 671 672 asprintf(&data, "%s%u%u%u", get_config_value("name"), 673 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 674 sc->nsc_pi->pi_func); 675 676 if (data != NULL) { 677 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 678 free(data); 679 } 680 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 681 } 682 be64enc(nd->eui64, nvstore->eui64); 683 684 /* LBA data-sz = 2^lbads */ 685 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 686 } 687 688 static void 689 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 690 { 691 __uint128_t power_cycles = 1; 692 693 memset(&sc->err_log, 0, sizeof(sc->err_log)); 694 memset(&sc->health_log, 0, sizeof(sc->health_log)); 695 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 696 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 697 698 /* Set read/write remainder to round up according to spec */ 699 sc->read_dunits_remainder = 999; 700 sc->write_dunits_remainder = 999; 701 702 /* Set nominal Health values checked by implementations */ 703 sc->health_log.temperature = NVME_TEMPERATURE; 704 sc->health_log.available_spare = 100; 705 sc->health_log.available_spare_threshold = 10; 706 707 /* Set Active Firmware Info to slot 1 */ 708 sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); 709 memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, 710 sizeof(sc->fw_log.revision[0])); 711 712 memcpy(&sc->health_log.power_cycles, &power_cycles, 713 sizeof(sc->health_log.power_cycles)); 714 } 715 716 static void 717 pci_nvme_init_features(struct pci_nvme_softc *sc) 718 { 719 enum nvme_feature fid; 720 721 for (fid = 0; fid < NVME_FID_MAX; fid++) { 722 switch (fid) { 723 case NVME_FEAT_ARBITRATION: 724 case NVME_FEAT_POWER_MANAGEMENT: 725 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 726 case NVME_FEAT_WRITE_ATOMICITY: 727 /* Mandatory but no special handling required */ 728 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 729 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 730 // this returns a data buffer 731 break; 732 case NVME_FEAT_TEMPERATURE_THRESHOLD: 733 sc->feat[fid].set = nvme_feature_temperature; 734 break; 735 case NVME_FEAT_ERROR_RECOVERY: 736 sc->feat[fid].namespace_specific = true; 737 break; 738 case NVME_FEAT_NUMBER_OF_QUEUES: 739 sc->feat[fid].set = nvme_feature_num_queues; 740 break; 741 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 742 sc->feat[fid].set = nvme_feature_iv_config; 743 break; 744 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 745 sc->feat[fid].set = nvme_feature_async_event; 746 /* Enable all AENs by default */ 747 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 748 break; 749 default: 750 sc->feat[fid].set = nvme_feature_invalid_cb; 751 sc->feat[fid].get = nvme_feature_invalid_cb; 752 } 753 } 754 } 755 756 static void 757 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 758 { 759 760 STAILQ_INIT(&sc->aer_list); 761 sc->aer_count = 0; 762 } 763 764 static void 765 pci_nvme_aer_init(struct pci_nvme_softc *sc) 766 { 767 768 pthread_mutex_init(&sc->aer_mtx, NULL); 769 pci_nvme_aer_reset(sc); 770 } 771 772 static void 773 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 774 { 775 struct pci_nvme_aer *aer = NULL; 776 777 pthread_mutex_lock(&sc->aer_mtx); 778 while (!STAILQ_EMPTY(&sc->aer_list)) { 779 aer = STAILQ_FIRST(&sc->aer_list); 780 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 781 free(aer); 782 } 783 pthread_mutex_unlock(&sc->aer_mtx); 784 785 pci_nvme_aer_reset(sc); 786 } 787 788 static bool 789 pci_nvme_aer_available(struct pci_nvme_softc *sc) 790 { 791 792 return (sc->aer_count != 0); 793 } 794 795 static bool 796 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 797 { 798 struct nvme_controller_data *cd = &sc->ctrldata; 799 800 /* AERL is a zero based value while aer_count is one's based */ 801 return (sc->aer_count == (cd->aerl + 1)); 802 } 803 804 /* 805 * Add an Async Event Request 806 * 807 * Stores an AER to be returned later if the Controller needs to notify the 808 * host of an event. 809 * Note that while the NVMe spec doesn't require Controllers to return AER's 810 * in order, this implementation does preserve the order. 811 */ 812 static int 813 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 814 { 815 struct pci_nvme_aer *aer = NULL; 816 817 aer = calloc(1, sizeof(struct pci_nvme_aer)); 818 if (aer == NULL) 819 return (-1); 820 821 /* Save the Command ID for use in the completion message */ 822 aer->cid = cid; 823 824 pthread_mutex_lock(&sc->aer_mtx); 825 sc->aer_count++; 826 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 827 pthread_mutex_unlock(&sc->aer_mtx); 828 829 return (0); 830 } 831 832 /* 833 * Get an Async Event Request structure 834 * 835 * Returns a pointer to an AER previously submitted by the host or NULL if 836 * no AER's exist. Caller is responsible for freeing the returned struct. 837 */ 838 static struct pci_nvme_aer * 839 pci_nvme_aer_get(struct pci_nvme_softc *sc) 840 { 841 struct pci_nvme_aer *aer = NULL; 842 843 pthread_mutex_lock(&sc->aer_mtx); 844 aer = STAILQ_FIRST(&sc->aer_list); 845 if (aer != NULL) { 846 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 847 sc->aer_count--; 848 } 849 pthread_mutex_unlock(&sc->aer_mtx); 850 851 return (aer); 852 } 853 854 static void 855 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 856 { 857 uint32_t atype; 858 859 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 860 861 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 862 sc->aen[atype].atype = atype; 863 } 864 } 865 866 static void 867 pci_nvme_aen_init(struct pci_nvme_softc *sc) 868 { 869 char nstr[80]; 870 871 pci_nvme_aen_reset(sc); 872 873 pthread_mutex_init(&sc->aen_mtx, NULL); 874 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 875 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 876 sc->nsc_pi->pi_func); 877 pthread_set_name_np(sc->aen_tid, nstr); 878 } 879 880 static void 881 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 882 { 883 884 pci_nvme_aen_reset(sc); 885 } 886 887 /* Notify the AEN thread of pending work */ 888 static void 889 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 890 { 891 892 pthread_cond_signal(&sc->aen_cond); 893 } 894 895 /* 896 * Post an Asynchronous Event Notification 897 */ 898 static int32_t 899 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 900 uint32_t event_data) 901 { 902 struct pci_nvme_aen *aen; 903 904 if (atype >= PCI_NVME_AE_TYPE_MAX) { 905 return(EINVAL); 906 } 907 908 pthread_mutex_lock(&sc->aen_mtx); 909 aen = &sc->aen[atype]; 910 911 /* Has the controller already posted an event of this type? */ 912 if (aen->posted) { 913 pthread_mutex_unlock(&sc->aen_mtx); 914 return(EALREADY); 915 } 916 917 aen->event_data = event_data; 918 aen->posted = true; 919 pthread_mutex_unlock(&sc->aen_mtx); 920 921 pci_nvme_aen_notify(sc); 922 923 return(0); 924 } 925 926 static void 927 pci_nvme_aen_process(struct pci_nvme_softc *sc) 928 { 929 struct pci_nvme_aer *aer; 930 struct pci_nvme_aen *aen; 931 pci_nvme_async_type atype; 932 uint32_t mask; 933 uint16_t status; 934 uint8_t lid; 935 936 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 937 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 938 aen = &sc->aen[atype]; 939 /* Previous iterations may have depleted the available AER's */ 940 if (!pci_nvme_aer_available(sc)) { 941 DPRINTF("%s: no AER", __func__); 942 break; 943 } 944 945 if (!aen->posted) { 946 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 947 continue; 948 } 949 950 status = NVME_SC_SUCCESS; 951 952 /* Is the event masked? */ 953 mask = 954 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 955 956 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 957 switch (atype) { 958 case PCI_NVME_AE_TYPE_ERROR: 959 lid = NVME_LOG_ERROR; 960 break; 961 case PCI_NVME_AE_TYPE_SMART: 962 mask &= 0xff; 963 if ((mask & aen->event_data) == 0) 964 continue; 965 lid = NVME_LOG_HEALTH_INFORMATION; 966 break; 967 case PCI_NVME_AE_TYPE_NOTICE: 968 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 969 EPRINTLN("%s unknown AEN notice type %u", 970 __func__, aen->event_data); 971 status = NVME_SC_INTERNAL_DEVICE_ERROR; 972 break; 973 } 974 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 975 continue; 976 switch (aen->event_data) { 977 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 978 lid = NVME_LOG_CHANGED_NAMESPACE; 979 break; 980 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 981 lid = NVME_LOG_FIRMWARE_SLOT; 982 break; 983 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 984 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 985 break; 986 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 987 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 988 break; 989 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 990 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 991 break; 992 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 993 lid = NVME_LOG_LBA_STATUS_INFORMATION; 994 break; 995 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 996 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 997 break; 998 default: 999 lid = 0; 1000 } 1001 break; 1002 default: 1003 /* bad type?!? */ 1004 EPRINTLN("%s unknown AEN type %u", __func__, atype); 1005 status = NVME_SC_INTERNAL_DEVICE_ERROR; 1006 break; 1007 } 1008 1009 aer = pci_nvme_aer_get(sc); 1010 assert(aer != NULL); 1011 1012 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 1013 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1014 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 1015 aer->cid, 1016 0, /* SQID */ 1017 status); 1018 1019 aen->event_data = 0; 1020 aen->posted = false; 1021 1022 pci_generate_msix(sc->nsc_pi, 0); 1023 } 1024 } 1025 1026 static void * 1027 aen_thr(void *arg) 1028 { 1029 struct pci_nvme_softc *sc; 1030 1031 sc = arg; 1032 1033 pthread_mutex_lock(&sc->aen_mtx); 1034 for (;;) { 1035 pci_nvme_aen_process(sc); 1036 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1037 } 1038 pthread_mutex_unlock(&sc->aen_mtx); 1039 1040 pthread_exit(NULL); 1041 return (NULL); 1042 } 1043 1044 static void 1045 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1046 { 1047 uint32_t i; 1048 1049 DPRINTF("%s", __func__); 1050 1051 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1052 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1053 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1054 1055 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1056 1057 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1058 1059 sc->regs.cc = 0; 1060 1061 assert(sc->submit_queues != NULL); 1062 1063 for (i = 0; i < sc->num_squeues + 1; i++) { 1064 sc->submit_queues[i].qbase = NULL; 1065 sc->submit_queues[i].size = 0; 1066 sc->submit_queues[i].cqid = 0; 1067 sc->submit_queues[i].tail = 0; 1068 sc->submit_queues[i].head = 0; 1069 } 1070 1071 assert(sc->compl_queues != NULL); 1072 1073 for (i = 0; i < sc->num_cqueues + 1; i++) { 1074 sc->compl_queues[i].qbase = NULL; 1075 sc->compl_queues[i].size = 0; 1076 sc->compl_queues[i].tail = 0; 1077 sc->compl_queues[i].head = 0; 1078 } 1079 1080 sc->num_q_is_set = false; 1081 1082 pci_nvme_aer_destroy(sc); 1083 pci_nvme_aen_destroy(sc); 1084 1085 /* 1086 * Clear CSTS.RDY last to prevent the host from enabling Controller 1087 * before cleanup completes 1088 */ 1089 sc->regs.csts = 0; 1090 } 1091 1092 static void 1093 pci_nvme_reset(struct pci_nvme_softc *sc) 1094 { 1095 pthread_mutex_lock(&sc->mtx); 1096 pci_nvme_reset_locked(sc); 1097 pthread_mutex_unlock(&sc->mtx); 1098 } 1099 1100 static int 1101 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1102 { 1103 uint16_t acqs, asqs; 1104 1105 DPRINTF("%s", __func__); 1106 1107 /* 1108 * NVMe 2.0 states that "enabling a controller while this field is 1109 * cleared to 0h produces undefined results" for both ACQS and 1110 * ASQS. If zero, set CFS and do not become ready. 1111 */ 1112 asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); 1113 if (asqs < 2) { 1114 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, 1115 asqs - 1, sc->regs.aqa); 1116 sc->regs.csts |= NVME_CSTS_CFS; 1117 return (-1); 1118 } 1119 sc->submit_queues[0].size = asqs; 1120 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1121 sizeof(struct nvme_command) * asqs); 1122 if (sc->submit_queues[0].qbase == NULL) { 1123 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, 1124 sc->regs.asq); 1125 sc->regs.csts |= NVME_CSTS_CFS; 1126 return (-1); 1127 } 1128 1129 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1130 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1131 1132 acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1133 NVME_AQA_REG_ACQS_MASK); 1134 if (acqs < 2) { 1135 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, 1136 acqs - 1, sc->regs.aqa); 1137 sc->regs.csts |= NVME_CSTS_CFS; 1138 return (-1); 1139 } 1140 sc->compl_queues[0].size = acqs; 1141 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1142 sizeof(struct nvme_completion) * acqs); 1143 if (sc->compl_queues[0].qbase == NULL) { 1144 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, 1145 sc->regs.acq); 1146 sc->regs.csts |= NVME_CSTS_CFS; 1147 return (-1); 1148 } 1149 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1150 1151 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1152 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1153 1154 return (0); 1155 } 1156 1157 static int 1158 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1159 size_t len, enum nvme_copy_dir dir) 1160 { 1161 uint8_t *p; 1162 size_t bytes; 1163 1164 if (len > (8 * 1024)) { 1165 return (-1); 1166 } 1167 1168 /* Copy from the start of prp1 to the end of the physical page */ 1169 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1170 bytes = MIN(bytes, len); 1171 1172 p = vm_map_gpa(ctx, prp1, bytes); 1173 if (p == NULL) { 1174 return (-1); 1175 } 1176 1177 if (dir == NVME_COPY_TO_PRP) 1178 memcpy(p, b, bytes); 1179 else 1180 memcpy(b, p, bytes); 1181 1182 b += bytes; 1183 1184 len -= bytes; 1185 if (len == 0) { 1186 return (0); 1187 } 1188 1189 len = MIN(len, PAGE_SIZE); 1190 1191 p = vm_map_gpa(ctx, prp2, len); 1192 if (p == NULL) { 1193 return (-1); 1194 } 1195 1196 if (dir == NVME_COPY_TO_PRP) 1197 memcpy(p, b, len); 1198 else 1199 memcpy(b, p, len); 1200 1201 return (0); 1202 } 1203 1204 /* 1205 * Write a Completion Queue Entry update 1206 * 1207 * Write the completion and update the doorbell value 1208 */ 1209 static void 1210 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1211 struct nvme_completion_queue *cq, 1212 uint32_t cdw0, 1213 uint16_t cid, 1214 uint16_t sqid, 1215 uint16_t status) 1216 { 1217 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1218 struct nvme_completion *cqe; 1219 1220 assert(cq->qbase != NULL); 1221 1222 pthread_mutex_lock(&cq->mtx); 1223 1224 cqe = &cq->qbase[cq->tail]; 1225 1226 /* Flip the phase bit */ 1227 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1228 1229 cqe->cdw0 = cdw0; 1230 cqe->sqhd = sq->head; 1231 cqe->sqid = sqid; 1232 cqe->cid = cid; 1233 cqe->status = status; 1234 1235 cq->tail++; 1236 if (cq->tail >= cq->size) { 1237 cq->tail = 0; 1238 } 1239 1240 pthread_mutex_unlock(&cq->mtx); 1241 } 1242 1243 static int 1244 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1245 struct nvme_completion* compl) 1246 { 1247 uint16_t qid = command->cdw10 & 0xffff; 1248 1249 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1250 if (qid == 0 || qid > sc->num_squeues || 1251 (sc->submit_queues[qid].qbase == NULL)) { 1252 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1253 __func__, qid, sc->num_squeues); 1254 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1255 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1256 return (1); 1257 } 1258 1259 sc->submit_queues[qid].qbase = NULL; 1260 sc->submit_queues[qid].cqid = 0; 1261 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1262 return (1); 1263 } 1264 1265 static int 1266 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1267 struct nvme_completion* compl) 1268 { 1269 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1270 uint16_t qid = command->cdw10 & 0xffff; 1271 struct nvme_submission_queue *nsq; 1272 1273 if ((qid == 0) || (qid > sc->num_squeues) || 1274 (sc->submit_queues[qid].qbase != NULL)) { 1275 WPRINTF("%s queue index %u > num_squeues %u", 1276 __func__, qid, sc->num_squeues); 1277 pci_nvme_status_tc(&compl->status, 1278 NVME_SCT_COMMAND_SPECIFIC, 1279 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1280 return (1); 1281 } 1282 1283 nsq = &sc->submit_queues[qid]; 1284 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1285 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1286 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1287 /* 1288 * Queues must specify at least two entries 1289 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1290 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1291 */ 1292 pci_nvme_status_tc(&compl->status, 1293 NVME_SCT_COMMAND_SPECIFIC, 1294 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1295 return (1); 1296 } 1297 nsq->head = nsq->tail = 0; 1298 1299 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1300 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1301 pci_nvme_status_tc(&compl->status, 1302 NVME_SCT_COMMAND_SPECIFIC, 1303 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1304 return (1); 1305 } 1306 1307 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1308 pci_nvme_status_tc(&compl->status, 1309 NVME_SCT_COMMAND_SPECIFIC, 1310 NVME_SC_COMPLETION_QUEUE_INVALID); 1311 return (1); 1312 } 1313 1314 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1315 1316 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1317 sizeof(struct nvme_command) * (size_t)nsq->size); 1318 1319 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1320 qid, nsq->size, nsq->qbase, nsq->cqid); 1321 1322 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1323 1324 DPRINTF("%s completed creating IOSQ qid %u", 1325 __func__, qid); 1326 } else { 1327 /* 1328 * Guest sent non-cont submission queue request. 1329 * This setting is unsupported by this emulation. 1330 */ 1331 WPRINTF("%s unsupported non-contig (list-based) " 1332 "create i/o submission queue", __func__); 1333 1334 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1335 } 1336 return (1); 1337 } 1338 1339 static int 1340 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1341 struct nvme_completion* compl) 1342 { 1343 uint16_t qid = command->cdw10 & 0xffff; 1344 uint16_t sqid; 1345 1346 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1347 if (qid == 0 || qid > sc->num_cqueues || 1348 (sc->compl_queues[qid].qbase == NULL)) { 1349 WPRINTF("%s queue index %u / num_cqueues %u", 1350 __func__, qid, sc->num_cqueues); 1351 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1352 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1353 return (1); 1354 } 1355 1356 /* Deleting an Active CQ is an error */ 1357 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1358 if (sc->submit_queues[sqid].cqid == qid) { 1359 pci_nvme_status_tc(&compl->status, 1360 NVME_SCT_COMMAND_SPECIFIC, 1361 NVME_SC_INVALID_QUEUE_DELETION); 1362 return (1); 1363 } 1364 1365 sc->compl_queues[qid].qbase = NULL; 1366 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1367 return (1); 1368 } 1369 1370 static int 1371 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1372 struct nvme_completion* compl) 1373 { 1374 struct nvme_completion_queue *ncq; 1375 uint16_t qid = command->cdw10 & 0xffff; 1376 1377 /* Only support Physically Contiguous queues */ 1378 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1379 WPRINTF("%s unsupported non-contig (list-based) " 1380 "create i/o completion queue", 1381 __func__); 1382 1383 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1384 return (1); 1385 } 1386 1387 if ((qid == 0) || (qid > sc->num_cqueues) || 1388 (sc->compl_queues[qid].qbase != NULL)) { 1389 WPRINTF("%s queue index %u > num_cqueues %u", 1390 __func__, qid, sc->num_cqueues); 1391 pci_nvme_status_tc(&compl->status, 1392 NVME_SCT_COMMAND_SPECIFIC, 1393 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1394 return (1); 1395 } 1396 1397 ncq = &sc->compl_queues[qid]; 1398 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1399 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1400 if (ncq->intr_vec > (sc->max_queues + 1)) { 1401 pci_nvme_status_tc(&compl->status, 1402 NVME_SCT_COMMAND_SPECIFIC, 1403 NVME_SC_INVALID_INTERRUPT_VECTOR); 1404 return (1); 1405 } 1406 1407 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1408 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1409 /* 1410 * Queues must specify at least two entries 1411 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1412 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1413 */ 1414 pci_nvme_status_tc(&compl->status, 1415 NVME_SCT_COMMAND_SPECIFIC, 1416 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1417 return (1); 1418 } 1419 ncq->head = ncq->tail = 0; 1420 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1421 command->prp1, 1422 sizeof(struct nvme_command) * (size_t)ncq->size); 1423 1424 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1425 1426 1427 return (1); 1428 } 1429 1430 static int 1431 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1432 struct nvme_completion* compl) 1433 { 1434 uint64_t logoff; 1435 uint32_t logsize; 1436 uint8_t logpage; 1437 1438 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1439 1440 /* 1441 * Command specifies the number of dwords to return in fields NUMDU 1442 * and NUMDL. This is a zero-based value. 1443 */ 1444 logpage = command->cdw10 & 0xFF; 1445 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1446 logsize *= sizeof(uint32_t); 1447 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; 1448 1449 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1450 1451 switch (logpage) { 1452 case NVME_LOG_ERROR: 1453 if (logoff >= sizeof(sc->err_log)) { 1454 pci_nvme_status_genc(&compl->status, 1455 NVME_SC_INVALID_FIELD); 1456 break; 1457 } 1458 1459 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1460 command->prp2, (uint8_t *)&sc->err_log + logoff, 1461 MIN(logsize - logoff, sizeof(sc->err_log)), 1462 NVME_COPY_TO_PRP); 1463 break; 1464 case NVME_LOG_HEALTH_INFORMATION: 1465 if (logoff >= sizeof(sc->health_log)) { 1466 pci_nvme_status_genc(&compl->status, 1467 NVME_SC_INVALID_FIELD); 1468 break; 1469 } 1470 1471 pthread_mutex_lock(&sc->mtx); 1472 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1473 sizeof(sc->health_log.data_units_read)); 1474 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1475 sizeof(sc->health_log.data_units_written)); 1476 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1477 sizeof(sc->health_log.host_read_commands)); 1478 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1479 sizeof(sc->health_log.host_write_commands)); 1480 pthread_mutex_unlock(&sc->mtx); 1481 1482 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1483 command->prp2, (uint8_t *)&sc->health_log + logoff, 1484 MIN(logsize - logoff, sizeof(sc->health_log)), 1485 NVME_COPY_TO_PRP); 1486 break; 1487 case NVME_LOG_FIRMWARE_SLOT: 1488 if (logoff >= sizeof(sc->fw_log)) { 1489 pci_nvme_status_genc(&compl->status, 1490 NVME_SC_INVALID_FIELD); 1491 break; 1492 } 1493 1494 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1495 command->prp2, (uint8_t *)&sc->fw_log + logoff, 1496 MIN(logsize - logoff, sizeof(sc->fw_log)), 1497 NVME_COPY_TO_PRP); 1498 break; 1499 case NVME_LOG_CHANGED_NAMESPACE: 1500 if (logoff >= sizeof(sc->ns_log)) { 1501 pci_nvme_status_genc(&compl->status, 1502 NVME_SC_INVALID_FIELD); 1503 break; 1504 } 1505 1506 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1507 command->prp2, (uint8_t *)&sc->ns_log + logoff, 1508 MIN(logsize - logoff, sizeof(sc->ns_log)), 1509 NVME_COPY_TO_PRP); 1510 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1511 break; 1512 default: 1513 DPRINTF("%s get log page %x command not supported", 1514 __func__, logpage); 1515 1516 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1517 NVME_SC_INVALID_LOG_PAGE); 1518 } 1519 1520 return (1); 1521 } 1522 1523 static int 1524 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1525 struct nvme_completion* compl) 1526 { 1527 void *dest; 1528 uint16_t status; 1529 1530 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1531 command->cdw10 & 0xFF, command->nsid); 1532 1533 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1534 1535 switch (command->cdw10 & 0xFF) { 1536 case 0x00: /* return Identify Namespace data structure */ 1537 /* Global NS only valid with NS Management */ 1538 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1539 pci_nvme_status_genc(&status, 1540 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1541 break; 1542 } 1543 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1544 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1545 NVME_COPY_TO_PRP); 1546 break; 1547 case 0x01: /* return Identify Controller data structure */ 1548 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1549 command->prp2, (uint8_t *)&sc->ctrldata, 1550 sizeof(sc->ctrldata), 1551 NVME_COPY_TO_PRP); 1552 break; 1553 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1554 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1555 sizeof(uint32_t) * 1024); 1556 /* All unused entries shall be zero */ 1557 memset(dest, 0, sizeof(uint32_t) * 1024); 1558 ((uint32_t *)dest)[0] = 1; 1559 break; 1560 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1561 if (command->nsid != 1) { 1562 pci_nvme_status_genc(&status, 1563 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1564 break; 1565 } 1566 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1567 sizeof(uint32_t) * 1024); 1568 /* All bytes after the descriptor shall be zero */ 1569 memset(dest, 0, sizeof(uint32_t) * 1024); 1570 1571 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1572 ((uint8_t *)dest)[0] = 1; 1573 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1574 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); 1575 break; 1576 case 0x13: 1577 /* 1578 * Controller list is optional but used by UNH tests. Return 1579 * a valid but empty list. 1580 */ 1581 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1582 sizeof(uint16_t) * 2048); 1583 memset(dest, 0, sizeof(uint16_t) * 2048); 1584 break; 1585 default: 1586 DPRINTF("%s unsupported identify command requested 0x%x", 1587 __func__, command->cdw10 & 0xFF); 1588 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1589 break; 1590 } 1591 1592 compl->status = status; 1593 return (1); 1594 } 1595 1596 static const char * 1597 nvme_fid_to_name(uint8_t fid) 1598 { 1599 const char *name; 1600 1601 switch (fid) { 1602 case NVME_FEAT_ARBITRATION: 1603 name = "Arbitration"; 1604 break; 1605 case NVME_FEAT_POWER_MANAGEMENT: 1606 name = "Power Management"; 1607 break; 1608 case NVME_FEAT_LBA_RANGE_TYPE: 1609 name = "LBA Range Type"; 1610 break; 1611 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1612 name = "Temperature Threshold"; 1613 break; 1614 case NVME_FEAT_ERROR_RECOVERY: 1615 name = "Error Recovery"; 1616 break; 1617 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1618 name = "Volatile Write Cache"; 1619 break; 1620 case NVME_FEAT_NUMBER_OF_QUEUES: 1621 name = "Number of Queues"; 1622 break; 1623 case NVME_FEAT_INTERRUPT_COALESCING: 1624 name = "Interrupt Coalescing"; 1625 break; 1626 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1627 name = "Interrupt Vector Configuration"; 1628 break; 1629 case NVME_FEAT_WRITE_ATOMICITY: 1630 name = "Write Atomicity Normal"; 1631 break; 1632 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1633 name = "Asynchronous Event Configuration"; 1634 break; 1635 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1636 name = "Autonomous Power State Transition"; 1637 break; 1638 case NVME_FEAT_HOST_MEMORY_BUFFER: 1639 name = "Host Memory Buffer"; 1640 break; 1641 case NVME_FEAT_TIMESTAMP: 1642 name = "Timestamp"; 1643 break; 1644 case NVME_FEAT_KEEP_ALIVE_TIMER: 1645 name = "Keep Alive Timer"; 1646 break; 1647 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1648 name = "Host Controlled Thermal Management"; 1649 break; 1650 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1651 name = "Non-Operation Power State Config"; 1652 break; 1653 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1654 name = "Read Recovery Level Config"; 1655 break; 1656 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1657 name = "Predictable Latency Mode Config"; 1658 break; 1659 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1660 name = "Predictable Latency Mode Window"; 1661 break; 1662 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1663 name = "LBA Status Information Report Interval"; 1664 break; 1665 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1666 name = "Host Behavior Support"; 1667 break; 1668 case NVME_FEAT_SANITIZE_CONFIG: 1669 name = "Sanitize Config"; 1670 break; 1671 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1672 name = "Endurance Group Event Configuration"; 1673 break; 1674 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1675 name = "Software Progress Marker"; 1676 break; 1677 case NVME_FEAT_HOST_IDENTIFIER: 1678 name = "Host Identifier"; 1679 break; 1680 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1681 name = "Reservation Notification Mask"; 1682 break; 1683 case NVME_FEAT_RESERVATION_PERSISTENCE: 1684 name = "Reservation Persistence"; 1685 break; 1686 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1687 name = "Namespace Write Protection Config"; 1688 break; 1689 default: 1690 name = "Unknown"; 1691 break; 1692 } 1693 1694 return (name); 1695 } 1696 1697 static void 1698 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1699 struct nvme_feature_obj *feat __unused, 1700 struct nvme_command *command __unused, 1701 struct nvme_completion *compl) 1702 { 1703 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1704 } 1705 1706 static void 1707 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1708 struct nvme_feature_obj *feat __unused, 1709 struct nvme_command *command, 1710 struct nvme_completion *compl) 1711 { 1712 uint32_t i; 1713 uint32_t cdw11 = command->cdw11; 1714 uint16_t iv; 1715 bool cd; 1716 1717 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1718 1719 iv = cdw11 & 0xffff; 1720 cd = cdw11 & (1 << 16); 1721 1722 if (iv > (sc->max_queues + 1)) { 1723 return; 1724 } 1725 1726 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1727 if ((iv == 0) && !cd) 1728 return; 1729 1730 /* Requested Interrupt Vector must be used by a CQ */ 1731 for (i = 0; i < sc->num_cqueues + 1; i++) { 1732 if (sc->compl_queues[i].intr_vec == iv) { 1733 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1734 } 1735 } 1736 } 1737 1738 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1739 static void 1740 nvme_feature_async_event(struct pci_nvme_softc *sc __unused, 1741 struct nvme_feature_obj *feat __unused, 1742 struct nvme_command *command, 1743 struct nvme_completion *compl) 1744 { 1745 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1746 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1747 } 1748 1749 #define NVME_TEMP_THRESH_OVER 0 1750 #define NVME_TEMP_THRESH_UNDER 1 1751 static void 1752 nvme_feature_temperature(struct pci_nvme_softc *sc, 1753 struct nvme_feature_obj *feat __unused, 1754 struct nvme_command *command, 1755 struct nvme_completion *compl) 1756 { 1757 uint16_t tmpth; /* Temperature Threshold */ 1758 uint8_t tmpsel; /* Threshold Temperature Select */ 1759 uint8_t thsel; /* Threshold Type Select */ 1760 bool set_crit = false; 1761 1762 tmpth = command->cdw11 & 0xffff; 1763 tmpsel = (command->cdw11 >> 16) & 0xf; 1764 thsel = (command->cdw11 >> 20) & 0x3; 1765 1766 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1767 1768 /* Check for unsupported values */ 1769 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1770 (thsel > NVME_TEMP_THRESH_UNDER)) { 1771 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1772 return; 1773 } 1774 1775 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1776 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1777 set_crit = true; 1778 1779 pthread_mutex_lock(&sc->mtx); 1780 if (set_crit) 1781 sc->health_log.critical_warning |= 1782 NVME_CRIT_WARN_ST_TEMPERATURE; 1783 else 1784 sc->health_log.critical_warning &= 1785 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1786 pthread_mutex_unlock(&sc->mtx); 1787 1788 if (set_crit) 1789 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1790 sc->health_log.critical_warning); 1791 1792 1793 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status); 1794 } 1795 1796 static void 1797 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1798 struct nvme_feature_obj *feat __unused, 1799 struct nvme_command *command, 1800 struct nvme_completion *compl) 1801 { 1802 uint16_t nqr; /* Number of Queues Requested */ 1803 1804 if (sc->num_q_is_set) { 1805 WPRINTF("%s: Number of Queues already set", __func__); 1806 pci_nvme_status_genc(&compl->status, 1807 NVME_SC_COMMAND_SEQUENCE_ERROR); 1808 return; 1809 } 1810 1811 nqr = command->cdw11 & 0xFFFF; 1812 if (nqr == 0xffff) { 1813 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1814 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1815 return; 1816 } 1817 1818 sc->num_squeues = ONE_BASED(nqr); 1819 if (sc->num_squeues > sc->max_queues) { 1820 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1821 sc->max_queues); 1822 sc->num_squeues = sc->max_queues; 1823 } 1824 1825 nqr = (command->cdw11 >> 16) & 0xFFFF; 1826 if (nqr == 0xffff) { 1827 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1828 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1829 return; 1830 } 1831 1832 sc->num_cqueues = ONE_BASED(nqr); 1833 if (sc->num_cqueues > sc->max_queues) { 1834 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1835 sc->max_queues); 1836 sc->num_cqueues = sc->max_queues; 1837 } 1838 1839 /* Patch the command value which will be saved on callback's return */ 1840 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1841 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1842 1843 sc->num_q_is_set = true; 1844 } 1845 1846 static int 1847 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1848 struct nvme_completion *compl) 1849 { 1850 struct nvme_feature_obj *feat; 1851 uint32_t nsid = command->nsid; 1852 uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); 1853 bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); 1854 1855 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1856 1857 if (fid >= NVME_FID_MAX) { 1858 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1859 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1860 return (1); 1861 } 1862 1863 if (sv) { 1864 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1865 NVME_SC_FEATURE_NOT_SAVEABLE); 1866 return (1); 1867 } 1868 1869 feat = &sc->feat[fid]; 1870 1871 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1872 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1873 return (1); 1874 } 1875 1876 if (!feat->namespace_specific && 1877 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1878 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1879 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1880 return (1); 1881 } 1882 1883 compl->cdw0 = 0; 1884 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1885 1886 if (feat->set) 1887 feat->set(sc, feat, command, compl); 1888 else { 1889 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1890 NVME_SC_FEATURE_NOT_CHANGEABLE); 1891 return (1); 1892 } 1893 1894 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1895 if (compl->status == NVME_SC_SUCCESS) { 1896 feat->cdw11 = command->cdw11; 1897 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1898 (command->cdw11 != 0)) 1899 pci_nvme_aen_notify(sc); 1900 } 1901 1902 return (0); 1903 } 1904 1905 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1906 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1907 1908 static int 1909 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1910 struct nvme_completion* compl) 1911 { 1912 struct nvme_feature_obj *feat; 1913 uint8_t fid = command->cdw10 & 0xFF; 1914 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1915 1916 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1917 1918 if (fid >= NVME_FID_MAX) { 1919 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1920 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1921 return (1); 1922 } 1923 1924 compl->cdw0 = 0; 1925 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1926 1927 feat = &sc->feat[fid]; 1928 if (feat->get) { 1929 feat->get(sc, feat, command, compl); 1930 } 1931 1932 if (compl->status == NVME_SC_SUCCESS) { 1933 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1934 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1935 else 1936 compl->cdw0 = feat->cdw11; 1937 } 1938 1939 return (0); 1940 } 1941 1942 static int 1943 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1944 struct nvme_completion* compl) 1945 { 1946 uint8_t ses, lbaf, pi; 1947 1948 /* Only supports Secure Erase Setting - User Data Erase */ 1949 ses = (command->cdw10 >> 9) & 0x7; 1950 if (ses > 0x1) { 1951 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1952 return (1); 1953 } 1954 1955 /* Only supports a single LBA Format */ 1956 lbaf = command->cdw10 & 0xf; 1957 if (lbaf != 0) { 1958 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1959 NVME_SC_INVALID_FORMAT); 1960 return (1); 1961 } 1962 1963 /* Doesn't support Protection Infomation */ 1964 pi = (command->cdw10 >> 5) & 0x7; 1965 if (pi != 0) { 1966 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1967 return (1); 1968 } 1969 1970 if (sc->nvstore.type == NVME_STOR_RAM) { 1971 if (sc->nvstore.ctx) 1972 free(sc->nvstore.ctx); 1973 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1974 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1975 } else { 1976 struct pci_nvme_ioreq *req; 1977 int err; 1978 1979 req = pci_nvme_get_ioreq(sc); 1980 if (req == NULL) { 1981 pci_nvme_status_genc(&compl->status, 1982 NVME_SC_INTERNAL_DEVICE_ERROR); 1983 WPRINTF("%s: unable to allocate IO req", __func__); 1984 return (1); 1985 } 1986 req->nvme_sq = &sc->submit_queues[0]; 1987 req->sqid = 0; 1988 req->opc = command->opc; 1989 req->cid = command->cid; 1990 req->nsid = command->nsid; 1991 1992 req->io_req.br_offset = 0; 1993 req->io_req.br_resid = sc->nvstore.size; 1994 req->io_req.br_callback = pci_nvme_io_done; 1995 1996 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1997 if (err) { 1998 pci_nvme_status_genc(&compl->status, 1999 NVME_SC_INTERNAL_DEVICE_ERROR); 2000 pci_nvme_release_ioreq(sc, req); 2001 } else 2002 compl->status = NVME_NO_STATUS; 2003 } 2004 2005 return (1); 2006 } 2007 2008 static int 2009 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 2010 struct nvme_completion *compl) 2011 { 2012 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 2013 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 2014 2015 /* TODO: search for the command ID and abort it */ 2016 2017 compl->cdw0 = 1; 2018 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 2019 return (1); 2020 } 2021 2022 static int 2023 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 2024 struct nvme_command* command, struct nvme_completion* compl) 2025 { 2026 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 2027 sc->aer_count, sc->ctrldata.aerl, command->cid); 2028 2029 /* Don't exceed the Async Event Request Limit (AERL). */ 2030 if (pci_nvme_aer_limit_reached(sc)) { 2031 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2032 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2033 return (1); 2034 } 2035 2036 if (pci_nvme_aer_add(sc, command->cid)) { 2037 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2038 NVME_SC_INTERNAL_DEVICE_ERROR); 2039 return (1); 2040 } 2041 2042 /* 2043 * Raise events when they happen based on the Set Features cmd. 2044 * These events happen async, so only set completion successful if 2045 * there is an event reflective of the request to get event. 2046 */ 2047 compl->status = NVME_NO_STATUS; 2048 pci_nvme_aen_notify(sc); 2049 2050 return (0); 2051 } 2052 2053 static void 2054 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2055 { 2056 struct nvme_completion compl; 2057 struct nvme_command *cmd; 2058 struct nvme_submission_queue *sq; 2059 struct nvme_completion_queue *cq; 2060 uint16_t sqhead; 2061 2062 DPRINTF("%s index %u", __func__, (uint32_t)value); 2063 2064 sq = &sc->submit_queues[0]; 2065 cq = &sc->compl_queues[0]; 2066 2067 pthread_mutex_lock(&sq->mtx); 2068 2069 sqhead = sq->head; 2070 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2071 2072 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2073 cmd = &(sq->qbase)[sqhead]; 2074 compl.cdw0 = 0; 2075 compl.status = 0; 2076 2077 switch (cmd->opc) { 2078 case NVME_OPC_DELETE_IO_SQ: 2079 DPRINTF("%s command DELETE_IO_SQ", __func__); 2080 nvme_opc_delete_io_sq(sc, cmd, &compl); 2081 break; 2082 case NVME_OPC_CREATE_IO_SQ: 2083 DPRINTF("%s command CREATE_IO_SQ", __func__); 2084 nvme_opc_create_io_sq(sc, cmd, &compl); 2085 break; 2086 case NVME_OPC_DELETE_IO_CQ: 2087 DPRINTF("%s command DELETE_IO_CQ", __func__); 2088 nvme_opc_delete_io_cq(sc, cmd, &compl); 2089 break; 2090 case NVME_OPC_CREATE_IO_CQ: 2091 DPRINTF("%s command CREATE_IO_CQ", __func__); 2092 nvme_opc_create_io_cq(sc, cmd, &compl); 2093 break; 2094 case NVME_OPC_GET_LOG_PAGE: 2095 DPRINTF("%s command GET_LOG_PAGE", __func__); 2096 nvme_opc_get_log_page(sc, cmd, &compl); 2097 break; 2098 case NVME_OPC_IDENTIFY: 2099 DPRINTF("%s command IDENTIFY", __func__); 2100 nvme_opc_identify(sc, cmd, &compl); 2101 break; 2102 case NVME_OPC_ABORT: 2103 DPRINTF("%s command ABORT", __func__); 2104 nvme_opc_abort(sc, cmd, &compl); 2105 break; 2106 case NVME_OPC_SET_FEATURES: 2107 DPRINTF("%s command SET_FEATURES", __func__); 2108 nvme_opc_set_features(sc, cmd, &compl); 2109 break; 2110 case NVME_OPC_GET_FEATURES: 2111 DPRINTF("%s command GET_FEATURES", __func__); 2112 nvme_opc_get_features(sc, cmd, &compl); 2113 break; 2114 case NVME_OPC_FIRMWARE_ACTIVATE: 2115 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2116 pci_nvme_status_tc(&compl.status, 2117 NVME_SCT_COMMAND_SPECIFIC, 2118 NVME_SC_INVALID_FIRMWARE_SLOT); 2119 break; 2120 case NVME_OPC_ASYNC_EVENT_REQUEST: 2121 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2122 nvme_opc_async_event_req(sc, cmd, &compl); 2123 break; 2124 case NVME_OPC_FORMAT_NVM: 2125 DPRINTF("%s command FORMAT_NVM", __func__); 2126 if ((sc->ctrldata.oacs & 2127 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2128 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2129 break; 2130 } 2131 nvme_opc_format_nvm(sc, cmd, &compl); 2132 break; 2133 case NVME_OPC_SECURITY_SEND: 2134 case NVME_OPC_SECURITY_RECEIVE: 2135 case NVME_OPC_SANITIZE: 2136 case NVME_OPC_GET_LBA_STATUS: 2137 DPRINTF("%s command OPC=%#x (unsupported)", __func__, 2138 cmd->opc); 2139 /* Valid but unsupported opcodes */ 2140 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2141 break; 2142 default: 2143 DPRINTF("%s command OPC=%#X (not implemented)", 2144 __func__, 2145 cmd->opc); 2146 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2147 } 2148 sqhead = (sqhead + 1) % sq->size; 2149 2150 if (NVME_COMPLETION_VALID(compl)) { 2151 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2152 compl.cdw0, 2153 cmd->cid, 2154 0, /* SQID */ 2155 compl.status); 2156 } 2157 } 2158 2159 DPRINTF("setting sqhead %u", sqhead); 2160 sq->head = sqhead; 2161 2162 if (cq->head != cq->tail) 2163 pci_generate_msix(sc->nsc_pi, 0); 2164 2165 pthread_mutex_unlock(&sq->mtx); 2166 } 2167 2168 /* 2169 * Update the Write and Read statistics reported in SMART data 2170 * 2171 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2172 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2173 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 2174 */ 2175 static void 2176 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2177 size_t bytes, uint16_t status) 2178 { 2179 2180 pthread_mutex_lock(&sc->mtx); 2181 switch (opc) { 2182 case NVME_OPC_WRITE: 2183 sc->write_commands++; 2184 if (status != NVME_SC_SUCCESS) 2185 break; 2186 sc->write_dunits_remainder += (bytes / 512); 2187 while (sc->write_dunits_remainder >= 1000) { 2188 sc->write_data_units++; 2189 sc->write_dunits_remainder -= 1000; 2190 } 2191 break; 2192 case NVME_OPC_READ: 2193 sc->read_commands++; 2194 if (status != NVME_SC_SUCCESS) 2195 break; 2196 sc->read_dunits_remainder += (bytes / 512); 2197 while (sc->read_dunits_remainder >= 1000) { 2198 sc->read_data_units++; 2199 sc->read_dunits_remainder -= 1000; 2200 } 2201 break; 2202 default: 2203 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2204 break; 2205 } 2206 pthread_mutex_unlock(&sc->mtx); 2207 } 2208 2209 /* 2210 * Check if the combination of Starting LBA (slba) and number of blocks 2211 * exceeds the range of the underlying storage. 2212 * 2213 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2214 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2215 * overflow. 2216 */ 2217 static bool 2218 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2219 uint32_t nblocks) 2220 { 2221 size_t offset, bytes; 2222 2223 /* Overflow check of multiplying Starting LBA by the sector size */ 2224 if (slba >> (64 - nvstore->sectsz_bits)) 2225 return (true); 2226 2227 offset = slba << nvstore->sectsz_bits; 2228 bytes = nblocks << nvstore->sectsz_bits; 2229 2230 /* Overflow check of Number of Logical Blocks */ 2231 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2232 return (true); 2233 2234 return (false); 2235 } 2236 2237 static int 2238 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2239 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2240 { 2241 int iovidx; 2242 bool range_is_contiguous; 2243 2244 if (req == NULL) 2245 return (-1); 2246 2247 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2248 return (-1); 2249 } 2250 2251 /* 2252 * Minimize the number of IOVs by concatenating contiguous address 2253 * ranges. If the IOV count is zero, there is no previous range to 2254 * concatenate. 2255 */ 2256 if (req->io_req.br_iovcnt == 0) 2257 range_is_contiguous = false; 2258 else 2259 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2260 2261 if (range_is_contiguous) { 2262 iovidx = req->io_req.br_iovcnt - 1; 2263 2264 req->io_req.br_iov[iovidx].iov_base = 2265 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2266 req->prev_gpaddr, size); 2267 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2268 return (-1); 2269 2270 req->prev_size += size; 2271 req->io_req.br_resid += size; 2272 2273 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2274 } else { 2275 iovidx = req->io_req.br_iovcnt; 2276 if (iovidx == 0) { 2277 req->io_req.br_offset = offset; 2278 req->io_req.br_resid = 0; 2279 req->io_req.br_param = req; 2280 } 2281 2282 req->io_req.br_iov[iovidx].iov_base = 2283 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2284 gpaddr, size); 2285 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2286 return (-1); 2287 2288 req->io_req.br_iov[iovidx].iov_len = size; 2289 2290 req->prev_gpaddr = gpaddr; 2291 req->prev_size = size; 2292 req->io_req.br_resid += size; 2293 2294 req->io_req.br_iovcnt++; 2295 } 2296 2297 return (0); 2298 } 2299 2300 static void 2301 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2302 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2303 { 2304 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2305 2306 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2307 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2308 NVME_STATUS_GET_SC(status)); 2309 2310 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2311 2312 if (cq->head != cq->tail) { 2313 if (cq->intr_en & NVME_CQ_INTEN) { 2314 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2315 } else { 2316 DPRINTF("%s: CQ%u interrupt disabled", 2317 __func__, sq->cqid); 2318 } 2319 } 2320 } 2321 2322 static void 2323 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2324 { 2325 req->sc = NULL; 2326 req->nvme_sq = NULL; 2327 req->sqid = 0; 2328 2329 pthread_mutex_lock(&sc->mtx); 2330 2331 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2332 sc->pending_ios--; 2333 2334 /* when no more IO pending, can set to ready if device reset/enabled */ 2335 if (sc->pending_ios == 0 && 2336 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2337 sc->regs.csts |= NVME_CSTS_RDY; 2338 2339 pthread_mutex_unlock(&sc->mtx); 2340 2341 sem_post(&sc->iosemlock); 2342 } 2343 2344 static struct pci_nvme_ioreq * 2345 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2346 { 2347 struct pci_nvme_ioreq *req = NULL; 2348 2349 sem_wait(&sc->iosemlock); 2350 pthread_mutex_lock(&sc->mtx); 2351 2352 req = STAILQ_FIRST(&sc->ioreqs_free); 2353 assert(req != NULL); 2354 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2355 2356 req->sc = sc; 2357 2358 sc->pending_ios++; 2359 2360 pthread_mutex_unlock(&sc->mtx); 2361 2362 req->io_req.br_iovcnt = 0; 2363 req->io_req.br_offset = 0; 2364 req->io_req.br_resid = 0; 2365 req->io_req.br_param = req; 2366 req->prev_gpaddr = 0; 2367 req->prev_size = 0; 2368 2369 return req; 2370 } 2371 2372 static void 2373 pci_nvme_io_done(struct blockif_req *br, int err) 2374 { 2375 struct pci_nvme_ioreq *req = br->br_param; 2376 struct nvme_submission_queue *sq = req->nvme_sq; 2377 uint16_t code, status; 2378 2379 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2380 2381 /* TODO return correct error */ 2382 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2383 pci_nvme_status_genc(&status, code); 2384 2385 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2386 pci_nvme_stats_write_read_update(req->sc, req->opc, 2387 req->bytes, status); 2388 pci_nvme_release_ioreq(req->sc, req); 2389 } 2390 2391 /* 2392 * Implements the Flush command. The specification states: 2393 * If a volatile write cache is not present, Flush commands complete 2394 * successfully and have no effect 2395 * in the description of the Volatile Write Cache (VWC) field of the Identify 2396 * Controller data. Therefore, set status to Success if the command is 2397 * not supported (i.e. RAM or as indicated by the blockif). 2398 */ 2399 static bool 2400 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2401 struct nvme_command *cmd __unused, 2402 struct pci_nvme_blockstore *nvstore, 2403 struct pci_nvme_ioreq *req, 2404 uint16_t *status) 2405 { 2406 bool pending = false; 2407 2408 if (nvstore->type == NVME_STOR_RAM) { 2409 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2410 } else { 2411 int err; 2412 2413 req->io_req.br_callback = pci_nvme_io_done; 2414 2415 err = blockif_flush(nvstore->ctx, &req->io_req); 2416 switch (err) { 2417 case 0: 2418 pending = true; 2419 break; 2420 case EOPNOTSUPP: 2421 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2422 break; 2423 default: 2424 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2425 } 2426 } 2427 2428 return (pending); 2429 } 2430 2431 static uint16_t 2432 nvme_write_read_ram(struct pci_nvme_softc *sc, 2433 struct pci_nvme_blockstore *nvstore, 2434 uint64_t prp1, uint64_t prp2, 2435 size_t offset, uint64_t bytes, 2436 bool is_write) 2437 { 2438 uint8_t *buf = nvstore->ctx; 2439 enum nvme_copy_dir dir; 2440 uint16_t status; 2441 2442 if (is_write) 2443 dir = NVME_COPY_TO_PRP; 2444 else 2445 dir = NVME_COPY_FROM_PRP; 2446 2447 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2448 buf + offset, bytes, dir)) 2449 pci_nvme_status_genc(&status, 2450 NVME_SC_DATA_TRANSFER_ERROR); 2451 else 2452 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2453 2454 return (status); 2455 } 2456 2457 static uint16_t 2458 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2459 struct pci_nvme_blockstore *nvstore, 2460 struct pci_nvme_ioreq *req, 2461 uint64_t prp1, uint64_t prp2, 2462 size_t offset, uint64_t bytes, 2463 bool is_write) 2464 { 2465 uint64_t size; 2466 int err; 2467 uint16_t status = NVME_NO_STATUS; 2468 2469 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2470 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2471 err = -1; 2472 goto out; 2473 } 2474 2475 offset += size; 2476 bytes -= size; 2477 2478 if (bytes == 0) { 2479 ; 2480 } else if (bytes <= PAGE_SIZE) { 2481 size = bytes; 2482 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2483 err = -1; 2484 goto out; 2485 } 2486 } else { 2487 void *vmctx = sc->nsc_pi->pi_vmctx; 2488 uint64_t *prp_list = &prp2; 2489 uint64_t *last = prp_list; 2490 2491 /* PRP2 is pointer to a physical region page list */ 2492 while (bytes) { 2493 /* Last entry in list points to the next list */ 2494 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2495 uint64_t prp = *prp_list; 2496 2497 prp_list = paddr_guest2host(vmctx, prp, 2498 PAGE_SIZE - (prp % PAGE_SIZE)); 2499 if (prp_list == NULL) { 2500 err = -1; 2501 goto out; 2502 } 2503 last = prp_list + (NVME_PRP2_ITEMS - 1); 2504 } 2505 2506 size = MIN(bytes, PAGE_SIZE); 2507 2508 if (pci_nvme_append_iov_req(sc, req, *prp_list, size, 2509 offset)) { 2510 err = -1; 2511 goto out; 2512 } 2513 2514 offset += size; 2515 bytes -= size; 2516 2517 prp_list++; 2518 } 2519 } 2520 req->io_req.br_callback = pci_nvme_io_done; 2521 if (is_write) 2522 err = blockif_write(nvstore->ctx, &req->io_req); 2523 else 2524 err = blockif_read(nvstore->ctx, &req->io_req); 2525 out: 2526 if (err) 2527 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2528 2529 return (status); 2530 } 2531 2532 static bool 2533 nvme_opc_write_read(struct pci_nvme_softc *sc, 2534 struct nvme_command *cmd, 2535 struct pci_nvme_blockstore *nvstore, 2536 struct pci_nvme_ioreq *req, 2537 uint16_t *status) 2538 { 2539 uint64_t lba, nblocks, bytes; 2540 size_t offset; 2541 bool is_write = cmd->opc == NVME_OPC_WRITE; 2542 bool pending = false; 2543 2544 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2545 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2546 bytes = nblocks << nvstore->sectsz_bits; 2547 if (bytes > NVME_MAX_DATA_SIZE) { 2548 WPRINTF("%s command would exceed MDTS", __func__); 2549 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2550 goto out; 2551 } 2552 2553 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2554 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2555 __func__, lba, nblocks); 2556 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2557 goto out; 2558 } 2559 2560 offset = lba << nvstore->sectsz_bits; 2561 2562 req->bytes = bytes; 2563 req->io_req.br_offset = lba; 2564 2565 /* PRP bits 1:0 must be zero */ 2566 cmd->prp1 &= ~0x3UL; 2567 cmd->prp2 &= ~0x3UL; 2568 2569 if (nvstore->type == NVME_STOR_RAM) { 2570 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2571 cmd->prp2, offset, bytes, is_write); 2572 } else { 2573 *status = nvme_write_read_blockif(sc, nvstore, req, 2574 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2575 2576 if (*status == NVME_NO_STATUS) 2577 pending = true; 2578 } 2579 out: 2580 if (!pending) 2581 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2582 2583 return (pending); 2584 } 2585 2586 static void 2587 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2588 { 2589 struct pci_nvme_ioreq *req = br->br_param; 2590 struct pci_nvme_softc *sc = req->sc; 2591 bool done = true; 2592 uint16_t status; 2593 2594 if (err) { 2595 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2596 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2597 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2598 } else { 2599 struct iovec *iov = req->io_req.br_iov; 2600 2601 req->prev_gpaddr++; 2602 iov += req->prev_gpaddr; 2603 2604 /* The iov_* values already include the sector size */ 2605 req->io_req.br_offset = (off_t)iov->iov_base; 2606 req->io_req.br_resid = iov->iov_len; 2607 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2608 pci_nvme_status_genc(&status, 2609 NVME_SC_INTERNAL_DEVICE_ERROR); 2610 } else 2611 done = false; 2612 } 2613 2614 if (done) { 2615 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2616 status); 2617 pci_nvme_release_ioreq(sc, req); 2618 } 2619 } 2620 2621 static bool 2622 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2623 struct nvme_command *cmd, 2624 struct pci_nvme_blockstore *nvstore, 2625 struct pci_nvme_ioreq *req, 2626 uint16_t *status) 2627 { 2628 struct nvme_dsm_range *range = NULL; 2629 uint32_t nr, r, non_zero, dr; 2630 int err; 2631 bool pending = false; 2632 2633 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2634 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2635 goto out; 2636 } 2637 2638 nr = cmd->cdw10 & 0xff; 2639 2640 /* copy locally because a range entry could straddle PRPs */ 2641 range = calloc(1, NVME_MAX_DSM_TRIM); 2642 if (range == NULL) { 2643 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2644 goto out; 2645 } 2646 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2647 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2648 2649 /* Check for invalid ranges and the number of non-zero lengths */ 2650 non_zero = 0; 2651 for (r = 0; r <= nr; r++) { 2652 if (pci_nvme_out_of_range(nvstore, 2653 range[r].starting_lba, range[r].length)) { 2654 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2655 goto out; 2656 } 2657 if (range[r].length != 0) 2658 non_zero++; 2659 } 2660 2661 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2662 size_t offset, bytes; 2663 int sectsz_bits = sc->nvstore.sectsz_bits; 2664 2665 /* 2666 * DSM calls are advisory only, and compliant controllers 2667 * may choose to take no actions (i.e. return Success). 2668 */ 2669 if (!nvstore->deallocate) { 2670 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2671 goto out; 2672 } 2673 2674 /* If all ranges have a zero length, return Success */ 2675 if (non_zero == 0) { 2676 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2677 goto out; 2678 } 2679 2680 if (req == NULL) { 2681 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2682 goto out; 2683 } 2684 2685 offset = range[0].starting_lba << sectsz_bits; 2686 bytes = range[0].length << sectsz_bits; 2687 2688 /* 2689 * If the request is for more than a single range, store 2690 * the ranges in the br_iov. Optimize for the common case 2691 * of a single range. 2692 * 2693 * Note that NVMe Number of Ranges is a zero based value 2694 */ 2695 req->io_req.br_iovcnt = 0; 2696 req->io_req.br_offset = offset; 2697 req->io_req.br_resid = bytes; 2698 2699 if (nr == 0) { 2700 req->io_req.br_callback = pci_nvme_io_done; 2701 } else { 2702 struct iovec *iov = req->io_req.br_iov; 2703 2704 for (r = 0, dr = 0; r <= nr; r++) { 2705 offset = range[r].starting_lba << sectsz_bits; 2706 bytes = range[r].length << sectsz_bits; 2707 if (bytes == 0) 2708 continue; 2709 2710 if ((nvstore->size - offset) < bytes) { 2711 pci_nvme_status_genc(status, 2712 NVME_SC_LBA_OUT_OF_RANGE); 2713 goto out; 2714 } 2715 iov[dr].iov_base = (void *)offset; 2716 iov[dr].iov_len = bytes; 2717 dr++; 2718 } 2719 req->io_req.br_callback = pci_nvme_dealloc_sm; 2720 2721 /* 2722 * Use prev_gpaddr to track the current entry and 2723 * prev_size to track the number of entries 2724 */ 2725 req->prev_gpaddr = 0; 2726 req->prev_size = dr; 2727 } 2728 2729 err = blockif_delete(nvstore->ctx, &req->io_req); 2730 if (err) 2731 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2732 else 2733 pending = true; 2734 } 2735 out: 2736 free(range); 2737 return (pending); 2738 } 2739 2740 static void 2741 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2742 { 2743 struct nvme_submission_queue *sq; 2744 uint16_t status; 2745 uint16_t sqhead; 2746 2747 /* handle all submissions up to sq->tail index */ 2748 sq = &sc->submit_queues[idx]; 2749 2750 pthread_mutex_lock(&sq->mtx); 2751 2752 sqhead = sq->head; 2753 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2754 idx, sqhead, sq->tail, sq->qbase); 2755 2756 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2757 struct nvme_command *cmd; 2758 struct pci_nvme_ioreq *req; 2759 uint32_t nsid; 2760 bool pending; 2761 2762 pending = false; 2763 req = NULL; 2764 status = 0; 2765 2766 cmd = &sq->qbase[sqhead]; 2767 sqhead = (sqhead + 1) % sq->size; 2768 2769 nsid = le32toh(cmd->nsid); 2770 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2771 pci_nvme_status_genc(&status, 2772 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2773 status |= 2774 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2775 goto complete; 2776 } 2777 2778 req = pci_nvme_get_ioreq(sc); 2779 if (req == NULL) { 2780 pci_nvme_status_genc(&status, 2781 NVME_SC_INTERNAL_DEVICE_ERROR); 2782 WPRINTF("%s: unable to allocate IO req", __func__); 2783 goto complete; 2784 } 2785 req->nvme_sq = sq; 2786 req->sqid = idx; 2787 req->opc = cmd->opc; 2788 req->cid = cmd->cid; 2789 req->nsid = cmd->nsid; 2790 2791 switch (cmd->opc) { 2792 case NVME_OPC_FLUSH: 2793 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2794 req, &status); 2795 break; 2796 case NVME_OPC_WRITE: 2797 case NVME_OPC_READ: 2798 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2799 req, &status); 2800 break; 2801 case NVME_OPC_WRITE_ZEROES: 2802 /* TODO: write zeroes 2803 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2804 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2805 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2806 break; 2807 case NVME_OPC_DATASET_MANAGEMENT: 2808 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2809 req, &status); 2810 break; 2811 default: 2812 WPRINTF("%s unhandled io command 0x%x", 2813 __func__, cmd->opc); 2814 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2815 } 2816 complete: 2817 if (!pending) { 2818 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2819 if (req != NULL) 2820 pci_nvme_release_ioreq(sc, req); 2821 } 2822 } 2823 2824 sq->head = sqhead; 2825 2826 pthread_mutex_unlock(&sq->mtx); 2827 } 2828 2829 static void 2830 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc, 2831 uint64_t idx, int is_sq, uint64_t value) 2832 { 2833 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2834 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2835 2836 if (is_sq) { 2837 if (idx > sc->num_squeues) { 2838 WPRINTF("%s queue index %lu overflow from " 2839 "guest (max %u)", 2840 __func__, idx, sc->num_squeues); 2841 return; 2842 } 2843 2844 atomic_store_short(&sc->submit_queues[idx].tail, 2845 (uint16_t)value); 2846 2847 if (idx == 0) { 2848 pci_nvme_handle_admin_cmd(sc, value); 2849 } else { 2850 /* submission queue; handle new entries in SQ */ 2851 if (idx > sc->num_squeues) { 2852 WPRINTF("%s SQ index %lu overflow from " 2853 "guest (max %u)", 2854 __func__, idx, sc->num_squeues); 2855 return; 2856 } 2857 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2858 } 2859 } else { 2860 if (idx > sc->num_cqueues) { 2861 WPRINTF("%s queue index %lu overflow from " 2862 "guest (max %u)", 2863 __func__, idx, sc->num_cqueues); 2864 return; 2865 } 2866 2867 atomic_store_short(&sc->compl_queues[idx].head, 2868 (uint16_t)value); 2869 } 2870 } 2871 2872 static void 2873 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2874 { 2875 const char *s = iswrite ? "WRITE" : "READ"; 2876 2877 switch (offset) { 2878 case NVME_CR_CAP_LOW: 2879 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2880 break; 2881 case NVME_CR_CAP_HI: 2882 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2883 break; 2884 case NVME_CR_VS: 2885 DPRINTF("%s %s NVME_CR_VS", func, s); 2886 break; 2887 case NVME_CR_INTMS: 2888 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2889 break; 2890 case NVME_CR_INTMC: 2891 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2892 break; 2893 case NVME_CR_CC: 2894 DPRINTF("%s %s NVME_CR_CC", func, s); 2895 break; 2896 case NVME_CR_CSTS: 2897 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2898 break; 2899 case NVME_CR_NSSR: 2900 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2901 break; 2902 case NVME_CR_AQA: 2903 DPRINTF("%s %s NVME_CR_AQA", func, s); 2904 break; 2905 case NVME_CR_ASQ_LOW: 2906 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2907 break; 2908 case NVME_CR_ASQ_HI: 2909 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2910 break; 2911 case NVME_CR_ACQ_LOW: 2912 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2913 break; 2914 case NVME_CR_ACQ_HI: 2915 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2916 break; 2917 default: 2918 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2919 } 2920 2921 } 2922 2923 static void 2924 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2925 uint64_t offset, int size, uint64_t value) 2926 { 2927 uint32_t ccreg; 2928 2929 if (offset >= NVME_DOORBELL_OFFSET) { 2930 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2931 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2932 int is_sq = (belloffset % 8) < 4; 2933 2934 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2935 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2936 offset); 2937 return; 2938 } 2939 2940 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2941 WPRINTF("guest attempted an overflow write offset " 2942 "0x%lx, val 0x%lx in %s", 2943 offset, value, __func__); 2944 return; 2945 } 2946 2947 if (is_sq) { 2948 if (sc->submit_queues[idx].qbase == NULL) 2949 return; 2950 } else if (sc->compl_queues[idx].qbase == NULL) 2951 return; 2952 2953 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2954 return; 2955 } 2956 2957 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2958 offset, size, value); 2959 2960 if (size != 4) { 2961 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2962 "val 0x%lx) to bar0 in %s", 2963 size, offset, value, __func__); 2964 /* TODO: shutdown device */ 2965 return; 2966 } 2967 2968 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2969 2970 pthread_mutex_lock(&sc->mtx); 2971 2972 switch (offset) { 2973 case NVME_CR_CAP_LOW: 2974 case NVME_CR_CAP_HI: 2975 /* readonly */ 2976 break; 2977 case NVME_CR_VS: 2978 /* readonly */ 2979 break; 2980 case NVME_CR_INTMS: 2981 /* MSI-X, so ignore */ 2982 break; 2983 case NVME_CR_INTMC: 2984 /* MSI-X, so ignore */ 2985 break; 2986 case NVME_CR_CC: 2987 ccreg = (uint32_t)value; 2988 2989 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2990 "iocqes %u", 2991 __func__, 2992 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2993 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2994 NVME_CC_GET_IOCQES(ccreg)); 2995 2996 if (NVME_CC_GET_SHN(ccreg)) { 2997 /* perform shutdown - flush out data to backend */ 2998 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2999 NVME_CSTS_REG_SHST_SHIFT); 3000 sc->regs.csts |= NVME_SHST_COMPLETE << 3001 NVME_CSTS_REG_SHST_SHIFT; 3002 } 3003 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 3004 if (NVME_CC_GET_EN(ccreg) == 0) 3005 /* transition 1-> causes controller reset */ 3006 pci_nvme_reset_locked(sc); 3007 else 3008 pci_nvme_init_controller(ctx, sc); 3009 } 3010 3011 /* Insert the iocqes, iosqes and en bits from the write */ 3012 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 3013 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 3014 if (NVME_CC_GET_EN(ccreg) == 0) { 3015 /* Insert the ams, mps and css bit fields */ 3016 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3017 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3018 sc->regs.csts &= ~NVME_CSTS_RDY; 3019 } else if ((sc->pending_ios == 0) && 3020 !(sc->regs.csts & NVME_CSTS_CFS)) { 3021 sc->regs.csts |= NVME_CSTS_RDY; 3022 } 3023 break; 3024 case NVME_CR_CSTS: 3025 break; 3026 case NVME_CR_NSSR: 3027 /* ignore writes; don't support subsystem reset */ 3028 break; 3029 case NVME_CR_AQA: 3030 sc->regs.aqa = (uint32_t)value; 3031 break; 3032 case NVME_CR_ASQ_LOW: 3033 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3034 (0xFFFFF000 & value); 3035 break; 3036 case NVME_CR_ASQ_HI: 3037 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3038 (value << 32); 3039 break; 3040 case NVME_CR_ACQ_LOW: 3041 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3042 (0xFFFFF000 & value); 3043 break; 3044 case NVME_CR_ACQ_HI: 3045 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3046 (value << 32); 3047 break; 3048 default: 3049 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3050 __func__, offset, value, size); 3051 } 3052 pthread_mutex_unlock(&sc->mtx); 3053 } 3054 3055 static void 3056 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi, 3057 int baridx, uint64_t offset, int size, uint64_t value) 3058 { 3059 struct pci_nvme_softc* sc = pi->pi_arg; 3060 3061 if (baridx == pci_msix_table_bar(pi) || 3062 baridx == pci_msix_pba_bar(pi)) { 3063 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3064 " value 0x%lx", baridx, offset, size, value); 3065 3066 pci_emul_msix_twrite(pi, offset, size, value); 3067 return; 3068 } 3069 3070 switch (baridx) { 3071 case 0: 3072 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 3073 break; 3074 3075 default: 3076 DPRINTF("%s unknown baridx %d, val 0x%lx", 3077 __func__, baridx, value); 3078 } 3079 } 3080 3081 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3082 uint64_t offset, int size) 3083 { 3084 uint64_t value; 3085 3086 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3087 3088 if (offset < NVME_DOORBELL_OFFSET) { 3089 void *p = &(sc->regs); 3090 pthread_mutex_lock(&sc->mtx); 3091 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3092 pthread_mutex_unlock(&sc->mtx); 3093 } else { 3094 value = 0; 3095 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3096 } 3097 3098 switch (size) { 3099 case 1: 3100 value &= 0xFF; 3101 break; 3102 case 2: 3103 value &= 0xFFFF; 3104 break; 3105 case 4: 3106 value &= 0xFFFFFFFF; 3107 break; 3108 } 3109 3110 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3111 offset, size, (uint32_t)value); 3112 3113 return (value); 3114 } 3115 3116 3117 3118 static uint64_t 3119 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused, 3120 struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3121 { 3122 struct pci_nvme_softc* sc = pi->pi_arg; 3123 3124 if (baridx == pci_msix_table_bar(pi) || 3125 baridx == pci_msix_pba_bar(pi)) { 3126 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3127 baridx, offset, size); 3128 3129 return pci_emul_msix_tread(pi, offset, size); 3130 } 3131 3132 switch (baridx) { 3133 case 0: 3134 return pci_nvme_read_bar_0(sc, offset, size); 3135 3136 default: 3137 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3138 } 3139 3140 return (0); 3141 } 3142 3143 static int 3144 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3145 { 3146 char bident[sizeof("XX:X:X")]; 3147 const char *value; 3148 uint32_t sectsz; 3149 3150 sc->max_queues = NVME_QUEUES; 3151 sc->max_qentries = NVME_MAX_QENTRIES; 3152 sc->ioslots = NVME_IOSLOTS; 3153 sc->num_squeues = sc->max_queues; 3154 sc->num_cqueues = sc->max_queues; 3155 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3156 sectsz = 0; 3157 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3158 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3159 3160 value = get_config_value_node(nvl, "maxq"); 3161 if (value != NULL) 3162 sc->max_queues = atoi(value); 3163 value = get_config_value_node(nvl, "qsz"); 3164 if (value != NULL) { 3165 sc->max_qentries = atoi(value); 3166 if (sc->max_qentries <= 0) { 3167 EPRINTLN("nvme: Invalid qsz option %d", 3168 sc->max_qentries); 3169 return (-1); 3170 } 3171 } 3172 value = get_config_value_node(nvl, "ioslots"); 3173 if (value != NULL) { 3174 sc->ioslots = atoi(value); 3175 if (sc->ioslots <= 0) { 3176 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3177 return (-1); 3178 } 3179 } 3180 value = get_config_value_node(nvl, "sectsz"); 3181 if (value != NULL) 3182 sectsz = atoi(value); 3183 value = get_config_value_node(nvl, "ser"); 3184 if (value != NULL) { 3185 /* 3186 * This field indicates the Product Serial Number in 3187 * 7-bit ASCII, unused bytes should be space characters. 3188 * Ref: NVMe v1.3c. 3189 */ 3190 cpywithpad((char *)sc->ctrldata.sn, 3191 sizeof(sc->ctrldata.sn), value, ' '); 3192 } 3193 value = get_config_value_node(nvl, "eui64"); 3194 if (value != NULL) 3195 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3196 value = get_config_value_node(nvl, "dsm"); 3197 if (value != NULL) { 3198 if (strcmp(value, "auto") == 0) 3199 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3200 else if (strcmp(value, "enable") == 0) 3201 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3202 else if (strcmp(value, "disable") == 0) 3203 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3204 } 3205 3206 value = get_config_value_node(nvl, "ram"); 3207 if (value != NULL) { 3208 uint64_t sz = strtoull(value, NULL, 10); 3209 3210 sc->nvstore.type = NVME_STOR_RAM; 3211 sc->nvstore.size = sz * 1024 * 1024; 3212 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3213 sc->nvstore.sectsz = 4096; 3214 sc->nvstore.sectsz_bits = 12; 3215 if (sc->nvstore.ctx == NULL) { 3216 EPRINTLN("nvme: Unable to allocate RAM"); 3217 return (-1); 3218 } 3219 } else { 3220 snprintf(bident, sizeof(bident), "%d:%d", 3221 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3222 sc->nvstore.ctx = blockif_open(nvl, bident); 3223 if (sc->nvstore.ctx == NULL) { 3224 EPRINTLN("nvme: Could not open backing file: %s", 3225 strerror(errno)); 3226 return (-1); 3227 } 3228 sc->nvstore.type = NVME_STOR_BLOCKIF; 3229 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3230 } 3231 3232 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3233 sc->nvstore.sectsz = sectsz; 3234 else if (sc->nvstore.type != NVME_STOR_RAM) 3235 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3236 for (sc->nvstore.sectsz_bits = 9; 3237 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3238 sc->nvstore.sectsz_bits++); 3239 3240 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3241 sc->max_queues = NVME_QUEUES; 3242 3243 return (0); 3244 } 3245 3246 static void 3247 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3248 size_t new_size) 3249 { 3250 struct pci_nvme_softc *sc; 3251 struct pci_nvme_blockstore *nvstore; 3252 struct nvme_namespace_data *nd; 3253 3254 sc = arg; 3255 nvstore = &sc->nvstore; 3256 nd = &sc->nsdata; 3257 3258 nvstore->size = new_size; 3259 pci_nvme_init_nsdata_size(nvstore, nd); 3260 3261 /* Add changed NSID to list */ 3262 sc->ns_log.ns[0] = 1; 3263 sc->ns_log.ns[1] = 0; 3264 3265 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3266 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3267 } 3268 3269 static int 3270 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) 3271 { 3272 struct pci_nvme_softc *sc; 3273 uint32_t pci_membar_sz; 3274 int error; 3275 3276 error = 0; 3277 3278 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3279 pi->pi_arg = sc; 3280 sc->nsc_pi = pi; 3281 3282 error = pci_nvme_parse_config(sc, nvl); 3283 if (error < 0) 3284 goto done; 3285 else 3286 error = 0; 3287 3288 STAILQ_INIT(&sc->ioreqs_free); 3289 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3290 for (uint32_t i = 0; i < sc->ioslots; i++) { 3291 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3292 } 3293 3294 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3295 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3296 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3297 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3298 pci_set_cfgdata8(pi, PCIR_PROGIF, 3299 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3300 3301 /* 3302 * Allocate size of NVMe registers + doorbell space for all queues. 3303 * 3304 * The specification requires a minimum memory I/O window size of 16K. 3305 * The Windows driver will refuse to start a device with a smaller 3306 * window. 3307 */ 3308 pci_membar_sz = sizeof(struct nvme_registers) + 3309 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3310 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3311 3312 DPRINTF("nvme membar size: %u", pci_membar_sz); 3313 3314 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3315 if (error) { 3316 WPRINTF("%s pci alloc mem bar failed", __func__); 3317 goto done; 3318 } 3319 3320 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3321 if (error) { 3322 WPRINTF("%s pci add msixcap failed", __func__); 3323 goto done; 3324 } 3325 3326 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3327 if (error) { 3328 WPRINTF("%s pci add Express capability failed", __func__); 3329 goto done; 3330 } 3331 3332 pthread_mutex_init(&sc->mtx, NULL); 3333 sem_init(&sc->iosemlock, 0, sc->ioslots); 3334 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3335 3336 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3337 /* 3338 * Controller data depends on Namespace data so initialize Namespace 3339 * data first. 3340 */ 3341 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3342 pci_nvme_init_ctrldata(sc); 3343 pci_nvme_init_logpages(sc); 3344 pci_nvme_init_features(sc); 3345 3346 pci_nvme_aer_init(sc); 3347 pci_nvme_aen_init(sc); 3348 3349 pci_nvme_reset(sc); 3350 3351 pci_lintr_request(pi); 3352 3353 done: 3354 return (error); 3355 } 3356 3357 static int 3358 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3359 { 3360 char *cp, *ram; 3361 3362 if (opts == NULL) 3363 return (0); 3364 3365 if (strncmp(opts, "ram=", 4) == 0) { 3366 cp = strchr(opts, ','); 3367 if (cp == NULL) { 3368 set_config_value_node(nvl, "ram", opts + 4); 3369 return (0); 3370 } 3371 ram = strndup(opts + 4, cp - opts - 4); 3372 set_config_value_node(nvl, "ram", ram); 3373 free(ram); 3374 return (pci_parse_legacy_config(nvl, cp + 1)); 3375 } else 3376 return (blockif_legacy_config(nvl, opts)); 3377 } 3378 3379 static const struct pci_devemu pci_de_nvme = { 3380 .pe_emu = "nvme", 3381 .pe_init = pci_nvme_init, 3382 .pe_legacy_config = pci_nvme_legacy_config, 3383 .pe_barwrite = pci_nvme_write, 3384 .pe_barread = pci_nvme_read 3385 }; 3386 PCI_EMUL_SET(pci_de_nvme); 3387