/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver was written to conform to version 1.1b of the NVMe
 * specification. It may work with newer versions, but that is completely
 * untested and disabled by default.
 *
 * The driver has only been tested on x86 systems and will not work on
 * big-endian systems without changes to the code accessing registers and
 * data structures used by the hardware.
 *
 *
 * Interrupt Usage:
 *
 * The driver will use a FIXED interrupt while configuring the device as the
 * specification requires. Later in the attach process it will switch to MSI-X
 * or MSI if supported. The driver wants to have one interrupt vector per CPU,
 * but it will work correctly if fewer are available. Interrupts can be shared
 * by queues; the interrupt handler will iterate through the I/O queue array
 * in steps of n_intr_cnt. Usually only the admin queue will share an interrupt
 * with one I/O queue. The interrupt handler will retrieve completed commands
 * from all queues sharing an interrupt vector and will post them to a taskq
 * for completion processing.
 *
 *
 * Command Processing:
 *
 * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding
 * up to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally the hardware provides a single special admin queue pair that
 * can hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to the submission side of a queue pair and the shared state
 * is protected by nq_mutex. The completion side of a queue pair does not need
 * that protection apart from its access to the shared state; it is called only
 * in the interrupt handler which does not run concurrently for the same
 * interrupt vector.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array. The
 * array index is used as command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the queue
 * wraps around in that time a submission may find the next array slot to still
 * be used by a long-running command. In this case the array is sequentially
 * searched for the next free slot. The length of the command array is the same
 * as the configured queue length.
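 * For example, with a configured queue length of 1024 the CIDs cycle through
 * 0-1023; if the submission index wraps back around to a slot that is still
 * occupied by a long-running command (such as an asynchronous event request),
 * the next free slot is used instead and its index becomes the CID.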
 *
 *
 * Namespace Support:
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev
 * interface for each namespace found. Namespaces can have various attributes
 * to support thin provisioning and protection information. This driver does
 * not support any of this and ignores namespaces that have these attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64). This driver uses the EUI64, if present, to generate the devid and
 * passes it to blkdev for use in the device node names. As this is currently
 * untested, namespaces with an EUI64 are ignored by default.
 *
 *
 * Blkdev Interface:
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system. As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted
 * to an I/O queue. The queue is selected by taking the CPU id modulo the
 * number of queues. There is currently no timeout handling of I/O commands.
 *
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace EUI64, if present, or
 * composed using the device vendor ID, model number, serial number, and the
 * namespace ID.
 *
 *
 * Error Handling:
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off,
 * and all further requests will return EIO. FMA is then called to fault the
 * device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later, when the limit is known, more asynchronous
 * event requests are posted to allow quicker reception of error information.
 * When an asynchronous event is posted by the hardware the driver will parse
 * the error status fields and log information or fault the device, depending
 * on the severity of the asynchronous event. The asynchronous event request is
 * then reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. If this fails the
 * driver assumes the device to be dead, fences it off, and calls FMA to retire
 * it. In general admin commands are issued at attach time only. No timeout
 * handling of normal I/O commands is presently done.
 *
 * In some cases it may be possible that the ABORT command times out, too. In
 * that case the device is also declared dead and fenced off.
 *
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided which is used to send a shutdown notification to the
 * device.
 *
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of
 * the driver's operation:
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   versions or namespaces with an EUI64 to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not treat any vendor
 *   specific command status as a fatal error that leads to the device being
 *   faulted
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-queue-len: the maximum length of the I/O queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for the ZFS vdev ashift
 *
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - polled I/O support to support kernel core dumping
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for querying log pages from user space
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for firmware updates
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"


/* NVMe spec version supported */
static const int nvme_version_major = 1;
static const int nvme_version_minor = 1;

/* tunable for admin command timeout in seconds, default is 1s */
static volatile int nvme_admin_cmd_timeout = 1;

static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);

static void nvme_shutdown(nvme_t *, int, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
    bd_xfer_t *);
static int nvme_admin_cmd(nvme_cmd_t *, int);
static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);

static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);

static void nvme_abort_cmd(nvme_cmd_t *);
static int nvme_async_event(nvme_t *);
static void *nvme_get_logpage(nvme_t *, uint8_t, ...);
static void *nvme_identify(nvme_t *, uint32_t);
static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
    uint32_t *);
static boolean_t nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *, uint16_t);

static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
    nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
    nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);

static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);

static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);

static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;

/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
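 * (For example, a full-length submission queue of 65536 entries at 64 bytes
 * each comes to 4 MB, which is what dma_attr_count_max and dma_attr_maxxfer
 * below are derived from.)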
 */
static ddi_dma_attr_t nvme_queue_dma_attr = {
    .dma_attr_version = DMA_ATTR_V0,
    .dma_attr_addr_lo = 0,
    .dma_attr_addr_hi = 0xffffffffffffffffULL,
    .dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
    .dma_attr_align = 0x1000,
    .dma_attr_burstsizes = 0x7ff,
    .dma_attr_minxfer = 0x1000,
    .dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
    .dma_attr_seg = 0xffffffffffffffffULL,
    .dma_attr_sgllen = 1,
    .dma_attr_granular = 1,
    .dma_attr_flags = 0,
};

/*
 * DMA attributes for transfers using Physical Region Page (PRP) entries
 *
 * A PRP entry describes one page of DMA memory using the page size specified
 * in the controller configuration's memory page size register (CC.MPS). It
 * uses a 64bit base address aligned to this page size. There is no limitation
 * on chaining PRPs together for arbitrarily large DMA transfers.
 */
static ddi_dma_attr_t nvme_prp_dma_attr = {
    .dma_attr_version = DMA_ATTR_V0,
    .dma_attr_addr_lo = 0,
    .dma_attr_addr_hi = 0xffffffffffffffffULL,
    .dma_attr_count_max = 0xfff,
    .dma_attr_align = 0x1000,
    .dma_attr_burstsizes = 0x7ff,
    .dma_attr_minxfer = 0x1000,
    .dma_attr_maxxfer = 0x1000,
    .dma_attr_seg = 0xfff,
    .dma_attr_sgllen = -1,
    .dma_attr_granular = 1,
    .dma_attr_flags = 0,
};

/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes.
 */
static ddi_dma_attr_t nvme_sgl_dma_attr = {
    .dma_attr_version = DMA_ATTR_V0,
    .dma_attr_addr_lo = 0,
    .dma_attr_addr_hi = 0xffffffffffffffffULL,
    .dma_attr_count_max = 0xffffffffUL,
    .dma_attr_align = 1,
    .dma_attr_burstsizes = 0x7ff,
    .dma_attr_minxfer = 0x10,
    .dma_attr_maxxfer = 0xfffffffffULL,
    .dma_attr_seg = 0xffffffffffffffffULL,
    .dma_attr_sgllen = -1,
    .dma_attr_granular = 0x10,
    .dma_attr_flags = 0
};

static ddi_device_acc_attr_t nvme_reg_acc_attr = {
    .devacc_attr_version = DDI_DEVICE_ATTR_V0,
    .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
    .devacc_attr_dataorder = DDI_STRICTORDER_ACC
};

static struct dev_ops nvme_dev_ops = {
    .devo_rev = DEVO_REV,
    .devo_refcnt = 0,
    .devo_getinfo = ddi_no_info,
    .devo_identify = nulldev,
    .devo_probe = nulldev,
    .devo_attach = nvme_attach,
    .devo_detach = nvme_detach,
    .devo_reset = nodev,
    .devo_cb_ops = NULL,
    .devo_bus_ops = NULL,
    .devo_power = NULL,
    .devo_quiesce = nvme_quiesce,
};

static struct modldrv nvme_modldrv = {
    .drv_modops = &mod_driverops,
    .drv_linkinfo = "NVMe v1.1b",
    .drv_dev_ops = &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
    .ml_rev = MODREV_1,
    .ml_linkage = { &nvme_modldrv, NULL }
};

static bd_ops_t nvme_bd_ops = {
    .o_version = BD_OPS_VERSION_0,
    .o_drive_info = nvme_bd_driveinfo,
    .o_media_info = nvme_bd_mediainfo,
    .o_devid_init = nvme_bd_devid,
    .o_sync_cache = nvme_bd_sync,
    .o_read = nvme_bd_read,
    .o_write = nvme_bd_write,
};

int
_init(void)
{
    int error;

    error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
    if (error != DDI_SUCCESS)
        return (error);

    nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
        sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

    bd_mod_init(&nvme_dev_ops);

    error = mod_install(&nvme_modlinkage);
    if (error != DDI_SUCCESS) {
        ddi_soft_state_fini(&nvme_state);
        bd_mod_fini(&nvme_dev_ops);
    }

    return (error);
}

int
_fini(void)
{
    int error;

    error = mod_remove(&nvme_modlinkage);
    if (error == DDI_SUCCESS) {
        ddi_soft_state_fini(&nvme_state);
        kmem_cache_destroy(nvme_cmd_cache);
        bd_mod_fini(&nvme_dev_ops);
    }

    return (error);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&nvme_modlinkage, modinfop));
}

static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
    ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
    ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}

static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
    ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
    ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}

static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
{
    uint64_t val;

    ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
    val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));

    return (val);
}

static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
{
    uint32_t val;

    ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
    val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));

    return (val);
}

static boolean_t
nvme_check_regs_hdl(nvme_t *nvme)
{
    ddi_fm_error_t error;

    ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);

    if (error.fme_status != DDI_FM_OK)
        return (B_TRUE);

    return (B_FALSE);
}

static boolean_t
nvme_check_dma_hdl(nvme_dma_t *dma)
{
    ddi_fm_error_t error;

    if (dma == NULL)
        return (B_FALSE);

    ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);

    if (error.fme_status != DDI_FM_OK)
        return (B_TRUE);

    return (B_FALSE);
}

static void
nvme_free_dma_common(nvme_dma_t *dma)
{
    if (dma->nd_dmah != NULL)
        (void) ddi_dma_unbind_handle(dma->nd_dmah);
    if (dma->nd_acch != NULL)
        ddi_dma_mem_free(&dma->nd_acch);
    if (dma->nd_dmah != NULL)
        ddi_dma_free_handle(&dma->nd_dmah);
}

static void
nvme_free_dma(nvme_dma_t *dma)
{
    nvme_free_dma_common(dma);
    kmem_free(dma, sizeof (*dma));
}

/* ARGSUSED */
static void
nvme_prp_dma_destructor(void *buf, void *private)
{
    nvme_dma_t *dma = (nvme_dma_t *)buf;

    nvme_free_dma_common(dma);
}

static int
nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
{
    if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
        &dma->nd_dmah) != DDI_SUCCESS) {
        /*
         * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
         * the only other possible error is DDI_DMA_BADATTR which
         * indicates a driver bug which should cause a panic.
         */
        dev_err(nvme->n_dip, CE_PANIC,
            "!failed to get DMA handle, check DMA attributes");
        return (DDI_FAILURE);
    }

    /*
     * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
     * or the flags are conflicting, which isn't the case here.
     */
    (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
        DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
        &dma->nd_len, &dma->nd_acch);

    if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
        dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
        &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
        dev_err(nvme->n_dip, CE_WARN,
            "!failed to bind DMA memory");
        atomic_inc_32(&nvme->n_dma_bind_err);
        nvme_free_dma_common(dma);
        return (DDI_FAILURE);
    }

    return (DDI_SUCCESS);
}

static int
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
{
    nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

    if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
        DDI_SUCCESS) {
        *ret = NULL;
        kmem_free(dma, sizeof (nvme_dma_t));
        return (DDI_FAILURE);
    }

    bzero(dma->nd_memp, dma->nd_len);

    *ret = dma;
    return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
nvme_prp_dma_constructor(void *buf, void *private, int flags)
{
    nvme_dma_t *dma = (nvme_dma_t *)buf;
    nvme_t *nvme = (nvme_t *)private;

    dma->nd_dmah = NULL;
    dma->nd_acch = NULL;

    if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
        DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
        return (-1);
    }

    ASSERT(dma->nd_ncookie == 1);

    dma->nd_cached = B_TRUE;

    return (0);
}

static int
nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
{
    uint32_t len = nentry * qe_len;
    ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

    len = roundup(len, nvme->n_pagesize);

    q_dma_attr.dma_attr_minxfer = len;

    if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
        != DDI_SUCCESS) {
        dev_err(nvme->n_dip, CE_WARN,
            "!failed to get DMA memory for queue");
        goto fail;
    }

    if ((*dma)->nd_ncookie != 1) {
        dev_err(nvme->n_dip, CE_WARN,
            "!got too many cookies for queue DMA");
        goto fail;
    }

    return (DDI_SUCCESS);

fail:
    if (*dma) {
        nvme_free_dma(*dma);
        *dma = NULL;
    }

    return (DDI_FAILURE);
}

static void
nvme_free_qpair(nvme_qpair_t *qp)
{
    int i;

    mutex_destroy(&qp->nq_mutex);

    if (qp->nq_sqdma != NULL)
        nvme_free_dma(qp->nq_sqdma);
    if (qp->nq_cqdma != NULL)
        nvme_free_dma(qp->nq_cqdma);

    if (qp->nq_active_cmds > 0)
        for (i = 0; i != qp->nq_nentry; i++)
            if (qp->nq_cmd[i] != NULL)
                nvme_free_cmd(qp->nq_cmd[i]);

    if (qp->nq_cmd != NULL)
        kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);

    kmem_free(qp, sizeof (nvme_qpair_t));
}

static int
nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
    int idx)
{
    nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);

    mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(nvme->n_intr_pri));

    if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
        DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
        goto fail;

    if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
        DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
        goto fail;

    qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
    qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
    qp->nq_nentry = nentry;

    qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
    qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);

    qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
    qp->nq_next_cmd = 0;

    *nqp = qp;
    return (DDI_SUCCESS);

fail:
    nvme_free_qpair(qp);
    *nqp = NULL;

    return (DDI_FAILURE);
}

static nvme_cmd_t *
nvme_alloc_cmd(nvme_t *nvme, int kmflag)
{
    nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);

    if (cmd == NULL)
        return (cmd);

    bzero(cmd, sizeof (nvme_cmd_t));

    cmd->nc_nvme = nvme;

    mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(nvme->n_intr_pri));
    cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);

    return (cmd);
}

static void
nvme_free_cmd(nvme_cmd_t *cmd)
{
    if (cmd->nc_dma) {
        if (cmd->nc_dma->nd_cached)
            kmem_cache_free(cmd->nc_nvme->n_prp_cache,
                cmd->nc_dma);
        else
            nvme_free_dma(cmd->nc_dma);
        cmd->nc_dma = NULL;
    }

    cv_destroy(&cmd->nc_cv);
    mutex_destroy(&cmd->nc_mutex);

    kmem_cache_free(nvme_cmd_cache, cmd);
}

static int
nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
    nvme_reg_sqtdbl_t tail = { 0 };

    mutex_enter(&qp->nq_mutex);

    if (qp->nq_active_cmds == qp->nq_nentry) {
        mutex_exit(&qp->nq_mutex);
        return (DDI_FAILURE);
    }

    cmd->nc_completed = B_FALSE;

    /*
     * Try to insert the cmd into the active cmd array at the nq_next_cmd
     * slot. If the slot is already occupied advance to the next slot and
     * try again. This can happen for long running commands like async
     * event requests.
     */
    while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
        qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
    qp->nq_cmd[qp->nq_next_cmd] = cmd;

    qp->nq_active_cmds++;

    cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
    bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
    (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
        sizeof (nvme_sqe_t) * qp->nq_sqtail,
        sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
    qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;

    tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
    nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);

    mutex_exit(&qp->nq_mutex);
    return (DDI_SUCCESS);
}
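
/*
 * nvme_retrieve_cmd -- reap one completed command from a queue pair
 *
 * Reads the next completion queue entry and uses its phase tag to decide
 * whether it is new: the hardware inverts the phase bit each time the queue
 * wraps, so an entry whose phase still matches nq_phase has not been written
 * since the last wrap and there is nothing to retrieve. For a new entry the
 * command is looked up by its CID, removed from the command array, and the
 * completion queue head doorbell is updated.
 */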
static nvme_cmd_t *
nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
{
    nvme_reg_cqhdbl_t head = { 0 };

    nvme_cqe_t *cqe;
    nvme_cmd_t *cmd;

    (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
        sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);

    cqe = &qp->nq_cq[qp->nq_cqhead];

    /* Check phase tag of CQE. Hardware inverts it for new entries. */
    if (cqe->cqe_sf.sf_p == qp->nq_phase)
        return (NULL);

    ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
    ASSERT(cqe->cqe_cid < qp->nq_nentry);

    mutex_enter(&qp->nq_mutex);
    cmd = qp->nq_cmd[cqe->cqe_cid];
    qp->nq_cmd[cqe->cqe_cid] = NULL;
    qp->nq_active_cmds--;
    mutex_exit(&qp->nq_mutex);

    ASSERT(cmd != NULL);
    ASSERT(cmd->nc_nvme == nvme);
    ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
    ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
    bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));

    qp->nq_sqhead = cqe->cqe_sqhd;

    head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;

    /* Toggle phase on wrap-around. */
    if (qp->nq_cqhead == 0)
        qp->nq_phase = qp->nq_phase ? 0 : 1;

    nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);

    return (cmd);
}

static int
nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
{
    nvme_cqe_t *cqe = &cmd->nc_cqe;

    dev_err(cmd->nc_nvme->n_dip, CE_WARN,
        "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
        "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
        cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
        cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);

    bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);

    if (cmd->nc_nvme->n_strict_version) {
        cmd->nc_nvme->n_dead = B_TRUE;
        ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
    }

    return (EIO);
}

static int
nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
{
    nvme_cqe_t *cqe = &cmd->nc_cqe;

    dev_err(cmd->nc_nvme->n_dip, CE_WARN,
        "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
        "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
        cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
        cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
    if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
        cmd->nc_nvme->n_dead = B_TRUE;
        ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
    }

    return (EIO);
}

static int
nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
{
    nvme_cqe_t *cqe = &cmd->nc_cqe;

    switch (cqe->cqe_sf.sf_sc) {
    case NVME_CQE_SC_INT_NVM_WRITE:
        /* write fail */
        /* TODO: post ereport */
        bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
        return (EIO);

    case NVME_CQE_SC_INT_NVM_READ:
        /* read fail */
        /* TODO: post ereport */
        bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
        return (EIO);

    default:
        return (nvme_check_unknown_cmd_status(cmd));
    }
}

static int
nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
{
    nvme_cqe_t *cqe = &cmd->nc_cqe;

    switch (cqe->cqe_sf.sf_sc) {
    case NVME_CQE_SC_GEN_SUCCESS:
        return (0);

    /*
     * Errors indicating a bug in the driver should cause a panic.
     */
    case NVME_CQE_SC_GEN_INV_OPC:
        /* Invalid Command Opcode */
        dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
            "invalid opcode in cmd %p", (void *)cmd);
        return (0);

    case NVME_CQE_SC_GEN_INV_FLD:
        /* Invalid Field in Command */
        dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
            "invalid field in cmd %p", (void *)cmd);
        return (0);

    case NVME_CQE_SC_GEN_ID_CNFL:
        /* Command ID Conflict */
        dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
            "cmd ID conflict in cmd %p", (void *)cmd);
        return (0);

    case NVME_CQE_SC_GEN_INV_NS:
        /* Invalid Namespace or Format */
        dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
            "invalid NS/format in cmd %p", (void *)cmd);
        return (0);

    case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
        /* LBA Out Of Range */
        dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
            "LBA out of range in cmd %p", (void *)cmd);
        return (0);

    /*
     * Non-fatal errors, handle gracefully.
     */
    case NVME_CQE_SC_GEN_DATA_XFR_ERR:
        /* Data Transfer Error (DMA) */
        /* TODO: post ereport */
        atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
        bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
        return (EIO);

    case NVME_CQE_SC_GEN_INTERNAL_ERR:
        /*
         * Internal Error. The spec (v1.0, section 4.5.1.2) says
         * detailed error information is returned as async event,
         * so we pretty much ignore the error here and handle it
         * in the async event handler.
         */
        atomic_inc_32(&cmd->nc_nvme->n_internal_err);
        bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
        return (EIO);

    case NVME_CQE_SC_GEN_ABORT_REQUEST:
        /*
         * Command Abort Requested. This normally happens only when a
         * command times out.
         */
        /* TODO: post ereport or change blkdev to handle this? */
        atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
        return (ECANCELED);

    case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
        /* Command Aborted due to Power Loss Notification */
        ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
        cmd->nc_nvme->n_dead = B_TRUE;
        return (EIO);

    case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
        /* Command Aborted due to SQ Deletion */
        atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
        return (EIO);

    case NVME_CQE_SC_GEN_NVM_CAP_EXC:
        /* Capacity Exceeded */
        atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
        bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
        return (EIO);

    case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
        /* Namespace Not Ready */
        atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
        bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
        return (EIO);

    default:
        return (nvme_check_unknown_cmd_status(cmd));
    }
}

static int
nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
{
    nvme_cqe_t *cqe = &cmd->nc_cqe;

    switch (cqe->cqe_sf.sf_sc) {
    case NVME_CQE_SC_SPC_INV_CQ:
        /* Completion Queue Invalid */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
        atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
        return (EINVAL);

    case NVME_CQE_SC_SPC_INV_QID:
        /* Invalid Queue Identifier */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
        atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
        return (EINVAL);

    case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
        /* Max Queue Size Exceeded */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
        atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
        return (EINVAL);

    case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
        /* Abort Command Limit Exceeded */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
        dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
            "abort command limit exceeded in cmd %p", (void *)cmd);
        return (0);

    case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
        /* Async Event Request Limit Exceeded */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
        dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
            "async event request limit exceeded in cmd %p",
            (void *)cmd);
        return (0);

    case NVME_CQE_SC_SPC_INV_INT_VECT:
        /* Invalid Interrupt Vector */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
        atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
        return (EINVAL);

    case NVME_CQE_SC_SPC_INV_LOG_PAGE:
        /* Invalid Log Page */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
        atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
        return (EINVAL);

    case NVME_CQE_SC_SPC_INV_FORMAT:
        /* Invalid Format */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
        atomic_inc_32(&cmd->nc_nvme->n_inv_format);
        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
        return (EINVAL);

    case NVME_CQE_SC_SPC_INV_Q_DEL:
        /* Invalid Queue Deletion */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
        atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
        return (EINVAL);

    case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
        /* Conflicting Attributes */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
        atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
        return (EINVAL);

    case NVME_CQE_SC_SPC_NVM_INV_PROT:
        /* Invalid Protection Information */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
            cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
        atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
        return (EINVAL);

    case NVME_CQE_SC_SPC_NVM_READONLY:
        /* Write to Read Only Range */
        ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
        atomic_inc_32(&cmd->nc_nvme->n_readonly);
        bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
        return (EROFS);

    default:
        return (nvme_check_unknown_cmd_status(cmd));
    }
}

static inline int
nvme_check_cmd_status(nvme_cmd_t *cmd)
{
    nvme_cqe_t *cqe = &cmd->nc_cqe;

    /* take a shortcut if everything is alright */
    if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
        cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
        return (0);

    if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
        return (nvme_check_generic_cmd_status(cmd));
    else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
        return (nvme_check_specific_cmd_status(cmd));
    else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
        return (nvme_check_integrity_cmd_status(cmd));
    else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
        return (nvme_check_vendor_cmd_status(cmd));

    return (nvme_check_unknown_cmd_status(cmd));
}

/*
 * nvme_abort_cmd_cb -- replaces nc_callback of aborted commands
 *
 * This function takes care of cleaning up aborted commands. The command
 * status is checked to catch any fatal errors.
 */
static void
nvme_abort_cmd_cb(void *arg)
{
    nvme_cmd_t *cmd = arg;

    /*
     * Grab the command mutex. Once we have it we hold the last reference
     * to the command and can safely free it.
     */
    mutex_enter(&cmd->nc_mutex);
    (void) nvme_check_cmd_status(cmd);
    mutex_exit(&cmd->nc_mutex);

    nvme_free_cmd(cmd);
}
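
/*
 * nvme_abort_cmd -- abort a timed-out command
 *
 * Allocates and issues an ABORT admin command for the given command. The
 * abort semaphore limits the number of outstanding ABORTs to what the
 * hardware allows. The mutex of the aborted command is dropped here; from
 * that point on the aborted command may be freed at any time by its
 * (replaced) callback.
 */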
static void
nvme_abort_cmd(nvme_cmd_t *abort_cmd)
{
    nvme_t *nvme = abort_cmd->nc_nvme;
    nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
    nvme_abort_cmd_t ac = { 0 };

    sema_p(&nvme->n_abort_sema);

    ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
    ac.b.ac_sqid = abort_cmd->nc_sqid;

    /*
     * Drop the mutex of the aborted command. From this point on
     * we must assume that the abort callback has freed the command.
     */
    mutex_exit(&abort_cmd->nc_mutex);

    cmd->nc_sqid = 0;
    cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
    cmd->nc_callback = nvme_wakeup_cmd;
    cmd->nc_sqe.sqe_cdw10 = ac.r;

    /*
     * Send the ABORT to the hardware. The ABORT command will return _after_
     * the aborted command has completed (aborted or otherwise).
     */
    if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
        sema_v(&nvme->n_abort_sema);
        dev_err(nvme->n_dip, CE_WARN,
            "!nvme_admin_cmd failed for ABORT");
        atomic_inc_32(&nvme->n_abort_failed);
        return;
    }
    sema_v(&nvme->n_abort_sema);

    if (nvme_check_cmd_status(cmd)) {
        dev_err(nvme->n_dip, CE_WARN,
            "!ABORT failed with sct = %x, sc = %x",
            cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
        atomic_inc_32(&nvme->n_abort_failed);
    } else {
        atomic_inc_32(&nvme->n_cmd_aborted);
    }

    nvme_free_cmd(cmd);
}

/*
 * nvme_wait_cmd -- wait for command completion or timeout
 *
 * Returns B_TRUE if the command completed normally.
 *
 * Returns B_FALSE if the command timed out and an abort was attempted. The
 * command mutex will be dropped and the command must be considered freed. The
 * freeing of the command is normally done by the abort command callback.
 *
 * In case of a serious error or a timeout of the abort command the hardware
 * will be declared dead and FMA will be notified.
 */
static boolean_t
nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
{
    clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
    nvme_t *nvme = cmd->nc_nvme;
    nvme_reg_csts_t csts;

    ASSERT(mutex_owned(&cmd->nc_mutex));

    while (!cmd->nc_completed) {
        if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
            break;
    }

    if (cmd->nc_completed)
        return (B_TRUE);

    /*
     * The command timed out. Change the callback to the cleanup function.
     */
    cmd->nc_callback = nvme_abort_cmd_cb;

    /*
     * Check controller for fatal status, any errors associated with the
     * register or DMA handle, or for a double timeout (abort command timed
     * out). If necessary log a warning and call FMA.
     */
    csts.r = nvme_get32(nvme, NVME_REG_CSTS);
    dev_err(nvme->n_dip, CE_WARN, "!command timeout, "
        "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
    atomic_inc_32(&nvme->n_cmd_timeout);

    if (csts.b.csts_cfs ||
        nvme_check_regs_hdl(nvme) ||
        nvme_check_dma_hdl(cmd->nc_dma) ||
        cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
        ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
        nvme->n_dead = B_TRUE;
        mutex_exit(&cmd->nc_mutex);
    } else {
        /*
         * Try to abort the command. The command mutex is released by
         * nvme_abort_cmd().
         * If the abort succeeds it will have freed the aborted command.
         * If the abort fails for other reasons we must assume that the
         * command may complete at any time, and the callback will free
         * it for us.
         */
        nvme_abort_cmd(cmd);
    }

    return (B_FALSE);
}

static void
nvme_wakeup_cmd(void *arg)
{
    nvme_cmd_t *cmd = arg;

    mutex_enter(&cmd->nc_mutex);
    /*
     * There is a slight chance that this command completed shortly after
     * the timeout was hit in nvme_wait_cmd() but before the callback was
     * changed. Catch that case here and clean up accordingly.
     */
    if (cmd->nc_callback == nvme_abort_cmd_cb) {
        mutex_exit(&cmd->nc_mutex);
        nvme_abort_cmd_cb(cmd);
        return;
    }

    cmd->nc_completed = B_TRUE;
    cv_signal(&cmd->nc_cv);
    mutex_exit(&cmd->nc_mutex);
}

static void
nvme_async_event_task(void *arg)
{
    nvme_cmd_t *cmd = arg;
    nvme_t *nvme = cmd->nc_nvme;
    nvme_error_log_entry_t *error_log = NULL;
    nvme_health_log_t *health_log = NULL;
    nvme_async_event_t event;
    int ret;

    /*
     * Check for errors associated with the async request itself. The only
     * command-specific error is "async event limit exceeded", which
     * indicates a programming error in the driver and causes a panic in
     * nvme_check_cmd_status().
     *
     * Other possible errors are various scenarios where the async request
     * was aborted, or internal errors in the device. Internal errors are
     * reported to FMA, the command aborts need no special handling here.
     */
    if (nvme_check_cmd_status(cmd)) {
        dev_err(cmd->nc_nvme->n_dip, CE_WARN,
            "!async event request returned failure, sct = %x, "
            "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
            cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
            cmd->nc_cqe.cqe_sf.sf_m);

        if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
            cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
            cmd->nc_nvme->n_dead = B_TRUE;
            ddi_fm_service_impact(cmd->nc_nvme->n_dip,
                DDI_SERVICE_LOST);
        }
        nvme_free_cmd(cmd);
        return;
    }


    event.r = cmd->nc_cqe.cqe_dw0;

    /* Clear CQE and re-submit the async request. */
    bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
    ret = nvme_submit_cmd(nvme->n_adminq, cmd);

    if (ret != DDI_SUCCESS) {
        dev_err(nvme->n_dip, CE_WARN,
            "!failed to resubmit async event request");
        atomic_inc_32(&nvme->n_async_resubmit_failed);
        nvme_free_cmd(cmd);
    }

    switch (event.b.ae_type) {
    case NVME_ASYNC_TYPE_ERROR:
        if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
            error_log = (nvme_error_log_entry_t *)
                nvme_get_logpage(nvme, event.b.ae_logpage);
        } else {
            dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
                "async event reply: %d", event.b.ae_logpage);
            atomic_inc_32(&nvme->n_wrong_logpage);
        }

        switch (event.b.ae_info) {
        case NVME_ASYNC_ERROR_INV_SQ:
            dev_err(nvme->n_dip, CE_PANIC, "programming error: "
                "invalid submission queue");
            return;

        case NVME_ASYNC_ERROR_INV_DBL:
            dev_err(nvme->n_dip, CE_PANIC, "programming error: "
                "invalid doorbell write value");
            return;

        case NVME_ASYNC_ERROR_DIAGFAIL:
            dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
            ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
            nvme->n_dead = B_TRUE;
            atomic_inc_32(&nvme->n_diagfail_event);
            break;

        case NVME_ASYNC_ERROR_PERSISTENT:
            dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
                "device error");
            ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
            nvme->n_dead = B_TRUE;
            atomic_inc_32(&nvme->n_persistent_event);
            break;

        case NVME_ASYNC_ERROR_TRANSIENT:
            dev_err(nvme->n_dip, CE_WARN, "!transient internal "
                "device error");
            /* TODO: send ereport */
            atomic_inc_32(&nvme->n_transient_event);
            break;

        case NVME_ASYNC_ERROR_FW_LOAD:
            dev_err(nvme->n_dip, CE_WARN,
                "!firmware image load error");
            atomic_inc_32(&nvme->n_fw_load_event);
            break;
        }
        break;

    case NVME_ASYNC_TYPE_HEALTH:
        if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
            health_log = (nvme_health_log_t *)
                nvme_get_logpage(nvme, event.b.ae_logpage, -1);
        } else {
            dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
                "async event reply: %d", event.b.ae_logpage);
            atomic_inc_32(&nvme->n_wrong_logpage);
        }

        switch (event.b.ae_info) {
        case NVME_ASYNC_HEALTH_RELIABILITY:
            dev_err(nvme->n_dip, CE_WARN,
                "!device reliability compromised");
            /* TODO: send ereport */
            atomic_inc_32(&nvme->n_reliability_event);
            break;

        case NVME_ASYNC_HEALTH_TEMPERATURE:
            dev_err(nvme->n_dip, CE_WARN,
                "!temperature above threshold");
            /* TODO: send ereport */
            atomic_inc_32(&nvme->n_temperature_event);
            break;

        case NVME_ASYNC_HEALTH_SPARE:
            dev_err(nvme->n_dip, CE_WARN,
                "!spare space below threshold");
            /* TODO: send ereport */
            atomic_inc_32(&nvme->n_spare_event);
            break;
        }
        break;

    case NVME_ASYNC_TYPE_VENDOR:
        dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
            "received, info = %x, logpage = %x", event.b.ae_info,
            event.b.ae_logpage);
        atomic_inc_32(&nvme->n_vendor_event);
        break;

    default:
        dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
            "type = %x, info = %x, logpage = %x", event.b.ae_type,
            event.b.ae_info, event.b.ae_logpage);
        atomic_inc_32(&nvme->n_unknown_event);
        break;
    }

    if (error_log)
        kmem_free(error_log, sizeof (nvme_error_log_entry_t) *
            nvme->n_error_log_len);

    if (health_log)
        kmem_free(health_log, sizeof (nvme_health_log_t));
}

static int
nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
{
    int ret;

    mutex_enter(&cmd->nc_mutex);
    ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd);

    if (ret != DDI_SUCCESS) {
        mutex_exit(&cmd->nc_mutex);
        dev_err(cmd->nc_nvme->n_dip, CE_WARN,
            "!nvme_submit_cmd failed");
        atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full);
        nvme_free_cmd(cmd);
        return (DDI_FAILURE);
    }

    if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
        /*
         * The command timed out. An abort command was posted that
         * will take care of the cleanup.
         */
        return (DDI_FAILURE);
    }
    mutex_exit(&cmd->nc_mutex);

    return (DDI_SUCCESS);
}

static int
nvme_async_event(nvme_t *nvme)
{
    nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
    int ret;

    cmd->nc_sqid = 0;
    cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
    cmd->nc_callback = nvme_async_event_task;

    ret = nvme_submit_cmd(nvme->n_adminq, cmd);

    if (ret != DDI_SUCCESS) {
        dev_err(nvme->n_dip, CE_WARN,
            "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
        nvme_free_cmd(cmd);
        return (DDI_FAILURE);
    }

    return (DDI_SUCCESS);
}
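
/*
 * nvme_get_logpage -- read a log page into a freshly allocated buffer
 *
 * The variable argument is the namespace ID and is currently only used for
 * the health/SMART log page; the error and firmware slot log pages are
 * requested for the whole controller. The caller is responsible for freeing
 * the returned buffer, whose size is implied by the log page type.
 */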
static void *
nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...)
{
    nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
    void *buf = NULL;
    nvme_getlogpage_t getlogpage = { 0 };
    size_t bufsize;
    va_list ap;

    va_start(ap, logpage);

    cmd->nc_sqid = 0;
    cmd->nc_callback = nvme_wakeup_cmd;
    cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;

    getlogpage.b.lp_lid = logpage;

    switch (logpage) {
    case NVME_LOGPAGE_ERROR:
        cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
        bufsize = nvme->n_error_log_len *
            sizeof (nvme_error_log_entry_t);
        break;

    case NVME_LOGPAGE_HEALTH:
        cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
        bufsize = sizeof (nvme_health_log_t);
        break;

    case NVME_LOGPAGE_FWSLOT:
        cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
        bufsize = sizeof (nvme_fwslot_log_t);
        break;

    default:
        dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
            logpage);
        atomic_inc_32(&nvme->n_unknown_logpage);
        goto fail;
    }

    va_end(ap);

    getlogpage.b.lp_numd = bufsize / sizeof (uint32_t) - 1;

    cmd->nc_sqe.sqe_cdw10 = getlogpage.r;

    if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
        DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
        dev_err(nvme->n_dip, CE_WARN,
            "!nvme_zalloc_dma failed for GET LOG PAGE");
        goto fail;
    }

    if (cmd->nc_dma->nd_ncookie > 2) {
        dev_err(nvme->n_dip, CE_WARN,
            "!too many DMA cookies for GET LOG PAGE");
        atomic_inc_32(&nvme->n_too_many_cookies);
        goto fail;
    }

    cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
    if (cmd->nc_dma->nd_ncookie > 1) {
        ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
            &cmd->nc_dma->nd_cookie);
        cmd->nc_sqe.sqe_dptr.d_prp[1] =
            cmd->nc_dma->nd_cookie.dmac_laddress;
    }

    if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
        dev_err(nvme->n_dip, CE_WARN,
            "!nvme_admin_cmd failed for GET LOG PAGE");
        return (NULL);
    }

    if (nvme_check_cmd_status(cmd)) {
        dev_err(nvme->n_dip, CE_WARN,
            "!GET LOG PAGE failed with sct = %x, sc = %x",
            cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
        goto fail;
    }

    buf = kmem_alloc(bufsize, KM_SLEEP);
    bcopy(cmd->nc_dma->nd_memp, buf, bufsize);

fail:
    nvme_free_cmd(cmd);

    return (buf);
}

static void *
nvme_identify(nvme_t *nvme, uint32_t nsid)
{
    nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
    void *buf = NULL;

    cmd->nc_sqid = 0;
    cmd->nc_callback = nvme_wakeup_cmd;
    cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
    cmd->nc_sqe.sqe_nsid = nsid;
    cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;

    if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
        &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
        dev_err(nvme->n_dip, CE_WARN,
            "!nvme_zalloc_dma failed for IDENTIFY");
        goto fail;
    }

    if (cmd->nc_dma->nd_ncookie > 2) {
        dev_err(nvme->n_dip, CE_WARN,
            "!too many DMA cookies for IDENTIFY");
        atomic_inc_32(&nvme->n_too_many_cookies);
        goto fail;
    }

    cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
    if (cmd->nc_dma->nd_ncookie > 1) {
        ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
            &cmd->nc_dma->nd_cookie);
        cmd->nc_sqe.sqe_dptr.d_prp[1] =
            cmd->nc_dma->nd_cookie.dmac_laddress;
    }

    if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
        dev_err(nvme->n_dip, CE_WARN,
            "!nvme_admin_cmd failed for IDENTIFY");
        return (NULL);
    }

    if (nvme_check_cmd_status(cmd)) {
        dev_err(nvme->n_dip, CE_WARN,
            "!IDENTIFY failed with sct = %x, sc = %x",
            cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
        goto fail;
    }

    buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
    bcopy(cmd->nc_dma->nd_memp, buf, NVME_IDENTIFY_BUFSIZE);

fail:
    nvme_free_cmd(cmd);

    return (buf);
}

static boolean_t
nvme_set_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t val,
    uint32_t *res)
{
    _NOTE(ARGUNUSED(nsid));
    nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
    boolean_t ret = B_FALSE;

    ASSERT(res != NULL);

    cmd->nc_sqid = 0;
    cmd->nc_callback = nvme_wakeup_cmd;
    cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
    cmd->nc_sqe.sqe_cdw10 = feature;
    cmd->nc_sqe.sqe_cdw11 = val;

    switch (feature) {
    case NVME_FEAT_WRITE_CACHE:
        if (!nvme->n_write_cache_present)
            goto fail;
        break;

    case NVME_FEAT_NQUEUES:
        break;

    default:
        goto fail;
    }

    if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
        dev_err(nvme->n_dip, CE_WARN,
            "!nvme_admin_cmd failed for SET FEATURES");
        return (ret);
    }

    if (nvme_check_cmd_status(cmd)) {
        dev_err(nvme->n_dip, CE_WARN,
            "!SET FEATURES %d failed with sct = %x, sc = %x",
            feature, cmd->nc_cqe.cqe_sf.sf_sct,
            cmd->nc_cqe.cqe_sf.sf_sc);
        goto fail;
    }

    *res = cmd->nc_cqe.cqe_dw0;
    ret = B_TRUE;

fail:
    nvme_free_cmd(cmd);
    return (ret);
}

static boolean_t
nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
{
    nvme_write_cache_t nwc = { 0 };

    if (enable)
        nwc.b.wc_wce = 1;

    if (!nvme_set_features(nvme, 0, NVME_FEAT_WRITE_CACHE, nwc.r, &nwc.r))
        return (B_FALSE);

    return (B_TRUE);
}

static int
nvme_set_nqueues(nvme_t *nvme, uint16_t nqueues)
{
    nvme_nqueue_t nq = { 0 };

    nq.b.nq_nsq = nq.b.nq_ncq = nqueues - 1;

    if (!nvme_set_features(nvme, 0, NVME_FEAT_NQUEUES, nq.r, &nq.r)) {
        return (0);
    }

    /*
     * Always use the same number of submission and completion queues, and
     * never use more than the requested number of queues.
     */
    return (MIN(nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1));
}
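
/*
 * nvme_create_io_qpair -- register an allocated queue pair with the hardware
 *
 * Issues CREATE CQUEUE followed by CREATE SQUEUE for the queue pair at index
 * idx. The completion queue is bound to interrupt vector idx % n_intr_cnt,
 * which is what allows several queues to share a vector when fewer vectors
 * than queues are available.
 */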
1714 */ 1715 return (MIN(nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1)); 1716 } 1717 1718 static int 1719 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) 1720 { 1721 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1722 nvme_create_queue_dw10_t dw10 = { 0 }; 1723 nvme_create_cq_dw11_t c_dw11 = { 0 }; 1724 nvme_create_sq_dw11_t s_dw11 = { 0 }; 1725 1726 dw10.b.q_qid = idx; 1727 dw10.b.q_qsize = qp->nq_nentry - 1; 1728 1729 c_dw11.b.cq_pc = 1; 1730 c_dw11.b.cq_ien = 1; 1731 c_dw11.b.cq_iv = idx % nvme->n_intr_cnt; 1732 1733 cmd->nc_sqid = 0; 1734 cmd->nc_callback = nvme_wakeup_cmd; 1735 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; 1736 cmd->nc_sqe.sqe_cdw10 = dw10.r; 1737 cmd->nc_sqe.sqe_cdw11 = c_dw11.r; 1738 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress; 1739 1740 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { 1741 dev_err(nvme->n_dip, CE_WARN, 1742 "!nvme_admin_cmd failed for CREATE CQUEUE"); 1743 return (DDI_FAILURE); 1744 } 1745 1746 if (nvme_check_cmd_status(cmd)) { 1747 dev_err(nvme->n_dip, CE_WARN, 1748 "!CREATE CQUEUE failed with sct = %x, sc = %x", 1749 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 1750 nvme_free_cmd(cmd); 1751 return (DDI_FAILURE); 1752 } 1753 1754 nvme_free_cmd(cmd); 1755 1756 s_dw11.b.sq_pc = 1; 1757 s_dw11.b.sq_cqid = idx; 1758 1759 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1760 cmd->nc_sqid = 0; 1761 cmd->nc_callback = nvme_wakeup_cmd; 1762 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; 1763 cmd->nc_sqe.sqe_cdw10 = dw10.r; 1764 cmd->nc_sqe.sqe_cdw11 = s_dw11.r; 1765 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; 1766 1767 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { 1768 dev_err(nvme->n_dip, CE_WARN, 1769 "!nvme_admin_cmd failed for CREATE SQUEUE"); 1770 return (DDI_FAILURE); 1771 } 1772 1773 if (nvme_check_cmd_status(cmd)) { 1774 dev_err(nvme->n_dip, CE_WARN, 1775 "!CREATE SQUEUE failed with sct = %x, sc = %x", 1776 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 1777 nvme_free_cmd(cmd); 1778 return (DDI_FAILURE); 1779 } 1780 1781 nvme_free_cmd(cmd); 1782 1783 return (DDI_SUCCESS); 1784 } 1785 1786 static boolean_t 1787 nvme_reset(nvme_t *nvme, boolean_t quiesce) 1788 { 1789 nvme_reg_csts_t csts; 1790 int i; 1791 1792 nvme_put32(nvme, NVME_REG_CC, 0); 1793 1794 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1795 if (csts.b.csts_rdy == 1) { 1796 nvme_put32(nvme, NVME_REG_CC, 0); 1797 for (i = 0; i != nvme->n_timeout * 10; i++) { 1798 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1799 if (csts.b.csts_rdy == 0) 1800 break; 1801 1802 if (quiesce) 1803 drv_usecwait(50000); 1804 else 1805 delay(drv_usectohz(50000)); 1806 } 1807 } 1808 1809 nvme_put32(nvme, NVME_REG_AQA, 0); 1810 nvme_put32(nvme, NVME_REG_ASQ, 0); 1811 nvme_put32(nvme, NVME_REG_ACQ, 0); 1812 1813 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1814 return (csts.b.csts_rdy == 0 ? 
B_TRUE : B_FALSE); 1815 } 1816 1817 static void 1818 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce) 1819 { 1820 nvme_reg_cc_t cc; 1821 nvme_reg_csts_t csts; 1822 int i; 1823 1824 ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT); 1825 1826 cc.r = nvme_get32(nvme, NVME_REG_CC); 1827 cc.b.cc_shn = mode & 0x3; 1828 nvme_put32(nvme, NVME_REG_CC, cc.r); 1829 1830 for (i = 0; i != 10; i++) { 1831 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1832 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) 1833 break; 1834 1835 if (quiesce) 1836 drv_usecwait(100000); 1837 else 1838 delay(drv_usectohz(100000)); 1839 } 1840 } 1841 1842 1843 static void 1844 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) 1845 { 1846 /* 1847 * Section 7.7 of the spec describes how to get a unique ID for 1848 * the controller: the vendor ID, the model name and the serial 1849 * number shall be unique when combined. 1850 * 1851 * If a namespace has no EUI64 we use the above and add the hex 1852 * namespace ID to get a unique ID for the namespace. 1853 */ 1854 char model[sizeof (nvme->n_idctl->id_model) + 1]; 1855 char serial[sizeof (nvme->n_idctl->id_serial) + 1]; 1856 1857 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 1858 bcopy(nvme->n_idctl->id_serial, serial, 1859 sizeof (nvme->n_idctl->id_serial)); 1860 1861 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 1862 serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; 1863 1864 nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X", 1865 nvme->n_idctl->id_vid, model, serial, nsid); 1866 } 1867 1868 static int 1869 nvme_init(nvme_t *nvme) 1870 { 1871 nvme_reg_cc_t cc = { 0 }; 1872 nvme_reg_aqa_t aqa = { 0 }; 1873 nvme_reg_asq_t asq = { 0 }; 1874 nvme_reg_acq_t acq = { 0 }; 1875 nvme_reg_cap_t cap; 1876 nvme_reg_vs_t vs; 1877 nvme_reg_csts_t csts; 1878 int i = 0; 1879 int nqueues; 1880 char model[sizeof (nvme->n_idctl->id_model) + 1]; 1881 char *vendor, *product; 1882 1883 /* Check controller version */ 1884 vs.r = nvme_get32(nvme, NVME_REG_VS); 1885 nvme->n_version.v_major = vs.b.vs_mjr; 1886 nvme->n_version.v_minor = vs.b.vs_mnr; 1887 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d", 1888 nvme->n_version.v_major, nvme->n_version.v_minor); 1889 1890 if (NVME_VERSION_HIGHER(&nvme->n_version, 1891 nvme_version_major, nvme_version_minor)) { 1892 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d", 1893 nvme_version_major, nvme_version_minor); 1894 if (nvme->n_strict_version) 1895 goto fail; 1896 } 1897 1898 /* retrieve controller configuration */ 1899 cap.r = nvme_get64(nvme, NVME_REG_CAP); 1900 1901 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { 1902 dev_err(nvme->n_dip, CE_WARN, 1903 "!NVM command set not supported by hardware"); 1904 goto fail; 1905 } 1906 1907 nvme->n_nssr_supported = cap.b.cap_nssrs; 1908 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; 1909 nvme->n_timeout = cap.b.cap_to; 1910 nvme->n_arbitration_mechanisms = cap.b.cap_ams; 1911 nvme->n_cont_queues_reqd = cap.b.cap_cqr; 1912 nvme->n_max_queue_entries = cap.b.cap_mqes + 1; 1913 1914 /* 1915 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify 1916 * the base page size of 4k (1<<12), so add 12 here to get the real 1917 * page size value. 1918 */ 1919 nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), 1920 cap.b.cap_mpsmax + 12); 1921 nvme->n_pagesize = 1UL << (nvme->n_pageshift); 1922 1923 /* 1924 * Set up Queue DMA to transfer at least 1 page-aligned page at a time. 
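* In the common case of MPSMIN = MPSMAX = 0 the calculation above yields an n_pagesize of 4096, so the attributes below ask for 4k-aligned allocations of at least 4k.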
1925 */ 1926 nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; 1927 nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 1928 1929 /* 1930 * Set up PRP DMA to transfer 1 page-aligned page at a time. 1931 * Maxxfer may be increased after we identified the controller limits. 1932 */ 1933 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; 1934 nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 1935 nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; 1936 nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1; 1937 1938 /* 1939 * Reset controller if it's still in ready state. 1940 */ 1941 if (nvme_reset(nvme, B_FALSE) == B_FALSE) { 1942 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); 1943 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1944 nvme->n_dead = B_TRUE; 1945 goto fail; 1946 } 1947 1948 /* 1949 * Create the admin queue pair. 1950 */ 1951 if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) 1952 != DDI_SUCCESS) { 1953 dev_err(nvme->n_dip, CE_WARN, 1954 "!unable to allocate admin qpair"); 1955 goto fail; 1956 } 1957 nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); 1958 nvme->n_ioq[0] = nvme->n_adminq; 1959 1960 nvme->n_progress |= NVME_ADMIN_QUEUE; 1961 1962 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 1963 "admin-queue-len", nvme->n_admin_queue_len); 1964 1965 aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; 1966 asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; 1967 acq = nvme->n_adminq->nq_cqdma->nd_cookie.dmac_laddress; 1968 1969 ASSERT((asq & (nvme->n_pagesize - 1)) == 0); 1970 ASSERT((acq & (nvme->n_pagesize - 1)) == 0); 1971 1972 nvme_put32(nvme, NVME_REG_AQA, aqa.r); 1973 nvme_put64(nvme, NVME_REG_ASQ, asq); 1974 nvme_put64(nvme, NVME_REG_ACQ, acq); 1975 1976 cc.b.cc_ams = 0; /* use Round-Robin arbitration */ 1977 cc.b.cc_css = 0; /* use NVM command set */ 1978 cc.b.cc_mps = nvme->n_pageshift - 12; 1979 cc.b.cc_shn = 0; /* no shutdown in progress */ 1980 cc.b.cc_en = 1; /* enable controller */ 1981 cc.b.cc_iosqes = 6; /* submission queue entry is 2^6 bytes long */ 1982 cc.b.cc_iocqes = 4; /* completion queue entry is 2^4 bytes long */ 1983 1984 nvme_put32(nvme, NVME_REG_CC, cc.r); 1985 1986 /* 1987 * Wait for the controller to become ready. 1988 */ 1989 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1990 if (csts.b.csts_rdy == 0) { 1991 for (i = 0; i != nvme->n_timeout * 10; i++) { 1992 delay(drv_usectohz(50000)); 1993 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1994 1995 if (csts.b.csts_cfs == 1) { 1996 dev_err(nvme->n_dip, CE_WARN, 1997 "!controller fatal status at init"); 1998 ddi_fm_service_impact(nvme->n_dip, 1999 DDI_SERVICE_LOST); 2000 nvme->n_dead = B_TRUE; 2001 goto fail; 2002 } 2003 2004 if (csts.b.csts_rdy == 1) 2005 break; 2006 } 2007 } 2008 2009 if (csts.b.csts_rdy == 0) { 2010 dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); 2011 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2012 nvme->n_dead = B_TRUE; 2013 goto fail; 2014 } 2015 2016 /* 2017 * Assume an abort command limit of 1. We'll destroy and re-init 2018 * that later when we know the true abort command limit. 2019 */ 2020 sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); 2021 2022 /* 2023 * Setup initial interrupt for admin queue. 
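* A single vector is sufficient at this point; the interrupts are re-negotiated further down, after IDENTIFY, based on the number of CPUs and the number of I/O queues the controller grants.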
2024 */ 2025 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1) 2026 != DDI_SUCCESS) && 2027 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1) 2028 != DDI_SUCCESS) && 2029 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) 2030 != DDI_SUCCESS)) { 2031 dev_err(nvme->n_dip, CE_WARN, 2032 "!failed to setup initial interrupt"); 2033 goto fail; 2034 } 2035 2036 /* 2037 * Post an asynchronous event command to catch errors. 2038 */ 2039 if (nvme_async_event(nvme) != DDI_SUCCESS) { 2040 dev_err(nvme->n_dip, CE_WARN, 2041 "!failed to post async event"); 2042 goto fail; 2043 } 2044 2045 /* 2046 * Identify Controller 2047 */ 2048 nvme->n_idctl = nvme_identify(nvme, 0); 2049 if (nvme->n_idctl == NULL) { 2050 dev_err(nvme->n_dip, CE_WARN, 2051 "!failed to identify controller"); 2052 goto fail; 2053 } 2054 2055 /* 2056 * Get Vendor & Product ID 2057 */ 2058 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2059 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2060 sata_split_model(model, &vendor, &product); 2061 2062 if (vendor == NULL) 2063 nvme->n_vendor = strdup("NVMe"); 2064 else 2065 nvme->n_vendor = strdup(vendor); 2066 2067 nvme->n_product = strdup(product); 2068 2069 /* 2070 * Get controller limits. 2071 */ 2072 nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, 2073 MIN(nvme->n_admin_queue_len / 10, 2074 MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); 2075 2076 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2077 "async-event-limit", nvme->n_async_event_limit); 2078 2079 nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; 2080 2081 /* 2082 * Reinitialize the semaphore with the true abort command limit 2083 * supported by the hardware. It's not necessary to disable interrupts 2084 * as only command aborts use the semaphore, and no commands are 2085 * executed or aborted while we're here. 2086 */ 2087 sema_destroy(&nvme->n_abort_sema); 2088 sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, 2089 SEMA_DRIVER, NULL); 2090 2091 nvme->n_progress |= NVME_CTRL_LIMITS; 2092 2093 if (nvme->n_idctl->id_mdts == 0) 2094 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; 2095 else 2096 nvme->n_max_data_transfer_size = 2097 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); 2098 2099 nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; 2100 2101 /* 2102 * Limit n_max_data_transfer_size to what we can handle in one PRP. 2103 * Chained PRPs are currently unsupported. 2104 * 2105 * This is a no-op on hardware which doesn't support a transfer size 2106 * big enough to require chained PRPs. 2107 */ 2108 nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, 2109 (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); 2110 2111 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; 2112 2113 /* 2114 * Make sure the minimum/maximum queue entry sizes are not 2115 * larger/smaller than the default. 2116 */ 2117 2118 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || 2119 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || 2120 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || 2121 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) 2122 goto fail; 2123 2124 /* 2125 * Check for the presence of a Volatile Write Cache. If present, 2126 * enable or disable based on the value of the property 2127 * volatile-write-cache-enable (default is enabled). 2128 */ 2129 nvme->n_write_cache_present = 2130 nvme->n_idctl->id_vwc.vwc_present == 0 ? 
B_FALSE : B_TRUE; 2131 2132 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2133 "volatile-write-cache-present", 2134 nvme->n_write_cache_present ? 1 : 0); 2135 2136 if (!nvme->n_write_cache_present) { 2137 nvme->n_write_cache_enabled = B_FALSE; 2138 } else if (!nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)) { 2139 dev_err(nvme->n_dip, CE_WARN, 2140 "!failed to %sable volatile write cache", 2141 nvme->n_write_cache_enabled ? "en" : "dis"); 2142 /* 2143 * Assume the cache is (still) enabled. 2144 */ 2145 nvme->n_write_cache_enabled = B_TRUE; 2146 } 2147 2148 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2149 "volatile-write-cache-enable", 2150 nvme->n_write_cache_enabled ? 1 : 0); 2151 2152 /* 2153 * Grab a copy of all mandatory log pages. 2154 * 2155 * TODO: should go away once user space tool exists to print logs 2156 */ 2157 nvme->n_error_log = (nvme_error_log_entry_t *) 2158 nvme_get_logpage(nvme, NVME_LOGPAGE_ERROR); 2159 nvme->n_health_log = (nvme_health_log_t *) 2160 nvme_get_logpage(nvme, NVME_LOGPAGE_HEALTH, -1); 2161 nvme->n_fwslot_log = (nvme_fwslot_log_t *) 2162 nvme_get_logpage(nvme, NVME_LOGPAGE_FWSLOT); 2163 2164 /* 2165 * Identify Namespaces 2166 */ 2167 nvme->n_namespace_count = nvme->n_idctl->id_nn; 2168 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * 2169 nvme->n_namespace_count, KM_SLEEP); 2170 2171 for (i = 0; i != nvme->n_namespace_count; i++) { 2172 nvme_identify_nsid_t *idns; 2173 int last_rp; 2174 2175 nvme->n_ns[i].ns_nvme = nvme; 2176 nvme->n_ns[i].ns_idns = idns = nvme_identify(nvme, i + 1); 2177 2178 if (idns == NULL) { 2179 dev_err(nvme->n_dip, CE_WARN, 2180 "!failed to identify namespace %d", i + 1); 2181 goto fail; 2182 } 2183 2184 nvme->n_ns[i].ns_id = i + 1; 2185 nvme->n_ns[i].ns_block_count = idns->id_nsize; 2186 nvme->n_ns[i].ns_block_size = 2187 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2188 nvme->n_ns[i].ns_best_block_size = nvme->n_ns[i].ns_block_size; 2189 2190 /* 2191 * Get the EUI64 if present. If not present, prepare the devid 2192 * from other device data. 2193 */ 2194 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2195 bcopy(idns->id_eui64, nvme->n_ns[i].ns_eui64, 2196 sizeof (nvme->n_ns[i].ns_eui64)); 2197 2198 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 2199 if (*(uint64_t *)nvme->n_ns[i].ns_eui64 == 0) { 2200 nvme_prepare_devid(nvme, nvme->n_ns[i].ns_id); 2201 } else { 2202 /* 2203 * Until EUI64 support is tested on real hardware we 2204 * will ignore namespaces with an EUI64. This can 2205 * be overridden by setting strict-version=0 in nvme.conf. 2206 */ 2207 if (nvme->n_strict_version) 2208 nvme->n_ns[i].ns_ignore = B_TRUE; 2209 } 2210 2211 /* 2212 * Find the LBA format with no metadata and the best relative 2213 * performance. A value of 3 means "degraded", 0 is best.
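* As an illustration with made-up values: a namespace offering LBA format 0 with 512 byte blocks at RP 2 and LBA format 1 with 4096 byte blocks at RP 0, both without metadata, ends up with a ns_best_block_size of 4096, while ns_block_size keeps the size of the currently formatted LBA format.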
2214 */ 2215 last_rp = 3; 2216 for (int j = 0; j <= idns->id_nlbaf; j++) { 2217 if (idns->id_lbaf[j].lbaf_lbads == 0) 2218 break; 2219 if (idns->id_lbaf[j].lbaf_ms != 0) 2220 continue; 2221 if (idns->id_lbaf[j].lbaf_rp >= last_rp) 2222 continue; 2223 last_rp = idns->id_lbaf[j].lbaf_rp; 2224 nvme->n_ns[i].ns_best_block_size = 2225 1 << idns->id_lbaf[j].lbaf_lbads; 2226 } 2227 2228 if (nvme->n_ns[i].ns_best_block_size < nvme->n_min_block_size) 2229 nvme->n_ns[i].ns_best_block_size = 2230 nvme->n_min_block_size; 2231 2232 /* 2233 * We currently don't support namespaces that use either: 2234 * - thin provisioning 2235 * - protection information 2236 */ 2237 if (idns->id_nsfeat.f_thin || 2238 idns->id_dps.dp_pinfo) { 2239 dev_err(nvme->n_dip, CE_WARN, 2240 "!ignoring namespace %d, unsupported features: " 2241 "thin = %d, pinfo = %d", i + 1, 2242 idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo); 2243 nvme->n_ns[i].ns_ignore = B_TRUE; 2244 } 2245 } 2246 2247 /* 2248 * Try to set up MSI/MSI-X interrupts. 2249 */ 2250 if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) 2251 != 0) { 2252 nvme_release_interrupts(nvme); 2253 2254 nqueues = MIN(UINT16_MAX, ncpus); 2255 2256 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 2257 nqueues) != DDI_SUCCESS) && 2258 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 2259 nqueues) != DDI_SUCCESS)) { 2260 dev_err(nvme->n_dip, CE_WARN, 2261 "!failed to setup MSI/MSI-X interrupts"); 2262 goto fail; 2263 } 2264 } 2265 2266 nqueues = nvme->n_intr_cnt; 2267 2268 /* 2269 * Create I/O queue pairs. 2270 */ 2271 nvme->n_ioq_count = nvme_set_nqueues(nvme, nqueues); 2272 if (nvme->n_ioq_count == 0) { 2273 dev_err(nvme->n_dip, CE_WARN, 2274 "!failed to set number of I/O queues to %d", nqueues); 2275 goto fail; 2276 } 2277 2278 /* 2279 * Reallocate I/O queue array 2280 */ 2281 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); 2282 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * 2283 (nvme->n_ioq_count + 1), KM_SLEEP); 2284 nvme->n_ioq[0] = nvme->n_adminq; 2285 2286 /* 2287 * If we got fewer queues than we asked for, we might as well give 2288 * some of the interrupt vectors back to the system. 2289 */ 2290 if (nvme->n_ioq_count < nqueues) { 2291 nvme_release_interrupts(nvme); 2292 2293 if (nvme_setup_interrupts(nvme, nvme->n_intr_type, 2294 nvme->n_ioq_count) != DDI_SUCCESS) { 2295 dev_err(nvme->n_dip, CE_WARN, 2296 "!failed to reduce number of interrupts"); 2297 goto fail; 2298 } 2299 } 2300 2301 /* 2302 * Alloc & register I/O queue pairs 2303 */ 2304 nvme->n_io_queue_len = 2305 MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries); 2306 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len", 2307 nvme->n_io_queue_len); 2308 2309 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 2310 if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len, 2311 &nvme->n_ioq[i], i) != DDI_SUCCESS) { 2312 dev_err(nvme->n_dip, CE_WARN, 2313 "!unable to allocate I/O qpair %d", i); 2314 goto fail; 2315 } 2316 2317 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) 2318 != DDI_SUCCESS) { 2319 dev_err(nvme->n_dip, CE_WARN, 2320 "!unable to create I/O qpair %d", i); 2321 goto fail; 2322 } 2323 } 2324 2325 /* 2326 * Post more asynchronous event commands to reduce event reporting 2327 * latency as suggested by the spec.
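* One request was already posted before IDENTIFY, so the loop below only adds the remaining n_async_event_limit - 1 requests.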
2328 */ 2329 for (i = 1; i != nvme->n_async_event_limit; i++) { 2330 if (nvme_async_event(nvme) != DDI_SUCCESS) { 2331 dev_err(nvme->n_dip, CE_WARN, 2332 "!failed to post async event %d", i); 2333 goto fail; 2334 } 2335 } 2336 2337 return (DDI_SUCCESS); 2338 2339 fail: 2340 (void) nvme_reset(nvme, B_FALSE); 2341 return (DDI_FAILURE); 2342 } 2343 2344 static uint_t 2345 nvme_intr(caddr_t arg1, caddr_t arg2) 2346 { 2347 /*LINTED: E_PTR_BAD_CAST_ALIGN*/ 2348 nvme_t *nvme = (nvme_t *)arg1; 2349 int inum = (int)(uintptr_t)arg2; 2350 int ccnt = 0; 2351 int qnum; 2352 nvme_cmd_t *cmd; 2353 2354 if (inum >= nvme->n_intr_cnt) 2355 return (DDI_INTR_UNCLAIMED); 2356 2357 /* 2358 * The interrupt vector a queue uses is calculated as queue_idx % 2359 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array 2360 * in steps of n_intr_cnt to process all queues using this vector. 2361 */ 2362 for (qnum = inum; 2363 qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL; 2364 qnum += nvme->n_intr_cnt) { 2365 while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) { 2366 taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq, 2367 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); 2368 ccnt++; 2369 } 2370 } 2371 2372 return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); 2373 } 2374 2375 static void 2376 nvme_release_interrupts(nvme_t *nvme) 2377 { 2378 int i; 2379 2380 for (i = 0; i < nvme->n_intr_cnt; i++) { 2381 if (nvme->n_inth[i] == NULL) 2382 break; 2383 2384 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 2385 (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); 2386 else 2387 (void) ddi_intr_disable(nvme->n_inth[i]); 2388 2389 (void) ddi_intr_remove_handler(nvme->n_inth[i]); 2390 (void) ddi_intr_free(nvme->n_inth[i]); 2391 } 2392 2393 kmem_free(nvme->n_inth, nvme->n_inth_sz); 2394 nvme->n_inth = NULL; 2395 nvme->n_inth_sz = 0; 2396 2397 nvme->n_progress &= ~NVME_INTERRUPTS; 2398 } 2399 2400 static int 2401 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) 2402 { 2403 int nintrs, navail, count; 2404 int ret; 2405 int i; 2406 2407 if (nvme->n_intr_types == 0) { 2408 ret = ddi_intr_get_supported_types(nvme->n_dip, 2409 &nvme->n_intr_types); 2410 if (ret != DDI_SUCCESS) { 2411 dev_err(nvme->n_dip, CE_WARN, 2412 "!%s: ddi_intr_get_supported types failed", 2413 __func__); 2414 return (ret); 2415 } 2416 #ifdef __x86 2417 if (get_hwenv() == HW_VMWARE) 2418 nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX; 2419 #endif 2420 } 2421 2422 if ((nvme->n_intr_types & intr_type) == 0) 2423 return (DDI_FAILURE); 2424 2425 ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); 2426 if (ret != DDI_SUCCESS) { 2427 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", 2428 __func__); 2429 return (ret); 2430 } 2431 2432 ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); 2433 if (ret != DDI_SUCCESS) { 2434 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", 2435 __func__); 2436 return (ret); 2437 } 2438 2439 /* We want at most one interrupt per queue pair. 
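* Any vectors beyond that would go unused, so navail is clamped to the number of queue pairs requested.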
*/ 2440 if (navail > nqpairs) 2441 navail = nqpairs; 2442 2443 nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; 2444 nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); 2445 2446 ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, 2447 &count, 0); 2448 if (ret != DDI_SUCCESS) { 2449 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", 2450 __func__); 2451 goto fail; 2452 } 2453 2454 nvme->n_intr_cnt = count; 2455 2456 ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); 2457 if (ret != DDI_SUCCESS) { 2458 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", 2459 __func__); 2460 goto fail; 2461 } 2462 2463 for (i = 0; i < count; i++) { 2464 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, 2465 (void *)nvme, (void *)(uintptr_t)i); 2466 if (ret != DDI_SUCCESS) { 2467 dev_err(nvme->n_dip, CE_WARN, 2468 "!%s: ddi_intr_add_handler failed", __func__); 2469 goto fail; 2470 } 2471 } 2472 2473 (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); 2474 2475 for (i = 0; i < count; i++) { 2476 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 2477 ret = ddi_intr_block_enable(&nvme->n_inth[i], 1); 2478 else 2479 ret = ddi_intr_enable(nvme->n_inth[i]); 2480 2481 if (ret != DDI_SUCCESS) { 2482 dev_err(nvme->n_dip, CE_WARN, 2483 "!%s: enabling interrupt %d failed", __func__, i); 2484 goto fail; 2485 } 2486 } 2487 2488 nvme->n_intr_type = intr_type; 2489 2490 nvme->n_progress |= NVME_INTERRUPTS; 2491 2492 return (DDI_SUCCESS); 2493 2494 fail: 2495 nvme_release_interrupts(nvme); 2496 2497 return (ret); 2498 } 2499 2500 static int 2501 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) 2502 { 2503 _NOTE(ARGUNUSED(arg)); 2504 2505 pci_ereport_post(dip, fm_error, NULL); 2506 return (fm_error->fme_status); 2507 } 2508 2509 static int 2510 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2511 { 2512 nvme_t *nvme; 2513 int instance; 2514 int nregs; 2515 off_t regsize; 2516 int i; 2517 char name[32]; 2518 2519 if (cmd != DDI_ATTACH) 2520 return (DDI_FAILURE); 2521 2522 instance = ddi_get_instance(dip); 2523 2524 if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) 2525 return (DDI_FAILURE); 2526 2527 nvme = ddi_get_soft_state(nvme_state, instance); 2528 ddi_set_driver_private(dip, nvme); 2529 nvme->n_dip = dip; 2530 2531 nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2532 DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; 2533 nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, 2534 dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? 2535 B_TRUE : B_FALSE; 2536 nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2537 DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); 2538 nvme->n_io_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2539 DDI_PROP_DONTPASS, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN); 2540 nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2541 DDI_PROP_DONTPASS, "async-event-limit", 2542 NVME_DEFAULT_ASYNC_EVENT_LIMIT); 2543 nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2544 DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ? 
2545 B_TRUE : B_FALSE; 2546 nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2547 DDI_PROP_DONTPASS, "min-phys-block-size", 2548 NVME_DEFAULT_MIN_BLOCK_SIZE); 2549 2550 if (!ISP2(nvme->n_min_block_size) || 2551 (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) { 2552 dev_err(dip, CE_WARN, "!min-phys-block-size %s, " 2553 "using default %d", ISP2(nvme->n_min_block_size) ? 2554 "too low" : "not a power of 2", 2555 NVME_DEFAULT_MIN_BLOCK_SIZE); 2556 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 2557 } 2558 2559 if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) 2560 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; 2561 else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) 2562 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; 2563 2564 if (nvme->n_io_queue_len < NVME_MIN_IO_QUEUE_LEN) 2565 nvme->n_io_queue_len = NVME_MIN_IO_QUEUE_LEN; 2566 2567 if (nvme->n_async_event_limit < 1) 2568 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; 2569 2570 nvme->n_reg_acc_attr = nvme_reg_acc_attr; 2571 nvme->n_queue_dma_attr = nvme_queue_dma_attr; 2572 nvme->n_prp_dma_attr = nvme_prp_dma_attr; 2573 nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; 2574 2575 /* 2576 * Setup FMA support. 2577 */ 2578 nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, 2579 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", 2580 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 2581 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); 2582 2583 ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); 2584 2585 if (nvme->n_fm_cap) { 2586 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) 2587 nvme->n_reg_acc_attr.devacc_attr_access = 2588 DDI_FLAGERR_ACC; 2589 2590 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { 2591 nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 2592 nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 2593 } 2594 2595 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 2596 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 2597 pci_ereport_setup(dip); 2598 2599 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 2600 ddi_fm_handler_register(dip, nvme_fm_errcb, 2601 (void *)nvme); 2602 } 2603 2604 nvme->n_progress |= NVME_FMA_INIT; 2605 2606 /* 2607 * The spec defines several register sets. Only the controller 2608 * registers (set 1) are currently used. 2609 */ 2610 if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || 2611 nregs < 2 || 2612 ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) 2613 goto fail; 2614 2615 if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, 2616 &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { 2617 dev_err(dip, CE_WARN, "!failed to map regset 1"); 2618 goto fail; 2619 } 2620 2621 nvme->n_progress |= NVME_REGS_MAPPED; 2622 2623 /* 2624 * Create taskq for command completion. 2625 */ 2626 (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq", 2627 ddi_driver_name(dip), ddi_get_instance(dip)); 2628 nvme->n_cmd_taskq = ddi_taskq_create(dip, name, MIN(UINT16_MAX, ncpus), 2629 TASKQ_DEFAULTPRI, 0); 2630 if (nvme->n_cmd_taskq == NULL) { 2631 dev_err(dip, CE_WARN, "!failed to create cmd taskq"); 2632 goto fail; 2633 } 2634 2635 /* 2636 * Create PRP DMA cache 2637 */ 2638 (void) snprintf(name, sizeof (name), "%s%d_prp_cache", 2639 ddi_driver_name(dip), ddi_get_instance(dip)); 2640 nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t), 2641 0, nvme_prp_dma_constructor, nvme_prp_dma_destructor, 2642 NULL, (void *)nvme, NULL, 0); 2643 2644 if (nvme_init(nvme) != DDI_SUCCESS) 2645 goto fail; 2646 2647 /* 2648 * Attach the blkdev driver for each namespace. 
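* Namespaces marked ns_ignore above (unsupported features, or an EUI64 while strict-version is set) are skipped and get no blkdev instance.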
2649 */ 2650 for (i = 0; i != nvme->n_namespace_count; i++) { 2651 if (nvme->n_ns[i].ns_ignore) 2652 continue; 2653 2654 nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i], 2655 &nvme_bd_ops, &nvme->n_prp_dma_attr, KM_SLEEP); 2656 2657 if (nvme->n_ns[i].ns_bd_hdl == NULL) { 2658 dev_err(dip, CE_WARN, 2659 "!failed to get blkdev handle for namespace %d", i); 2660 goto fail; 2661 } 2662 2663 if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl) 2664 != DDI_SUCCESS) { 2665 dev_err(dip, CE_WARN, 2666 "!failed to attach blkdev handle for namespace %d", 2667 i); 2668 goto fail; 2669 } 2670 } 2671 2672 return (DDI_SUCCESS); 2673 2674 fail: 2675 /* attach successful anyway so that FMA can retire the device */ 2676 if (nvme->n_dead) 2677 return (DDI_SUCCESS); 2678 2679 (void) nvme_detach(dip, DDI_DETACH); 2680 2681 return (DDI_FAILURE); 2682 } 2683 2684 static int 2685 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2686 { 2687 int instance, i; 2688 nvme_t *nvme; 2689 2690 if (cmd != DDI_DETACH) 2691 return (DDI_FAILURE); 2692 2693 instance = ddi_get_instance(dip); 2694 2695 nvme = ddi_get_soft_state(nvme_state, instance); 2696 2697 if (nvme == NULL) 2698 return (DDI_FAILURE); 2699 2700 if (nvme->n_ns) { 2701 for (i = 0; i != nvme->n_namespace_count; i++) { 2702 if (nvme->n_ns[i].ns_bd_hdl) { 2703 (void) bd_detach_handle( 2704 nvme->n_ns[i].ns_bd_hdl); 2705 bd_free_handle(nvme->n_ns[i].ns_bd_hdl); 2706 } 2707 2708 if (nvme->n_ns[i].ns_idns) 2709 kmem_free(nvme->n_ns[i].ns_idns, 2710 sizeof (nvme_identify_nsid_t)); 2711 if (nvme->n_ns[i].ns_devid) 2712 strfree(nvme->n_ns[i].ns_devid); 2713 } 2714 2715 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) * 2716 nvme->n_namespace_count); 2717 } 2718 2719 if (nvme->n_progress & NVME_INTERRUPTS) 2720 nvme_release_interrupts(nvme); 2721 2722 if (nvme->n_cmd_taskq) 2723 ddi_taskq_wait(nvme->n_cmd_taskq); 2724 2725 if (nvme->n_ioq_count > 0) { 2726 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 2727 if (nvme->n_ioq[i] != NULL) { 2728 /* TODO: send destroy queue commands */ 2729 nvme_free_qpair(nvme->n_ioq[i]); 2730 } 2731 } 2732 2733 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) * 2734 (nvme->n_ioq_count + 1)); 2735 } 2736 2737 if (nvme->n_prp_cache != NULL) { 2738 kmem_cache_destroy(nvme->n_prp_cache); 2739 } 2740 2741 if (nvme->n_progress & NVME_REGS_MAPPED) { 2742 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE); 2743 (void) nvme_reset(nvme, B_FALSE); 2744 } 2745 2746 if (nvme->n_cmd_taskq) 2747 ddi_taskq_destroy(nvme->n_cmd_taskq); 2748 2749 if (nvme->n_progress & NVME_CTRL_LIMITS) 2750 sema_destroy(&nvme->n_abort_sema); 2751 2752 if (nvme->n_progress & NVME_ADMIN_QUEUE) 2753 nvme_free_qpair(nvme->n_adminq); 2754 2755 if (nvme->n_idctl) 2756 kmem_free(nvme->n_idctl, sizeof (nvme_identify_ctrl_t)); 2757 2758 if (nvme->n_progress & NVME_REGS_MAPPED) 2759 ddi_regs_map_free(&nvme->n_regh); 2760 2761 if (nvme->n_progress & NVME_FMA_INIT) { 2762 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 2763 ddi_fm_handler_unregister(nvme->n_dip); 2764 2765 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 2766 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 2767 pci_ereport_teardown(nvme->n_dip); 2768 2769 ddi_fm_fini(nvme->n_dip); 2770 } 2771 2772 if (nvme->n_vendor != NULL) 2773 strfree(nvme->n_vendor); 2774 2775 if (nvme->n_product != NULL) 2776 strfree(nvme->n_product); 2777 2778 ddi_soft_state_free(nvme_state, instance); 2779 2780 return (DDI_SUCCESS); 2781 } 2782 2783 static int 2784 nvme_quiesce(dev_info_t *dip) 2785 { 2786 int instance; 2787 nvme_t *nvme; 2788 2789 instance = 
ddi_get_instance(dip); 2790 2791 nvme = ddi_get_soft_state(nvme_state, instance); 2792 2793 if (nvme == NULL) 2794 return (DDI_FAILURE); 2795 2796 nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE); 2797 2798 (void) nvme_reset(nvme, B_TRUE); 2799 2800 return (DDI_FAILURE); 2801 } 2802 2803 static int 2804 nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer) 2805 { 2806 nvme_t *nvme = cmd->nc_nvme; 2807 int nprp_page, nprp; 2808 uint64_t *prp; 2809 2810 if (xfer->x_ndmac == 0) 2811 return (DDI_FAILURE); 2812 2813 cmd->nc_sqe.sqe_dptr.d_prp[0] = xfer->x_dmac.dmac_laddress; 2814 ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac); 2815 2816 if (xfer->x_ndmac == 1) { 2817 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 2818 return (DDI_SUCCESS); 2819 } else if (xfer->x_ndmac == 2) { 2820 cmd->nc_sqe.sqe_dptr.d_prp[1] = xfer->x_dmac.dmac_laddress; 2821 return (DDI_SUCCESS); 2822 } 2823 2824 xfer->x_ndmac--; 2825 2826 nprp_page = nvme->n_pagesize / sizeof (uint64_t) - 1; 2827 ASSERT(nprp_page > 0); 2828 nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page; 2829 2830 /* 2831 * We currently don't support chained PRPs and set up our DMA 2832 * attributes to reflect that. If we still get an I/O request 2833 * that needs a chained PRP something is very wrong. 2834 */ 2835 VERIFY(nprp == 1); 2836 2837 cmd->nc_dma = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP); 2838 bzero(cmd->nc_dma->nd_memp, cmd->nc_dma->nd_len); 2839 2840 cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_dma->nd_cookie.dmac_laddress; 2841 2842 /*LINTED: E_PTR_BAD_CAST_ALIGN*/ 2843 for (prp = (uint64_t *)cmd->nc_dma->nd_memp; 2844 xfer->x_ndmac > 0; 2845 prp++, xfer->x_ndmac--) { 2846 *prp = xfer->x_dmac.dmac_laddress; 2847 ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac); 2848 } 2849 2850 (void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len, 2851 DDI_DMA_SYNC_FORDEV); 2852 return (DDI_SUCCESS); 2853 } 2854 2855 static nvme_cmd_t * 2856 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer) 2857 { 2858 nvme_t *nvme = ns->ns_nvme; 2859 nvme_cmd_t *cmd; 2860 2861 /* 2862 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep. 2863 */ 2864 cmd = nvme_alloc_cmd(nvme, (xfer->x_flags & BD_XFER_POLL) ? 2865 KM_NOSLEEP : KM_SLEEP); 2866 2867 if (cmd == NULL) 2868 return (NULL); 2869 2870 cmd->nc_sqe.sqe_opc = opc; 2871 cmd->nc_callback = nvme_bd_xfer_done; 2872 cmd->nc_xfer = xfer; 2873 2874 switch (opc) { 2875 case NVME_OPC_NVM_WRITE: 2876 case NVME_OPC_NVM_READ: 2877 VERIFY(xfer->x_nblks <= 0x10000); 2878 2879 cmd->nc_sqe.sqe_nsid = ns->ns_id; 2880 2881 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu; 2882 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32); 2883 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1); 2884 2885 if (nvme_fill_prp(cmd, xfer) != DDI_SUCCESS) 2886 goto fail; 2887 break; 2888 2889 case NVME_OPC_NVM_FLUSH: 2890 cmd->nc_sqe.sqe_nsid = ns->ns_id; 2891 break; 2892 2893 default: 2894 goto fail; 2895 } 2896 2897 return (cmd); 2898 2899 fail: 2900 nvme_free_cmd(cmd); 2901 return (NULL); 2902 } 2903 2904 static void 2905 nvme_bd_xfer_done(void *arg) 2906 { 2907 nvme_cmd_t *cmd = arg; 2908 bd_xfer_t *xfer = cmd->nc_xfer; 2909 int error = 0; 2910 2911 error = nvme_check_cmd_status(cmd); 2912 nvme_free_cmd(cmd); 2913 2914 bd_xfer_done(xfer, error); 2915 } 2916 2917 static void 2918 nvme_bd_driveinfo(void *arg, bd_drive_t *drive) 2919 { 2920 nvme_namespace_t *ns = arg; 2921 nvme_t *nvme = ns->ns_nvme; 2922 2923 /* 2924 * blkdev maintains one queue size per instance (namespace), 2925 * but all namespaces share the I/O queues.
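* The advertised depth is therefore split evenly across namespaces, e.g. 4 I/O queues of 1024 entries shared by 2 namespaces yield a d_qsize of 2048 for each blkdev instance.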
2926 * TODO: need to figure out a sane default, or use per-NS I/O queues, 2927 * or change blkdev to handle EAGAIN 2928 */ 2929 drive->d_qsize = nvme->n_ioq_count * nvme->n_io_queue_len 2930 / nvme->n_namespace_count; 2931 2932 /* 2933 * d_maxxfer is not set, which means the value is taken from the DMA 2934 * attributes specified to bd_alloc_handle. 2935 */ 2936 2937 drive->d_removable = B_FALSE; 2938 drive->d_hotpluggable = B_FALSE; 2939 2940 bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64)); 2941 drive->d_target = ns->ns_id; 2942 drive->d_lun = 0; 2943 2944 drive->d_model = nvme->n_idctl->id_model; 2945 drive->d_model_len = sizeof (nvme->n_idctl->id_model); 2946 drive->d_vendor = nvme->n_vendor; 2947 drive->d_vendor_len = strlen(nvme->n_vendor); 2948 drive->d_product = nvme->n_product; 2949 drive->d_product_len = strlen(nvme->n_product); 2950 drive->d_serial = nvme->n_idctl->id_serial; 2951 drive->d_serial_len = sizeof (nvme->n_idctl->id_serial); 2952 drive->d_revision = nvme->n_idctl->id_fwrev; 2953 drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev); 2954 } 2955 2956 static int 2957 nvme_bd_mediainfo(void *arg, bd_media_t *media) 2958 { 2959 nvme_namespace_t *ns = arg; 2960 2961 media->m_nblks = ns->ns_block_count; 2962 media->m_blksize = ns->ns_block_size; 2963 media->m_readonly = B_FALSE; 2964 media->m_solidstate = B_TRUE; 2965 2966 media->m_pblksize = ns->ns_best_block_size; 2967 2968 return (0); 2969 } 2970 2971 static int 2972 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) 2973 { 2974 nvme_t *nvme = ns->ns_nvme; 2975 nvme_cmd_t *cmd; 2976 2977 if (nvme->n_dead) 2978 return (EIO); 2979 2980 /* No polling for now */ 2981 if (xfer->x_flags & BD_XFER_POLL) 2982 return (EIO); 2983 2984 cmd = nvme_create_nvm_cmd(ns, opc, xfer); 2985 if (cmd == NULL) 2986 return (ENOMEM); 2987 2988 cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1; 2989 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 2990 2991 if (nvme_submit_cmd(nvme->n_ioq[cmd->nc_sqid], cmd) 2992 != DDI_SUCCESS) 2993 return (EAGAIN); 2994 2995 return (0); 2996 } 2997 2998 static int 2999 nvme_bd_read(void *arg, bd_xfer_t *xfer) 3000 { 3001 nvme_namespace_t *ns = arg; 3002 3003 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ)); 3004 } 3005 3006 static int 3007 nvme_bd_write(void *arg, bd_xfer_t *xfer) 3008 { 3009 nvme_namespace_t *ns = arg; 3010 3011 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE)); 3012 } 3013 3014 static int 3015 nvme_bd_sync(void *arg, bd_xfer_t *xfer) 3016 { 3017 nvme_namespace_t *ns = arg; 3018 3019 if (ns->ns_nvme->n_dead) 3020 return (EIO); 3021 3022 /* 3023 * If the volatile write cache is not present or not enabled the FLUSH 3024 * command is a no-op, so we can take a shortcut here. 3025 */ 3026 if (!ns->ns_nvme->n_write_cache_present) { 3027 bd_xfer_done(xfer, ENOTSUP); 3028 return (0); 3029 } 3030 3031 if (!ns->ns_nvme->n_write_cache_enabled) { 3032 bd_xfer_done(xfer, 0); 3033 return (0); 3034 } 3035 3036 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH)); 3037 } 3038 3039 static int 3040 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) 3041 { 3042 nvme_namespace_t *ns = arg; 3043 3044 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 3045 if (*(uint64_t *)ns->ns_eui64 != 0) { 3046 return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN, 3047 sizeof (ns->ns_eui64), ns->ns_eui64, devid)); 3048 } else { 3049 return (ddi_devid_init(devinfo, DEVID_ENCAP, 3050 strlen(ns->ns_devid), ns->ns_devid, devid)); 3051 } 3052 } 3053