1 /* 2 * Copyright (c) 2016 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include "nvme.h" 36 37 static int nvme_pci_attach(device_t); 38 static int nvme_pci_detach(device_t); 39 40 static const nvme_device_t nvme_devices[] = { 41 /* Vendor-specific table goes here (see ahci for example) */ 42 { 0, 0, nvme_pci_attach, nvme_pci_detach, "NVME-PCIe" } 43 }; 44 45 static int nvme_msix_enable = 1; 46 TUNABLE_INT("hw.nvme.msix.enable", &nvme_msix_enable); 47 static int nvme_msi_enable = 0; 48 TUNABLE_INT("hw.nvme.msi.enable", &nvme_msi_enable); 49 50 TAILQ_HEAD(, nvme_softc) nvme_sc_list = TAILQ_HEAD_INITIALIZER(nvme_sc_list); 51 struct lock nvme_master_lock = LOCK_INITIALIZER("nvmstr", 0, 0); 52 53 static int last_global_cpu; 54 55 /* 56 * Match during probe and attach. The device does not yet have a softc. 57 */ 58 const nvme_device_t * 59 nvme_lookup_device(device_t dev) 60 { 61 const nvme_device_t *ad; 62 uint16_t vendor = pci_get_vendor(dev); 63 uint16_t product = pci_get_device(dev); 64 uint8_t class = pci_get_class(dev); 65 uint8_t subclass = pci_get_subclass(dev); 66 uint8_t progif = pci_read_config(dev, PCIR_PROGIF, 1); 67 int is_nvme; 68 69 /* 70 * Generally speaking if the pci device does not identify as 71 * AHCI we skip it. 72 */ 73 if (class == PCIC_STORAGE && subclass == PCIS_STORAGE_NVM && 74 progif == PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0) { 75 is_nvme = 1; 76 } else { 77 is_nvme = 0; 78 } 79 80 for (ad = &nvme_devices[0]; ad->vendor; ++ad) { 81 if (ad->vendor == vendor && ad->product == product) 82 return (ad); 83 } 84 85 /* 86 * Last ad is the default match if the PCI device matches SATA. 87 */ 88 if (is_nvme == 0) 89 ad = NULL; 90 return (ad); 91 } 92 93 /* 94 * Attach functions. They all eventually fall through to nvme_pci_attach(). 95 */ 96 static int 97 nvme_pci_attach(device_t dev) 98 { 99 nvme_softc_t *sc = device_get_softc(dev); 100 uint32_t reg; 101 int error; 102 int msi_enable; 103 int msix_enable; 104 105 #if 0 106 if (pci_read_config(dev, PCIR_COMMAND, 2) & 0x0400) { 107 device_printf(dev, "BIOS disabled PCI interrupt, " 108 "re-enabling\n"); 109 pci_write_config(dev, PCIR_COMMAND, 110 pci_read_config(dev, PCIR_COMMAND, 2) & ~0x0400, 2); 111 } 112 #endif 113 114 sc->dev = dev; 115 116 /* 117 * Map the register window 118 */ 119 sc->rid_regs = PCIR_BAR(0); 120 sc->regs = bus_alloc_resource_any(dev, SYS_RES_MEMORY, 121 &sc->rid_regs, RF_ACTIVE); 122 if (sc->regs == NULL) { 123 device_printf(dev, "unable to map registers\n"); 124 nvme_pci_detach(dev); 125 return (ENXIO); 126 } 127 sc->iot = rman_get_bustag(sc->regs); 128 sc->ioh = rman_get_bushandle(sc->regs); 129 130 /* 131 * NVMe allows the MSI-X table to be mapped to BAR 4/5. 132 * Always try to map BAR4, but it's ok if it fails. Must 133 * be done prior to allocating our interrupts. 134 */ 135 sc->rid_bar4 = PCIR_BAR(4); 136 sc->bar4 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, 137 &sc->rid_bar4, RF_ACTIVE); 138 139 /* 140 * Map the interrupt or initial interrupt which will be used for 141 * the admin queue. NVME chipsets can potentially support a huge 142 * number of MSIX vectors but we really only need enough for 143 * available cpus, plus 1. 144 */ 145 msi_enable = device_getenv_int(dev, "msi.enable", nvme_msi_enable); 146 msix_enable = device_getenv_int(dev, "msix.enable", nvme_msix_enable); 147 148 error = 0; 149 if (msix_enable) { 150 int i; 151 int cpu; 152 153 sc->nirqs = pci_msix_count(dev); 154 sc->irq_type = PCI_INTR_TYPE_MSIX; 155 if (sc->nirqs > ncpus + 1) /* max we need */ 156 sc->nirqs = ncpus + 1; 157 158 error = pci_setup_msix(dev); 159 cpu = (last_global_cpu + 0) % ncpus; /* GCC warn */ 160 for (i = 0; error == 0 && i < sc->nirqs; ++i) { 161 cpu = (last_global_cpu + i) % ncpus; 162 error = pci_alloc_msix_vector(dev, i, 163 &sc->rid_irq[i], cpu); 164 if (error) 165 break; 166 sc->irq[i] = bus_alloc_resource_any(dev, SYS_RES_IRQ, 167 &sc->rid_irq[i], 168 RF_ACTIVE); 169 /* 170 * We want this to overwrite queue 0's cpu vector 171 * when the cpu's rotate through later on. 172 */ 173 if (sc->cputovect[cpu] == 0) 174 sc->cputovect[cpu] = i; 175 } 176 177 /* 178 * If we did not iterate enough cpus (that is, there weren't 179 * enough irqs for all available cpus) we still need to 180 * finish or sc->cputovect[] mapping. 181 */ 182 while (error == 0) { 183 cpu = (cpu + 1) % ncpus; 184 i = (i + 1) % sc->nirqs; 185 if (i == 0) 186 i = 1; 187 if (sc->cputovect[cpu] != 0) 188 break; 189 sc->cputovect[cpu] = i; 190 } 191 192 if (error) { 193 while (--i >= 0) { 194 bus_release_resource(dev, SYS_RES_IRQ, 195 sc->rid_irq[i], 196 sc->irq[i]); 197 pci_release_msix_vector(dev, sc->rid_irq[i]); 198 sc->irq[i] = NULL; 199 } 200 /* leave error intact to fall through to normal */ 201 } else { 202 last_global_cpu = (last_global_cpu + sc->nirqs) % ncpus; 203 pci_enable_msix(dev); 204 } 205 } 206 207 /* 208 * If we have to use a normal interrupt we fake the cputovect[] in 209 * order to try to map at least (ncpus) submission queues. The admin 210 * code will limit the number of completion queues to something 211 * reasonable when nirqs is 1 since the single interrupt polls all 212 * completion queues. 213 * 214 * NOTE: We do NOT want to map a single completion queue (#0), because 215 * then an I/O submission and/or completion queue will overlap 216 * the admin submission or completion queue, and that can cause 217 * havoc when admin commands are submitted that don't return 218 * for long periods of time. 219 * 220 * NOTE: Chipsets supporting MSI-X *MIGHT* *NOT* properly support 221 * a normal pin-based level interrupt. For example, the BPX 222 * NVMe SSD just leaves the level interrupt stuck on. Do not 223 * disable MSI-X unless you have no choice. 224 */ 225 if (msix_enable == 0 || error) { 226 uint32_t irq_flags; 227 int i; 228 229 error = 0; 230 sc->nirqs = 1; 231 sc->irq_type = pci_alloc_1intr(dev, msi_enable, 232 &sc->rid_irq[0], &irq_flags); 233 sc->irq[0] = bus_alloc_resource_any(dev, SYS_RES_IRQ, 234 &sc->rid_irq[0], irq_flags); 235 236 for (i = 0; i < ncpus; ++i) 237 sc->cputovect[i] = i + 1; 238 } 239 if (sc->irq[0] == NULL) { 240 device_printf(dev, "unable to map interrupt\n"); 241 nvme_pci_detach(dev); 242 return (ENXIO); 243 } else { 244 const char *type; 245 switch(sc->irq_type) { 246 case PCI_INTR_TYPE_MSI: 247 type = "MSI"; 248 break; 249 case PCI_INTR_TYPE_MSIX: 250 type = "MSIX"; 251 break; 252 default: 253 type = "normal-int"; 254 break; 255 } 256 device_printf(dev, "mapped %d %s IRQs\n", sc->nirqs, type); 257 } 258 259 /* 260 * Make sure the chip is disabled, which will reset all controller 261 * registers except for the admin queue registers. Device should 262 * already be disabled so this is usually instantanious. Use a 263 * fixed 5-second timeout in case it is not. I'd like my other 264 * reads to occur after the device has been disabled. 265 */ 266 sc->entimo = hz * 5; 267 error = nvme_enable(sc, 0); 268 if (error) { 269 nvme_pci_detach(dev); 270 return (ENXIO); 271 } 272 273 /* 274 * Get capabillities and version and report 275 */ 276 sc->vers = nvme_read(sc, NVME_REG_VERS); 277 sc->cap = nvme_read8(sc, NVME_REG_CAP); 278 sc->maxqe = NVME_CAP_MQES_GET(sc->cap); 279 sc->dstrd4 = NVME_CAP_DSTRD_GET(sc->cap); 280 281 device_printf(dev, "NVME Version %u.%u maxqe=%u caps=%016jx\n", 282 NVME_VERS_MAJOR_GET(sc->vers), 283 NVME_VERS_MINOR_GET(sc->vers), 284 sc->maxqe, sc->cap); 285 286 /* 287 * Enable timeout, 500ms increments. Convert to ticks. 288 */ 289 sc->entimo = NVME_CAP_TIMEOUT_GET(sc->cap) * hz / 2; /* in ticks */ 290 ++sc->entimo; /* fudge */ 291 292 /* 293 * Validate maxqe. To cap the amount of memory we reserve for 294 * PRPs we limit maxqe to 256. Also make sure it is a power of 295 * two. 296 */ 297 if (sc->maxqe < 2) { 298 device_printf(dev, 299 "Attach failed, max queue entries (%d) " 300 "below minimum (2)\n", sc->maxqe); 301 nvme_pci_detach(dev); 302 return (ENXIO); 303 } 304 if (sc->maxqe > 256) 305 sc->maxqe = 256; 306 for (reg = 2; reg <= sc->maxqe; reg <<= 1) 307 ; 308 sc->maxqe = reg >> 1; 309 310 /* 311 * DMA tags 312 * 313 * PRP - Worst case PRPs needed per queue is MAXPHYS / PAGE_SIZE 314 * (typically 64), multiplied by maxqe (typ 256). Roughly 315 * ~128KB per queue. Align for cache performance. We actually 316 * need one more PRP per queue entry worst-case to handle 317 * buffer overlap, but we have an extra one in the command 318 * structure so we don't have to calculate that out. 319 * 320 * Remember that we intend to allocate potentially many queues, 321 * so we don't want to bloat this too much. A queue depth of 322 * 256 is plenty. 323 * 324 * CMD - Storage for the submit queue. maxqe * 64 (~16KB) 325 * 326 * RES - Storage for the completion queue. maxqe * 16 (~4KB) 327 * 328 * ADM - Storage for admin command DMA data. Maximum admin command 329 * DMA data is 4KB so reserve maxqe * 4KB (~1MB). There is only 330 * one admin queue. 331 * 332 * NOTE: There are no boundary requirements for NVMe, but I specify a 333 * 4MB boundary anyway because this reduces mass-bit flipping 334 * of address bits inside the controller when incrementing 335 * DMA addresses. Why not? Can't hurt. 336 */ 337 sc->prp_bytes = sizeof(uint64_t) * (MAXPHYS / PAGE_SIZE) * sc->maxqe; 338 sc->cmd_bytes = sizeof(nvme_subq_item_t) * sc->maxqe; 339 sc->res_bytes = sizeof(nvme_comq_item_t) * sc->maxqe; 340 sc->adm_bytes = NVME_MAX_ADMIN_BUFFER * sc->maxqe; 341 342 error = 0; 343 344 error += bus_dma_tag_create( 345 NULL, /* parent tag */ 346 PAGE_SIZE, /* alignment */ 347 4 * 1024 * 1024, /* boundary */ 348 BUS_SPACE_MAXADDR, /* loaddr? */ 349 BUS_SPACE_MAXADDR, /* hiaddr */ 350 NULL, /* filter */ 351 NULL, /* filterarg */ 352 sc->prp_bytes, /* [max]size */ 353 1, /* maxsegs */ 354 sc->prp_bytes, /* maxsegsz */ 355 0, /* flags */ 356 &sc->prps_tag); /* return tag */ 357 358 error += bus_dma_tag_create( 359 NULL, /* parent tag */ 360 PAGE_SIZE, /* alignment */ 361 4 * 1024 * 1024, /* boundary */ 362 BUS_SPACE_MAXADDR, /* loaddr? */ 363 BUS_SPACE_MAXADDR, /* hiaddr */ 364 NULL, /* filter */ 365 NULL, /* filterarg */ 366 sc->cmd_bytes, /* [max]size */ 367 1, /* maxsegs */ 368 sc->cmd_bytes, /* maxsegsz */ 369 0, /* flags */ 370 &sc->sque_tag); /* return tag */ 371 372 error += bus_dma_tag_create( 373 NULL, /* parent tag */ 374 PAGE_SIZE, /* alignment */ 375 4 * 1024 * 1024, /* boundary */ 376 BUS_SPACE_MAXADDR, /* loaddr? */ 377 BUS_SPACE_MAXADDR, /* hiaddr */ 378 NULL, /* filter */ 379 NULL, /* filterarg */ 380 sc->res_bytes, /* [max]size */ 381 1, /* maxsegs */ 382 sc->res_bytes, /* maxsegsz */ 383 0, /* flags */ 384 &sc->cque_tag); /* return tag */ 385 386 error += bus_dma_tag_create( 387 NULL, /* parent tag */ 388 PAGE_SIZE, /* alignment */ 389 4 * 1024 * 1024, /* boundary */ 390 BUS_SPACE_MAXADDR, /* loaddr? */ 391 BUS_SPACE_MAXADDR, /* hiaddr */ 392 NULL, /* filter */ 393 NULL, /* filterarg */ 394 sc->adm_bytes, /* [max]size */ 395 1, /* maxsegs */ 396 sc->adm_bytes, /* maxsegsz */ 397 0, /* flags */ 398 &sc->adm_tag); /* return tag */ 399 400 if (error) { 401 device_printf(dev, "unable to create dma tags\n"); 402 nvme_pci_detach(dev); 403 return (ENXIO); 404 } 405 406 /* 407 * Setup the admin queues (qid 0). 408 */ 409 error = nvme_alloc_subqueue(sc, 0); 410 if (error) { 411 device_printf(dev, "unable to allocate admin subqueue\n"); 412 nvme_pci_detach(dev); 413 return (ENXIO); 414 } 415 error = nvme_alloc_comqueue(sc, 0); 416 if (error) { 417 device_printf(dev, "unable to allocate admin comqueue\n"); 418 nvme_pci_detach(dev); 419 return (ENXIO); 420 } 421 422 /* 423 * Initialize the admin queue registers 424 */ 425 reg = NVME_ATTR_COM_SET(sc->maxqe) | NVME_ATTR_SUB_SET(sc->maxqe); 426 nvme_write(sc, NVME_REG_ADM_ATTR, reg); 427 nvme_write8(sc, NVME_REG_ADM_SUBADR, (uint64_t)sc->subqueues[0].psubq); 428 nvme_write8(sc, NVME_REG_ADM_COMADR, (uint64_t)sc->comqueues[0].pcomq); 429 430 /* 431 * qemu appears to require this, real hardware does not appear 432 * to require this. 433 */ 434 pci_enable_busmaster(dev); 435 436 /* 437 * Other configuration registers 438 */ 439 reg = NVME_CONFIG_IOSUB_ES_SET(6) | /* 64 byte sub entry */ 440 NVME_CONFIG_IOCOM_ES_SET(4) | /* 16 byte com entry */ 441 NVME_CONFIG_MEMPG_SET(PAGE_SHIFT) | /* 4K pages */ 442 NVME_CONFIG_CSS_NVM; /* NVME command set */ 443 nvme_write(sc, NVME_REG_CONFIG, reg); 444 445 reg = nvme_read(sc, NVME_REG_MEMSIZE); 446 447 /* 448 * Enable the chip for operation 449 */ 450 error = nvme_enable(sc, 1); 451 if (error) { 452 nvme_enable(sc, 0); 453 nvme_pci_detach(dev); 454 return (ENXIO); 455 } 456 457 /* 458 * Start the admin thread. This will also setup the admin queue 459 * interrupt. 460 */ 461 error = nvme_start_admin_thread(sc); 462 if (error) { 463 nvme_pci_detach(dev); 464 return (ENXIO); 465 } 466 lockmgr(&nvme_master_lock, LK_EXCLUSIVE); 467 sc->flags |= NVME_SC_ATTACHED; 468 TAILQ_INSERT_TAIL(&nvme_sc_list, sc, entry); 469 lockmgr(&nvme_master_lock, LK_RELEASE); 470 471 return(0); 472 } 473 474 /* 475 * Device unload / detachment 476 */ 477 static int 478 nvme_pci_detach(device_t dev) 479 { 480 nvme_softc_t *sc = device_get_softc(dev); 481 int i; 482 483 /* 484 * Stop the admin thread 485 */ 486 nvme_stop_admin_thread(sc); 487 488 /* 489 * Issue a normal shutdown and wait for completion 490 */ 491 nvme_issue_shutdown(sc, 0); 492 493 /* 494 * Disable the chip 495 */ 496 nvme_enable(sc, 0); 497 498 /* 499 * Free admin memory 500 */ 501 nvme_free_subqueue(sc, 0); 502 nvme_free_comqueue(sc, 0); 503 504 /* 505 * Release related resources. 506 */ 507 for (i = 0; i < sc->nirqs; ++i) { 508 if (sc->irq[i]) { 509 bus_release_resource(dev, SYS_RES_IRQ, 510 sc->rid_irq[i], sc->irq[i]); 511 sc->irq[i] = NULL; 512 if (sc->irq_type == PCI_INTR_TYPE_MSIX) 513 pci_release_msix_vector(dev, sc->rid_irq[i]); 514 } 515 } 516 switch(sc->irq_type) { 517 case PCI_INTR_TYPE_MSI: 518 pci_release_msi(dev); 519 break; 520 case PCI_INTR_TYPE_MSIX: 521 pci_teardown_msix(dev); 522 break; 523 default: 524 break; 525 } 526 527 /* 528 * Release remaining chipset resources 529 */ 530 if (sc->regs) { 531 bus_release_resource(dev, SYS_RES_MEMORY, 532 sc->rid_regs, sc->regs); 533 sc->regs = NULL; 534 } 535 if (sc->bar4) { 536 bus_release_resource(dev, SYS_RES_MEMORY, 537 sc->rid_bar4, sc->bar4); 538 sc->bar4 = NULL; 539 } 540 541 /* 542 * Cleanup the DMA tags 543 */ 544 if (sc->prps_tag) { 545 bus_dma_tag_destroy(sc->prps_tag); 546 sc->prps_tag = NULL; 547 } 548 if (sc->sque_tag) { 549 bus_dma_tag_destroy(sc->sque_tag); 550 sc->sque_tag = NULL; 551 } 552 if (sc->cque_tag) { 553 bus_dma_tag_destroy(sc->cque_tag); 554 sc->cque_tag = NULL; 555 } 556 if (sc->adm_tag) { 557 bus_dma_tag_destroy(sc->adm_tag); 558 sc->adm_tag = NULL; 559 } 560 561 if (sc->flags & NVME_SC_ATTACHED) { 562 lockmgr(&nvme_master_lock, LK_EXCLUSIVE); 563 sc->flags &= ~NVME_SC_ATTACHED; 564 TAILQ_REMOVE(&nvme_sc_list, sc, entry); 565 lockmgr(&nvme_master_lock, LK_RELEASE); 566 } 567 568 return (0); 569 } 570