/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Administration thread
 *
 * - Handles resetting, features, iteration of namespaces, and disk
 *   attachments.  Most admin operations are serialized by the admin thread.
 *
 * - Ioctls, as well as any BIOs which require more sophisticated
 *   processing, are handed to this thread.
 *
 * - Can freeze/resume other queues for various purposes.
 */

#include "nvme.h"

static void nvme_admin_thread(void *arg);
static int nvme_admin_state_identify_ctlr(nvme_softc_t *sc);
static int nvme_admin_state_make_queues(nvme_softc_t *sc);
static int nvme_admin_state_identify_ns(nvme_softc_t *sc);
static int nvme_admin_state_operating(nvme_softc_t *sc);
static int nvme_admin_state_failed(nvme_softc_t *sc);

/*
 * Start the admin thread and block until it says it is running.
 */
int
nvme_start_admin_thread(nvme_softc_t *sc)
{
        int error;

        lockinit(&sc->admin_lk, "admlk", 0, 0);
        sc->admin_signal = 0;

        error = bus_setup_intr(sc->dev, sc->irq[0], INTR_MPSAFE,
                               nvme_intr, &sc->comqueues[0],
                               &sc->irq_handle[0], NULL);
        if (error) {
                device_printf(sc->dev, "unable to install interrupt\n");
                return error;
        }
        lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
        kthread_create(nvme_admin_thread, sc, &sc->admintd, "nvme_admin");
        while ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
                lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwbeg", 0);
        lockmgr(&sc->admin_lk, LK_RELEASE);

        return 0;
}

/*
 * Stop the admin thread and block until it says it is done.
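 *
 * The shutdown is staged: first wait for the initial probe to finish,
 * then detach the disks while the thread is still polling, and only
 * then ask the thread itself to exit.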
 */
void
nvme_stop_admin_thread(nvme_softc_t *sc)
{
        uint32_t i;

        atomic_set_int(&sc->admin_signal, ADMIN_SIG_STOP);

        /*
         * We have to wait for the admin thread to finish its probe
         * before shutting it down.
         */
        lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
        while ((sc->admin_signal & ADMIN_SIG_PROBED) == 0)
                lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
        lockmgr(&sc->admin_lk, LK_RELEASE);

        /*
         * Disconnect our disks while the admin thread is still running,
         * ensuring that the poll works even if interrupts are broken.
         * Otherwise we could deadlock in the devfs core.
         */
        for (i = 0; i < NVME_MAX_NAMESPACES; ++i) {
                nvme_softns_t *nsc;

                if ((nsc = sc->nscary[i]) != NULL) {
                        nvme_disk_detach(nsc);

                        kfree(nsc, M_NVME);
                        sc->nscary[i] = NULL;
                }
        }

        /*
         * Ask the admin thread to shut down.
         */
        lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
        wakeup(&sc->admin_signal);
        while (sc->admin_signal & ADMIN_SIG_RUNNING)
                lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
        lockmgr(&sc->admin_lk, LK_RELEASE);
        if (sc->irq_handle[0]) {
                bus_teardown_intr(sc->dev, sc->irq[0], sc->irq_handle[0]);
                sc->irq_handle[0] = NULL;
        }
        lockuninit(&sc->admin_lk);

        /*
         * The thread might still be running on another cpu.  Give it time
         * to actually exit before returning, in case the caller is about
         * to unload the module.  Otherwise the sleep is not strictly
         * needed.
         */
        nvme_os_sleep(1);
}

static
void
nvme_admin_thread(void *arg)
{
        nvme_softc_t *sc = arg;
        uint32_t i;

        lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
        atomic_set_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
        wakeup(&sc->admin_signal);

        sc->admin_func = nvme_admin_state_identify_ctlr;

        while ((sc->admin_signal & ADMIN_SIG_STOP) == 0) {
                /*
                 * Poll the admin completion queue (index 0) and all
                 * configured I/O completion queues so progress is made
                 * even when interrupts are not working.
                 */
                for (i = 0; i <= sc->niocomqs; ++i) {
                        nvme_comqueue_t *comq = &sc->comqueues[i];

                        if (comq->nqe == 0)     /* not configured */
                                continue;

                        lockmgr(&comq->lk, LK_EXCLUSIVE);
                        nvme_poll_completions(comq, &comq->lk);
                        lockmgr(&comq->lk, LK_RELEASE);
                }
                if (sc->admin_signal & ADMIN_SIG_REQUEUE) {
                        atomic_clear_int(&sc->admin_signal, ADMIN_SIG_REQUEUE);
                        nvme_disk_requeues(sc);
                }
                if (sc->admin_func(sc) == 0 &&
                    (sc->admin_signal & ADMIN_SIG_RUN_MASK) == 0) {
                        lksleep(&sc->admin_signal, &sc->admin_lk, 0,
                                "nvidle", hz);
                }
        }

        /*
         * Cleanup state.
         *
         * Note that we actually issue delete queue commands here.  The NVMe
         * spec says that for a normal shutdown the I/O queues should be
         * deleted prior to issuing the shutdown in the CONFIG register.
         */
        for (i = 1; i <= sc->niosubqs; ++i) {
                nvme_delete_subqueue(sc, i);
                nvme_free_subqueue(sc, i);
        }
        for (i = 1; i <= sc->niocomqs; ++i) {
                nvme_delete_comqueue(sc, i);
                nvme_free_comqueue(sc, i);
        }

        /*
         * Signal that we are done.
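         *
         * Clear RUNNING and issue the wakeup while still holding admin_lk
         * so that nvme_stop_admin_thread cannot miss the transition.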
         */
        atomic_clear_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
        wakeup(&sc->admin_signal);
        lockmgr(&sc->admin_lk, LK_RELEASE);
}

/*
 * Identify the controller
 */
static
int
nvme_admin_state_identify_ctlr(nvme_softc_t *sc)
{
        nvme_request_t *req;
        nvme_ident_ctlr_data_t *rp;
        int status;
        char serial[20+16];
        char model[40+16];

        /*
         * Identify Controller
         */
        req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
        req->cmd.identify.cns = NVME_CNS_CTLR;
        req->cmd.identify.cntid = 0;
        bzero(req->info, sizeof(*req->info));
        nvme_submit_request(req);
        status = nvme_wait_request(req, hz);
        /* XXX handle status */

        sc->idctlr = req->info->idctlr;
        nvme_put_request(req);

        rp = &sc->idctlr;

        KKASSERT(sizeof(sc->idctlr.serialno) == 20);
        KKASSERT(sizeof(sc->idctlr.modelno) == 40);

        /*
         * The serial number and model fields are space-padded and not
         * NUL-terminated; copy them into terminated buffers and clean
         * them up for display.
         */
        bzero(serial, sizeof(serial));
        bzero(model, sizeof(model));
        bcopy(rp->serialno, serial, sizeof(rp->serialno));
        bcopy(rp->modelno, model, sizeof(rp->modelno));
        string_cleanup(serial, 0);
        string_cleanup(model, 0);

        device_printf(sc->dev, "Model %s BaseSerial %s nscount=%d\n",
                      model, serial, rp->ns_count);

        sc->admin_func = nvme_admin_state_make_queues;

        return 1;
}

/*
 * Request and create the I/O queues.  Figure out CPU mapping optimizations.
 */
static
int
nvme_admin_state_make_queues(nvme_softc_t *sc)
{
        nvme_request_t *req;
        uint16_t niosubqs;
        uint16_t niocomqs;
        uint32_t i;
        uint16_t qno;
        int status;
        int error;

        /*
         * Calculate how many I/O queues (non-inclusive of the admin queue)
         * we want to have, up to 65535.  dw0 in the response returns the
         * number of queues the controller gives us.  Submission and
         * completion queues are specified separately.
         *
         * This driver runs optimally with two submission queues (one for
         * reads, one for writes) and one completion queue per cpu.
         *
         * +1 for dumps                 XXX future
         * +1 for async events          XXX future
         */
        req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

        niosubqs = ncpus * 2 + 0;
        niocomqs = ncpus + 0;
        if (niosubqs > NVME_MAX_QUEUES)
                niosubqs = NVME_MAX_QUEUES;
        if (niocomqs > NVME_MAX_QUEUES)
                niocomqs = NVME_MAX_QUEUES;
        device_printf(sc->dev, "Request %u/%u queues, ", niosubqs, niocomqs);

        req->cmd.setfeat.flags = NVME_FID_NUMQUEUES;
        req->cmd.setfeat.numqs.nsqr = niosubqs - 1;     /* 0's based (0=1) */
        req->cmd.setfeat.numqs.ncqr = niocomqs - 1;     /* 0's based (0=1) */

        nvme_submit_request(req);

        /*
         * Get the response and set our operations mode.  The controller
         * may grant fewer queues than requested.
         */
        status = nvme_wait_request(req, hz);
        /* XXX handle status */

        if (status == 0) {
                sc->niosubqs = 1 + (req->res.setfeat.dw0 & 0xFFFFU);
                sc->niocomqs = 1 + ((req->res.setfeat.dw0 >> 16) & 0xFFFFU);
        } else {
                sc->niosubqs = 0;
                sc->niocomqs = 0;
        }
        kprintf("Returns %u/%u queues, ", sc->niosubqs, sc->niocomqs);

        nvme_put_request(req);

        sc->dumpqno = 0;
        sc->eventqno = 0;

        if (sc->niosubqs >= ncpus * 2 + 0 && sc->niocomqs >= ncpus + 0) {
                /*
                 * If we got all the queues we wanted do a full-bore setup of
                 * qmap[cpu][type].
                 *
                 * Remember that subq 0 / comq 0 is the admin queue.
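                 *
                 * Each cpu gets a read and a write submission queue, both
                 * vectored to that cpu's own completion queue (cpuqno).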
                 */
                kprintf("optimal map\n");
                qno = 1;
                for (i = 0; i < ncpus; ++i) {
                        int cpuqno = sc->cputovect[i];

                        KKASSERT(cpuqno != 0);
                        sc->qmap[i][0] = qno + 0;
                        sc->qmap[i][1] = qno + 1;
                        sc->subqueues[qno + 0].comqid = cpuqno;
                        sc->subqueues[qno + 1].comqid = cpuqno;
                        qno += 2;
                }
                sc->niosubqs = ncpus * 2 + 0;
                sc->niocomqs = ncpus + 0;
        } else if (sc->niosubqs >= ncpus && sc->niocomqs >= ncpus) {
                /*
                 * We have enough to give each cpu its own submission
                 * and completion queue.
                 *
                 * Leave dumpqno and eventqno set to the admin queue.
                 */
                kprintf("nominal map 1:1 cpu\n");
                for (i = 0; i < ncpus; ++i) {
                        qno = sc->cputovect[i];
                        KKASSERT(qno != 0);
                        sc->qmap[i][0] = qno;
                        sc->qmap[i][1] = qno;
                        sc->subqueues[qno].comqid = qno;
                }
                sc->niosubqs = ncpus;
                sc->niocomqs = ncpus;
        } else if (sc->niosubqs >= 2 && sc->niocomqs >= 2) {
                /*
                 * We have enough queues to separate and prioritize reads
                 * and writes, but all cpus have to share the same submission
                 * queues.  Completion queues are split up between cpus
                 * as much as possible.
                 *
                 * Leave dumpqno and eventqno set to the admin queue.
                 */
                kprintf("rw-sep map\n");
                qno = 1;
                for (i = 0; i < ncpus; ++i) {
                        int cpuqno = sc->cputovect[i];

                        KKASSERT(qno != 0);
                        sc->qmap[i][0] = qno + 0;       /* read */
                        sc->qmap[i][1] = qno + 1;       /* write */
                        if (i <= 0)
                                sc->subqueues[qno + 0].comqid = cpuqno;
                        if (i <= 1)
                                sc->subqueues[qno + 1].comqid = cpuqno;
                        /* do not increment qno */
                }
                sc->niosubqs = 2;
                sc->niocomqs = 2;
        } else if (sc->niosubqs >= 2) {
                /*
                 * We have enough to have separate read and write queues.
                 */
                kprintf("basic map\n");
                qno = 1;
                for (i = 0; i < ncpus; ++i) {
                        int cpuqno = sc->cputovect[i];

                        KKASSERT(qno != 0);
                        sc->qmap[i][0] = qno + 0;       /* read */
                        sc->qmap[i][1] = qno + 1;       /* write */
                        if (i <= 0)
                                sc->subqueues[qno + 0].comqid = cpuqno;
                        if (i <= 1)
                                sc->subqueues[qno + 1].comqid = cpuqno;
                }
                sc->niosubqs = 2;
                sc->niocomqs = 1;
        } else {
                /*
                 * Minimal configuration, all cpus and I/O types use the
                 * same queue.  Sad day.
                 */
                kprintf("minimal map\n");
                sc->dumpqno = 0;
                sc->eventqno = 0;
                for (i = 0; i < ncpus; ++i) {
                        sc->qmap[i][0] = 1;
                        sc->qmap[i][1] = 1;
                }
                sc->subqueues[1].comqid = 1;
                sc->niosubqs = 1;
                sc->niocomqs = 1;
        }

        /*
         * Create all I/O submission and completion queues.  The I/O
         * queues start at 1 and are inclusive of niosubqs and niocomqs.
         *
         * NOTE: Completion queues must be created before submission queues.
         *       That is, the completion queue specified when creating a
         *       submission queue must already exist.
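         *
         * Any failure below is accumulated in 'error' and sends the
         * state machine to nvme_admin_state_failed.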
         */
        error = 0;
        for (i = 1; i <= sc->niocomqs; ++i) {
                error += nvme_alloc_comqueue(sc, i);
                if (error) {
                        device_printf(sc->dev, "Unable to allocate comqs\n");
                        break;
                }
                error += nvme_create_comqueue(sc, i);
        }
        for (i = 1; i <= sc->niosubqs; ++i) {
                error += nvme_alloc_subqueue(sc, i);
                if (error) {
                        device_printf(sc->dev, "Unable to allocate subqs\n");
                        break;
                }
                error += nvme_create_subqueue(sc, i);
        }

        if (error) {
                device_printf(sc->dev, "Failed to initialize device!\n");
                sc->admin_func = nvme_admin_state_failed;
        } else {
                sc->admin_func = nvme_admin_state_identify_ns;
        }

        /*
         * Interrupt coalescing is basically worthless if we care about
         * performance, at least on the Intel 750.  Setting the threshold
         * has no effect if time is set to 0.  The smallest time that can
         * be set is a value of 1 (== 100us), which is much too long.  That
         * is only 10,000 interrupts/sec/cpu and on the Intel 750 it totally
         * destroys sequential performance, so disable coalescing entirely.
         */
        req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

        device_printf(sc->dev, "Interrupt Coalesce: disabled\n");

        req->cmd.setfeat.flags = NVME_FID_INTCOALESCE;
        req->cmd.setfeat.intcoal.thr = 0;
        req->cmd.setfeat.intcoal.time = 0;

        nvme_submit_request(req);
        status = nvme_wait_request(req, hz);
        if (status) {
                device_printf(sc->dev,
                              "Interrupt coalesce failed status=%d\n",
                              status);
        }
        nvme_put_request(req);

        return 1;
}

/*
 * Identify available namespaces, iterate, and attach to disks.
 */
static
int
nvme_admin_state_identify_ns(nvme_softc_t *sc)
{
        nvme_request_t *req;
        nvme_ident_ns_list_t *rp;
        int status;
        uint32_t i;
        int j;

        if (bootverbose) {
                if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE)
                        device_printf(sc->dev,
                                      "Namespace management supported\n");
                else
                        device_printf(sc->dev,
                                      "Namespace management not supported\n");
        }
#if 0
        /*
         * Identify Controllers         TODO TODO TODO
         */
        if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
                req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
                req->cmd.identify.cns = NVME_CNS_ANY_CTLR_LIST;
                req->cmd.identify.cntid = 0;
                bzero(req->info, sizeof(*req->info));
                nvme_submit_request(req);
                status = nvme_wait_request(req, hz);
                kprintf("nsquery status %08x\n", status);

#if 0
                for (i = 0; i < req->info->ctlrlist.idcount; ++i) {
                        kprintf("CTLR %04x\n", req->info->ctlrlist.ctlrids[i]);
                }
#endif
                nvme_put_request(req);
        }
#endif

        rp = kmalloc(sizeof(*rp), M_NVME, M_WAITOK | M_ZERO);
        if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
                /*
                 * Namespace management supported, query active namespaces.
                 */
                req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
                req->cmd.identify.cns = NVME_CNS_ACT_NSLIST;
                req->cmd.identify.cntid = 0;
                bzero(req->info, sizeof(*req->info));
                nvme_submit_request(req);
                status = nvme_wait_request(req, hz);
                kprintf("nsquery status %08x\n", status);
                /* XXX handle status */

                cpu_lfence();
                *rp = req->info->nslist;
                nvme_put_request(req);
        } else {
                /*
                 * Namespace management not supported, assume nsids 1..N.
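                 * The identify NSID list holds at most 1024 entries, so
                 * any namespaces beyond that are ignored here.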
                 */
                for (i = 1; i <= sc->idctlr.ns_count && i <= 1024; ++i)
                        rp->nsids[i-1] = i;
        }

        /*
         * Identify each Namespace
         */
        for (i = 0; i < 1024; ++i) {
                nvme_softns_t *nsc;
                nvme_lba_fmt_data_t *lbafmt;

                if (rp->nsids[i] == 0)
                        continue;
                req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
                req->cmd.identify.cns = NVME_CNS_ACT_NS;
                req->cmd.identify.cntid = 0;
                req->cmd.identify.head.nsid = rp->nsids[i];
                bzero(req->info, sizeof(*req->info));
                nvme_submit_request(req);
                status = nvme_wait_request(req, hz);
                if (status != 0) {
                        kprintf("NS FAILED %08x\n", status);
                        continue;
                }

                /*
                 * Find the nscary slot already assigned to this nsid,
                 * if any.
                 */
                for (j = 0; j < NVME_MAX_NAMESPACES; ++j) {
                        if (sc->nscary[j] &&
                            sc->nscary[j]->nsid == rp->nsids[i])
                                break;
                }

                /*
                 * Otherwise prefer slot i, falling back to the highest
                 * unused slot.
                 */
                if (j == NVME_MAX_NAMESPACES) {
                        j = i;
                        if (sc->nscary[j] != NULL) {
                                for (j = NVME_MAX_NAMESPACES - 1; j >= 0; --j) {
                                        if (sc->nscary[j] == NULL)
                                                break;
                                }
                        }
                }
                if (j < 0) {
                        device_printf(sc->dev, "not enough room in nscary for "
                                               "namespace %08x\n", rp->nsids[i]);
                        nvme_put_request(req);
                        continue;
                }
                nsc = sc->nscary[j];
                if (nsc == NULL) {
                        nsc = kmalloc(sizeof(*nsc), M_NVME, M_WAITOK | M_ZERO);
                        nsc->unit = nvme_alloc_disk_unit();
                        sc->nscary[j] = nsc;
                }
                if (sc->nscmax <= j)
                        sc->nscmax = j + 1;
                nsc->sc = sc;
                nsc->nsid = rp->nsids[i];
                nsc->state = NVME_NSC_STATE_UNATTACHED;
                nsc->idns = req->info->idns;
                bioq_init(&nsc->bioq);
                lockinit(&nsc->lk, "nvnsc", 0, 0);

                nvme_put_request(req);

                j = NVME_FLBAS_SEL_GET(nsc->idns.flbas);
                lbafmt = &nsc->idns.lba_fmt[j];
                nsc->blksize = 1 << lbafmt->sect_size;

                /*
                 * Attach the namespace
                 */
                nvme_disk_attach(nsc);
        }
        kfree(rp, M_NVME);

        sc->admin_func = nvme_admin_state_operating;
        return 1;
}

static
int
nvme_admin_state_operating(nvme_softc_t *sc)
{
        if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
                atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
                wakeup(&sc->admin_signal);
        }

        return 0;
}

static
int
nvme_admin_state_failed(nvme_softc_t *sc)
{
        if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
                atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
                wakeup(&sc->admin_signal);
        }

        return 0;
}