/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Administration thread
 *
 * - Handles resetting, features, iteration of namespaces, and disk
 *   attachments.  Most admin operations are serialized by the admin thread.
 *
 * - Ioctls as well as any BIOs which require more sophisticated processing
 *   are handed to this thread as well.
 *
 * - Can freeze/resume other queues for various purposes.
 */

#include "nvme.h"

static void nvme_admin_thread(void *arg);
static int nvme_admin_state_identify_ctlr(nvme_softc_t *sc);
static int nvme_admin_state_make_queues(nvme_softc_t *sc);
static int nvme_admin_state_identify_ns(nvme_softc_t *sc);
static int nvme_admin_state_operating(nvme_softc_t *sc);
static int nvme_admin_state_failed(nvme_softc_t *sc);

/*
 * Start the admin thread and block until it says it is running.
 */
int
nvme_start_admin_thread(nvme_softc_t *sc)
{
	int error;

	lockinit(&sc->admin_lk, "admlk", 0, 0);
	lockinit(&sc->ioctl_lk, "nvioc", 0, 0);
	sc->admin_signal = 0;

	error = bus_setup_intr(sc->dev, sc->irq[0], INTR_MPSAFE,
			       nvme_intr, &sc->comqueues[0],
			       &sc->irq_handle[0], NULL);
	if (error) {
		device_printf(sc->dev, "unable to install interrupt\n");
		return error;
	}
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	kthread_create(nvme_admin_thread, sc, &sc->admintd, "nvme_admin");
	while ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwbeg", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);

	return 0;
}

/*
 * Stop the admin thread and block until it says it is done.
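 *
 * The teardown below is ordered deliberately: wait for the probe to
 * finish, detach the disks while the admin thread can still poll
 * completions, then ask the thread itself to exit.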
 */
void
nvme_stop_admin_thread(nvme_softc_t *sc)
{
	uint32_t i;

	atomic_set_int(&sc->admin_signal, ADMIN_SIG_STOP);

	/*
	 * We have to wait for the admin thread to finish its probe
	 * before shutting it down.  Break out if the admin thread
	 * never managed to even start.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	while ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		if ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
			break;
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	}
	lockmgr(&sc->admin_lk, LK_RELEASE);

	/*
	 * Disconnect our disks while the admin thread is still running,
	 * ensuring that the poll works even if interrupts are broken.
	 * Otherwise we could deadlock in the devfs core.
	 */
	for (i = 0; i < NVME_MAX_NAMESPACES; ++i) {
		nvme_softns_t *nsc;

		if ((nsc = sc->nscary[i]) != NULL) {
			nvme_disk_detach(nsc);

			kfree(nsc, M_NVME);
			sc->nscary[i] = NULL;
		}
	}

	/*
	 * Ask the admin thread to shut down.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	wakeup(&sc->admin_signal);
	while (sc->admin_signal & ADMIN_SIG_RUNNING)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);
	if (sc->irq_handle[0]) {
		bus_teardown_intr(sc->dev, sc->irq[0], sc->irq_handle[0]);
		sc->irq_handle[0] = NULL;
	}
	lockuninit(&sc->ioctl_lk);
	lockuninit(&sc->admin_lk);

	/*
	 * The thread might be running on another cpu; give it time to
	 * actually exit before returning, in case the caller is about to
	 * unload the module.  Otherwise we don't need this.
	 */
	nvme_os_sleep(1);
}

static
void
nvme_admin_thread(void *arg)
{
	nvme_softc_t *sc = arg;
	uint32_t i;

	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	atomic_set_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);

	sc->admin_func = nvme_admin_state_identify_ctlr;

	while ((sc->admin_signal & ADMIN_SIG_STOP) == 0) {
		for (i = 0; i <= sc->niocomqs; ++i) {	/* comq 0 is admin */
			nvme_comqueue_t *comq = &sc->comqueues[i];

			if (comq->nqe == 0)	/* not configured */
				continue;

			lockmgr(&comq->lk, LK_EXCLUSIVE);
			nvme_poll_completions(comq, &comq->lk);
			lockmgr(&comq->lk, LK_RELEASE);
		}
		if (sc->admin_signal & ADMIN_SIG_REQUEUE) {
			atomic_clear_int(&sc->admin_signal, ADMIN_SIG_REQUEUE);
			nvme_disk_requeues(sc);
		}
		if (sc->admin_func(sc) == 0 &&
		    (sc->admin_signal & ADMIN_SIG_RUN_MASK) == 0) {
			lksleep(&sc->admin_signal, &sc->admin_lk, 0,
				"nvidle", hz);
		}
	}

	/*
	 * Cleanup state.
	 *
	 * Note that we actually issue delete queue commands here.  The NVMe
	 * spec says that for a normal shutdown the I/O queues should be
	 * deleted prior to issuing the shutdown in the CONFIG register.
	 */
	for (i = 1; i <= sc->niosubqs; ++i) {
		nvme_delete_subqueue(sc, i);
		nvme_free_subqueue(sc, i);
	}
	for (i = 1; i <= sc->niocomqs; ++i) {
		nvme_delete_comqueue(sc, i);
		nvme_free_comqueue(sc, i);
	}

	/*
	 * Signal that we are done.
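	 *
	 * nvme_stop_admin_thread() blocks in lksleep() on admin_signal
	 * waiting for ADMIN_SIG_RUNNING to clear, so it needs both the
	 * flag transition and the wakeup() before it can proceed.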
	 */
	atomic_clear_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);
	lockmgr(&sc->admin_lk, LK_RELEASE);
}

/*
 * Identify the controller
 */
static
int
nvme_admin_state_identify_ctlr(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ctlr_data_t *rp;
	int status;
	uint64_t mempgsize;
	char serial[20+16];
	char model[40+16];

	/*
	 * Identify Controller
	 */
	mempgsize = NVME_CAP_MEMPG_MIN_GET(sc->cap);

	req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
	req->cmd.identify.cns = NVME_CNS_CTLR;
	req->cmd.identify.cntid = 0;
	bzero(req->info, sizeof(*req->info));
	nvme_submit_request(req);
	status = nvme_wait_request(req, hz);
	/* XXX handle status */

	sc->idctlr = req->info->idctlr;
	nvme_put_request(req);

	rp = &sc->idctlr;

	KKASSERT(sizeof(sc->idctlr.serialno) == 20);
	KKASSERT(sizeof(sc->idctlr.modelno) == 40);
	bzero(serial, sizeof(serial));
	bzero(model, sizeof(model));
	bcopy(rp->serialno, serial, sizeof(rp->serialno));
	bcopy(rp->modelno, model, sizeof(rp->modelno));
	string_cleanup(serial, 0);
	string_cleanup(model, 0);

	device_printf(sc->dev, "Model %s BaseSerial %s nscount=%d\n",
		      model, serial, rp->ns_count);

	sc->admin_func = nvme_admin_state_make_queues;

	return 1;
}

/*
 * Request and create the I/O queues.  Figure out CPU mapping optimizations.
 */
static
int
nvme_admin_state_make_queues(nvme_softc_t *sc)
{
	nvme_request_t *req;
	uint16_t niosubqs;
	uint16_t niocomqs;
	uint32_t i;
	uint16_t qno;
	int status;
	int error;

	/*
	 * Calculate how many I/O queues (non-inclusive of the admin queue)
	 * we want to have, up to 65535.  dw0 in the response returns the
	 * number of queues the controller gives us.  Submission and
	 * Completion queues are specified separately.
	 *
	 * This driver would run optimally with 4 submission queues and one
	 * completion queue per cpu (rdhipri, rdlopri, wrhipri, wrlopri);
	 * for now it requests 2 submission queues (read, write) and one
	 * completion queue per cpu.
	 *
	 * +1 for dumps			XXX future
	 * +1 for async events		XXX future
	 */
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

	niosubqs = ncpus * 2 + 0;
	niocomqs = ncpus + 0;
	if (niosubqs > NVME_MAX_QUEUES)
		niosubqs = NVME_MAX_QUEUES;
	if (niocomqs > NVME_MAX_QUEUES)
		niocomqs = NVME_MAX_QUEUES;
	device_printf(sc->dev, "Request %u/%u queues, ", niosubqs, niocomqs);

	req->cmd.setfeat.flags = NVME_FID_NUMQUEUES;
	req->cmd.setfeat.numqs.nsqr = niosubqs - 1;	/* 0's based 0=1 */
	req->cmd.setfeat.numqs.ncqr = niocomqs - 1;	/* 0's based 0=1 */

	nvme_submit_request(req);

	/*
	 * Get the response and set our operations mode.
	 */
	status = nvme_wait_request(req, hz);
	/* XXX handle status */

	if (status == 0) {
		sc->niosubqs = 1 + (req->res.setfeat.dw0 & 0xFFFFU);
		sc->niocomqs = 1 + ((req->res.setfeat.dw0 >> 16) & 0xFFFFU);
	} else {
		sc->niosubqs = 0;
		sc->niocomqs = 0;
	}
	kprintf("Returns %u/%u queues, ", sc->niosubqs, sc->niocomqs);

	nvme_put_request(req);

	sc->dumpqno = 0;
	sc->eventqno = 0;

	if (sc->niosubqs >= ncpus * 2 + 0 && sc->niocomqs >= ncpus + 0) {
		/*
		 * If we got all the queues we wanted do a full-bore setup of
		 * qmap[cpu][type].
		 *
		 * Remember that subq 0 / comq 0 is the admin queue.
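		 *
		 * Each cpu gets its own (read, write) submission queue
		 * pair, and both queues complete on that cpu's
		 * vector-mapped completion queue, cputovect[i].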
		 */
		kprintf("optimal map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = sc->cputovect[i];

			KKASSERT(cpuqno != 0);
			sc->qmap[i][0] = qno + 0;
			sc->qmap[i][1] = qno + 1;
			sc->subqueues[qno + 0].comqid = cpuqno;
			sc->subqueues[qno + 1].comqid = cpuqno;
			qno += 2;
		}
		sc->niosubqs = ncpus * 2 + 0;
		sc->niocomqs = ncpus + 0;
	} else if (sc->niosubqs >= ncpus && sc->niocomqs >= ncpus) {
		/*
		 * We have enough to give each cpu its own submission
		 * and completion queue.
		 *
		 * Leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("nominal map 1:1 cpu\n");
		for (i = 0; i < ncpus; ++i) {
			qno = sc->cputovect[i];
			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno;
			sc->qmap[i][1] = qno;
			sc->subqueues[qno].comqid = qno;
		}
		sc->niosubqs = ncpus;
		sc->niocomqs = ncpus;
	} else if (sc->niosubqs >= 2 && sc->niocomqs >= 2) {
		/*
		 * We have enough queues to separate and prioritize reads
		 * and writes, but all cpus have to share the same submission
		 * queues.  Completion queues are split up between cpus
		 * as much as possible.
		 *
		 * Leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("rw-sep map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = sc->cputovect[i];

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno + 0;	/* read */
			sc->qmap[i][1] = qno + 1;	/* write */
			if (i <= 0)
				sc->subqueues[qno + 0].comqid = cpuqno;
			if (i <= 1)
				sc->subqueues[qno + 1].comqid = cpuqno;
			/* do not increment qno */
		}
		sc->niosubqs = 2;
		sc->niocomqs = 2;
	} else if (sc->niosubqs >= 2) {
		/*
		 * We have enough to have separate read and write queues,
		 * but only one completion queue.
		 */
		kprintf("basic map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = sc->cputovect[i];

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno + 0;	/* read */
			sc->qmap[i][1] = qno + 1;	/* write */
			if (i <= 0)
				sc->subqueues[qno + 0].comqid = cpuqno;
			if (i <= 1)
				sc->subqueues[qno + 1].comqid = cpuqno;
		}
		sc->niosubqs = 2;
		sc->niocomqs = 1;
	} else {
		/*
		 * Minimal configuration, all cpus and I/O types use the
		 * same queue.  Sad day.
		 */
		kprintf("minimal map\n");
		sc->dumpqno = 0;
		sc->eventqno = 0;
		for (i = 0; i < ncpus; ++i) {
			sc->qmap[i][0] = 1;
			sc->qmap[i][1] = 1;
		}
		sc->subqueues[1].comqid = 1;
		sc->niosubqs = 1;
		sc->niocomqs = 1;
	}

	/*
	 * Create all I/O submission and completion queues.  The I/O
	 * queues start at 1 and are inclusive of niosubqs and niocomqs.
	 *
	 * NOTE: Completion queues must be created before submission queues.
	 *	 That is, the completion queue specified when creating a
	 *	 submission queue must already exist.
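	 *
	 *	 The loops below therefore create the completion queues
	 *	 first; each allocation is immediately followed by the
	 *	 corresponding create command to the controller.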
	 */
	error = 0;
	for (i = 1; i <= sc->niocomqs; ++i) {
		error += nvme_alloc_comqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate comqs\n");
			break;
		}
		error += nvme_create_comqueue(sc, i);
	}
	for (i = 1; i <= sc->niosubqs; ++i) {
		error += nvme_alloc_subqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate subqs\n");
			break;
		}
		error += nvme_create_subqueue(sc, i);
	}

	if (error) {
		device_printf(sc->dev, "Failed to initialize device!\n");
		sc->admin_func = nvme_admin_state_failed;
	} else {
		sc->admin_func = nvme_admin_state_identify_ns;
	}

	/*
	 * Interrupt coalescing is basically worthless if we care about
	 * performance, at least on the Intel 750.  Setting the threshold
	 * has no effect if time is set to 0.  The smallest time that can
	 * be set is a value of 1 (== 100uS), which is much too long.  That
	 * is only 10,000 interrupts/sec/cpu and on the Intel 750 it totally
	 * destroys sequential performance, so disable coalescing entirely.
	 */
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

	device_printf(sc->dev, "Interrupt coalescing disabled\n");

	req->cmd.setfeat.flags = NVME_FID_INTCOALESCE;
	req->cmd.setfeat.intcoal.thr = 0;
	req->cmd.setfeat.intcoal.time = 0;

	nvme_submit_request(req);
	status = nvme_wait_request(req, hz);
	if (status) {
		device_printf(sc->dev,
			      "Interrupt coalesce failed status=%d\n",
			      status);
	}
	nvme_put_request(req);

	return 1;
}

/*
 * Identify available namespaces, iterate, and attach to disks.
 */
static
int
nvme_admin_state_identify_ns(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ns_list_t *rp;
	int status;
	uint32_t i;
	int j;			/* signed: nscary scan below counts down */

	if (bootverbose) {
		if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE)
			device_printf(sc->dev,
				      "Namespace management supported\n");
		else
			device_printf(sc->dev,
				      "Namespace management not supported\n");
	}
#if 0
	/*
	 * Identify Controllers		TODO TODO TODO
	 */
	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ANY_CTLR_LIST;
		req->cmd.identify.cntid = 0;
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		kprintf("nsquery status %08x\n", status);

#if 0
		for (i = 0; i < req->info->ctlrlist.idcount; ++i) {
			kprintf("CTLR %04x\n", req->info->ctlrlist.ctlrids[i]);
		}
#endif
		nvme_put_request(req);
	}
#endif

	rp = kmalloc(sizeof(*rp), M_NVME, M_WAITOK | M_ZERO);
	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
		/*
		 * Namespace management supported, query active namespaces.
		 */
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NSLIST;
		req->cmd.identify.cntid = 0;
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		kprintf("nsquery status %08x\n", status);
		/* XXX handle status */

		cpu_lfence();
		*rp = req->info->nslist;
		nvme_put_request(req);
	} else {
		/*
		 * Namespace management not supported, assume nsids 1..N.
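		 *
		 * Without namespace management we assume the controller
		 * exposes namespace IDs 1..ns_count contiguously; the scan
		 * is capped at the 1024 entries nsids[] can hold.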
		 */
		for (i = 1; i <= sc->idctlr.ns_count && i <= 1024; ++i)
			rp->nsids[i-1] = i;
	}

	/*
	 * Identify each Namespace
	 */
	for (i = 0; i < 1024; ++i) {
		nvme_softns_t *nsc;
		nvme_lba_fmt_data_t *lbafmt;

		if (rp->nsids[i] == 0)
			continue;
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NS;
		req->cmd.identify.cntid = 0;
		req->cmd.identify.head.nsid = rp->nsids[i];
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		if (status != 0) {
			kprintf("NS FAILED %08x\n", status);
			continue;
		}

		/*
		 * Find the existing nscary[] entry for this nsid, or
		 * fall back to picking a free slot.
		 */
		for (j = 0; j < NVME_MAX_NAMESPACES; ++j) {
			if (sc->nscary[j] &&
			    sc->nscary[j]->nsid == rp->nsids[i])
				break;
		}
		if (j == NVME_MAX_NAMESPACES) {
			j = i;
			if (sc->nscary[j] != NULL) {
				for (j = NVME_MAX_NAMESPACES - 1; j >= 0; --j) {
					if (sc->nscary[j] == NULL)
						break;
				}
			}
		}
		if (j < 0) {
			device_printf(sc->dev, "not enough room in nscary for "
				      "namespace %08x\n", rp->nsids[i]);
			nvme_put_request(req);
			continue;
		}
		nsc = sc->nscary[j];
		if (nsc == NULL) {
			nsc = kmalloc(sizeof(*nsc), M_NVME, M_WAITOK | M_ZERO);
			nsc->unit = nvme_alloc_disk_unit();
			sc->nscary[j] = nsc;
		}
		if (sc->nscmax <= j)
			sc->nscmax = j + 1;
		nsc->sc = sc;
		nsc->nsid = rp->nsids[i];
		nsc->state = NVME_NSC_STATE_UNATTACHED;
		nsc->idns = req->info->idns;
		bioq_init(&nsc->bioq);
		lockinit(&nsc->lk, "nvnsc", 0, 0);

		nvme_put_request(req);

		j = NVME_FLBAS_SEL_GET(nsc->idns.flbas);
		lbafmt = &nsc->idns.lba_fmt[j];
		nsc->blksize = 1 << lbafmt->sect_size;

		/*
		 * Attach the namespace
		 */
		nvme_disk_attach(nsc);
	}
	kfree(rp, M_NVME);

	sc->admin_func = nvme_admin_state_operating;
	return 1;
}

static
int
nvme_admin_state_operating(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}

	return 0;
}

static
int
nvme_admin_state_failed(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}

	return 0;
}