/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Administration thread
 *
 * - Handles resetting, features, iteration of namespaces, and disk
 *   attachments.  Most admin operations are serialized by the admin thread.
 *
 * - Ioctls as well as any BIOs which require more sophisticated processing
 *   are handed to this thread as well.
 *
 * - Can freeze/resume other queues for various purposes.
 */

#include "nvme.h"

static void nvme_admin_thread(void *arg);
static int nvme_admin_state_identify_ctlr(nvme_softc_t *sc);
static int nvme_admin_state_make_queues(nvme_softc_t *sc);
static int nvme_admin_state_identify_ns(nvme_softc_t *sc);
static int nvme_admin_state_operating(nvme_softc_t *sc);
static int nvme_admin_state_failed(nvme_softc_t *sc);

/*
 * Start the admin thread and block until it says it is running.
 */
int
nvme_start_admin_thread(nvme_softc_t *sc)
{
        int error;

        lockinit(&sc->admin_lk, "admlk", 0, 0);
        sc->admin_signal = 0;

        error = bus_setup_intr(sc->dev, sc->irq[0], INTR_MPSAFE,
                               nvme_intr, &sc->comqueues[0],
                               &sc->irq_handle[0], NULL);
        if (error) {
                device_printf(sc->dev, "unable to install interrupt\n");
                return error;
        }
        lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
        kthread_create(nvme_admin_thread, sc, &sc->admintd, "nvme_admin");
        while ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
                lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwbeg", 0);
        lockmgr(&sc->admin_lk, LK_RELEASE);

        return 0;
}
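
/*
 * Illustrative sketch, not driver code: how a bus attach path might use
 * the start/stop pair in this file.  nvme_attach() here is a hypothetical
 * stand-in for the real attachment code.
 *
 *      static int
 *      nvme_attach(device_t dev)
 *      {
 *              nvme_softc_t *sc = device_get_softc(dev);
 *
 *              ... map registers, set up the admin queue ...
 *              if (nvme_start_admin_thread(sc) != 0)
 *                      return ENXIO;
 *              return 0;
 *      }
 *
 * On detach the order is reversed: nvme_stop_admin_thread() must run
 * before the admin queue and registers are torn down, because the thread
 * polls completions until it is told to stop.
 */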
/*
 * Stop the admin thread and block until it says it is done.
 */
void
nvme_stop_admin_thread(nvme_softc_t *sc)
{
        uint32_t i;

        atomic_set_int(&sc->admin_signal, ADMIN_SIG_STOP);

        /*
         * We have to wait for the admin thread to finish its probe
         * before shutting it down.
         */
        lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
        while ((sc->admin_signal & ADMIN_SIG_PROBED) == 0)
                lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
        lockmgr(&sc->admin_lk, LK_RELEASE);

        /*
         * Disconnect our disks while the admin thread is still running,
         * ensuring that the poll works even if interrupts are broken.
         * Otherwise we could deadlock in the devfs core.
         */
        for (i = 0; i < NVME_MAX_NAMESPACES; ++i) {
                nvme_softns_t *nsc;

                if ((nsc = sc->nscary[i]) != NULL) {
                        nvme_disk_detach(nsc);

                        kfree(nsc, M_NVME);
                        sc->nscary[i] = NULL;
                }
        }

        /*
         * Ask the admin thread to shut down.
         */
        lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
        wakeup(&sc->admin_signal);
        while (sc->admin_signal & ADMIN_SIG_RUNNING)
                lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
        lockmgr(&sc->admin_lk, LK_RELEASE);
        if (sc->irq_handle[0]) {
                bus_teardown_intr(sc->dev, sc->irq[0], sc->irq_handle[0]);
                sc->irq_handle[0] = NULL;
        }
        lockuninit(&sc->admin_lk);

        /*
         * The thread might be running on another cpu; give it time to
         * actually exit before returning in case the caller is about to
         * unload the module.  Otherwise we don't need this.
         */
        nvme_os_sleep(1);
}
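
/*
 * The admin thread below is a small state machine driven through
 * sc->admin_func.  The probe sequence is:
 *
 *      identify_ctlr -> make_queues -> identify_ns -> operating
 *                           |
 *                           +-> failed (on queue setup error)
 *
 * Each state routine returns non-zero while more work is pending and 0
 * once the thread may sleep.  The handshake with the rest of the driver
 * runs over sc->admin_signal:
 *
 *      ADMIN_SIG_RUNNING - set when the thread starts, cleared on exit
 *      ADMIN_SIG_PROBED  - set once a terminal state (operating or
 *                          failed) has been reached
 *      ADMIN_SIG_STOP    - stop request from nvme_stop_admin_thread()
 *      ADMIN_SIG_REQUEUE - asks the thread to call nvme_disk_requeues()
 */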
static
void
nvme_admin_thread(void *arg)
{
        nvme_softc_t *sc = arg;
        uint32_t i;

        lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
        atomic_set_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
        wakeup(&sc->admin_signal);

        sc->admin_func = nvme_admin_state_identify_ctlr;

        while ((sc->admin_signal & ADMIN_SIG_STOP) == 0) {
                for (i = 0; i <= sc->niocomqs; ++i) {
                        nvme_comqueue_t *comq = &sc->comqueues[i];

                        if (comq->nqe == 0)     /* not configured */
                                continue;

                        lockmgr(&comq->lk, LK_EXCLUSIVE);
                        nvme_poll_completions(comq, &comq->lk);
                        lockmgr(&comq->lk, LK_RELEASE);
                }
                if (sc->admin_signal & ADMIN_SIG_REQUEUE) {
                        atomic_clear_int(&sc->admin_signal, ADMIN_SIG_REQUEUE);
                        nvme_disk_requeues(sc);
                }
                if (sc->admin_func(sc) == 0 &&
                    (sc->admin_signal & ADMIN_SIG_RUN_MASK) == 0) {
                        lksleep(&sc->admin_signal, &sc->admin_lk, 0,
                                "nvidle", hz);
                }
        }

        /*
         * Cleanup state.
         *
         * Note that we actually issue delete queue commands here.  The NVME
         * spec says that for a normal shutdown the I/O queues should be
         * deleted prior to issuing the shutdown in the CONFIG register.
         */
        for (i = 1; i <= sc->niosubqs; ++i) {
                nvme_delete_subqueue(sc, i);
                nvme_free_subqueue(sc, i);
        }
        for (i = 1; i <= sc->niocomqs; ++i) {
                nvme_delete_comqueue(sc, i);
                nvme_free_comqueue(sc, i);
        }

        /*
         * Signal that we are done.
         */
        atomic_clear_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
        wakeup(&sc->admin_signal);
        lockmgr(&sc->admin_lk, LK_RELEASE);
}

/*
 * Identify the controller
 */
static
int
nvme_admin_state_identify_ctlr(nvme_softc_t *sc)
{
        nvme_request_t *req;
        nvme_ident_ctlr_data_t *rp;
        int status;
        uint64_t mempgsize;
        char serial[20+16];
        char model[40+16];

        /*
         * Identify Controller
         */
        mempgsize = NVME_CAP_MEMPG_MIN_GET(sc->cap);

        req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
        req->cmd.identify.cns = NVME_CNS_CTLR;
        req->cmd.identify.cntid = 0;
        bzero(req->info, sizeof(*req->info));
        nvme_submit_request(req);
        status = nvme_wait_request(req);
        /* XXX handle status */

        sc->idctlr = req->info->idctlr;
        nvme_put_request(req);

        rp = &sc->idctlr;

        KKASSERT(sizeof(sc->idctlr.serialno) == 20);
        KKASSERT(sizeof(sc->idctlr.modelno) == 40);
        bzero(serial, sizeof(serial));
        bzero(model, sizeof(model));
        bcopy(rp->serialno, serial, sizeof(rp->serialno));
        bcopy(rp->modelno, model, sizeof(rp->modelno));
        string_cleanup(serial, 0);
        string_cleanup(model, 0);

        device_printf(sc->dev, "Model %s BaseSerial %s nscount=%d\n",
                      model, serial, rp->ns_count);

        sc->admin_func = nvme_admin_state_make_queues;

        return 1;
}
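
/*
 * Illustrative sketch, not driver code: every admin command in this file
 * follows the same request lifecycle, e.g. for IDENTIFY:
 *
 *      req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
 *      req->cmd.identify.cns = NVME_CNS_CTLR;  fill in the command
 *      nvme_submit_request(req);               queue it to the controller
 *      status = nvme_wait_request(req);        wait for the completion
 *      ... copy results out of req->info ...   result DMA buffer
 *      nvme_put_request(req);                  recycle the request
 *
 * Results must be copied out of req->info before nvme_put_request(),
 * which makes the request (and with it req->info) available for reuse.
 */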
/*
 * Request and create the I/O queues.  Figure out CPU mapping optimizations.
 */
static
int
nvme_admin_state_make_queues(nvme_softc_t *sc)
{
        nvme_request_t *req;
        uint16_t niosubqs;
        uint16_t niocomqs;
        uint32_t i;
        uint16_t qno;
        int status;
        int error;

        /*
         * Calculate how many I/O queues (non-inclusive of the admin queue)
         * we want to have, up to 65535.  dw0 in the response returns the
         * number of queues the controller gives us.  Submission and
         * Completion queues are specified separately.
         *
         * This driver runs optimally with 4 submission queues (rdhipri,
         * rdlopri, wrhipri, wrlopri) and one completion queue per cpu.
         *
         * +1 for dumps
         * +1 for async events
         */
        req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

        niosubqs = ncpus * 4 + 2;
        niocomqs = ncpus + 2;
        if (niosubqs > NVME_MAX_QUEUES)
                niosubqs = NVME_MAX_QUEUES;
        if (niocomqs > NVME_MAX_QUEUES)
                niocomqs = NVME_MAX_QUEUES;
        device_printf(sc->dev, "Request %u/%u queues, ", niosubqs, niocomqs);

        req->cmd.setfeat.flags = NVME_FID_NUMQUEUES;
        req->cmd.setfeat.numqs.nsqr = niosubqs - 1;     /* 0's based 0=1 */
        req->cmd.setfeat.numqs.ncqr = niocomqs - 1;     /* 0's based 0=1 */

        nvme_submit_request(req);

        /*
         * Get the response and set our operations mode.  The counts
         * returned in dw0 are 0's based as well.
         */
        status = nvme_wait_request(req);
        /* XXX handle status */

        if (status == 0) {
                sc->niosubqs = 1 + (req->res.setfeat.dw0 & 0xFFFFU);
                sc->niocomqs = 1 + ((req->res.setfeat.dw0 >> 16) & 0xFFFFU);
        } else {
                sc->niosubqs = 0;
                sc->niocomqs = 0;
        }
        kprintf("Returns %u/%u queues, ", sc->niosubqs, sc->niocomqs);

        nvme_put_request(req);

        if (sc->niosubqs >= ncpus * 4 + 2 && sc->niocomqs >= ncpus + 2) {
                /*
                 * If we got all the queues we wanted, do a full-bore setup
                 * of qmap[cpu][type].
                 *
                 * Remember that subq 0 / comq 0 is the admin queue.
                 */
                kprintf("optimal map\n");
                sc->dumpqno = 1;
                sc->eventqno = 2;
                sc->subqueues[1].comqid = 1;
                sc->subqueues[2].comqid = 2;
                qno = 3;
                for (i = 0; i < ncpus; ++i) {
                        int cpuqno = sc->cputovect[i];

                        KKASSERT(cpuqno != 0);
                        sc->qmap[i][0] = qno + 0;
                        sc->qmap[i][1] = qno + 1;
                        sc->qmap[i][2] = qno + 2;
                        sc->qmap[i][3] = qno + 3;
                        sc->subqueues[qno + 0].comqid = cpuqno;
                        sc->subqueues[qno + 1].comqid = cpuqno;
                        sc->subqueues[qno + 2].comqid = cpuqno;
                        sc->subqueues[qno + 3].comqid = cpuqno;
                        qno += 4;
                }
                sc->niosubqs = ncpus * 4 + 2;
                sc->niocomqs = ncpus + 2;
        } else if (sc->niosubqs >= ncpus && sc->niocomqs >= ncpus) {
                /*
                 * We have enough to give each cpu its own submission
                 * and completion queue.
                 *
                 * Leave dumpqno and eventqno set to the admin queue.
                 */
                kprintf("nominal map 1:1 cpu\n");
                sc->dumpqno = 0;
                sc->eventqno = 0;
                for (i = 0; i < ncpus; ++i) {
                        qno = sc->cputovect[i];
                        KKASSERT(qno != 0);
                        sc->qmap[i][0] = qno;
                        sc->qmap[i][1] = qno;
                        sc->qmap[i][2] = qno;
                        sc->qmap[i][3] = qno;
                        sc->subqueues[qno].comqid = qno;
                }
                sc->niosubqs = ncpus;
                sc->niocomqs = ncpus;
        } else if (sc->niosubqs >= 4 && sc->niocomqs >= 2) {
                /*
                 * We have enough queues to separate and prioritize reads
                 * and writes, but all cpus have to share the same submission
                 * queues.  Completion queues are split up between cpus
                 * as much as possible.
                 *
                 * Leave dumpqno and eventqno set to the admin queue.
                 */
                kprintf("rw-sep map\n");
                qno = 1;
                for (i = 0; i < ncpus; ++i) {
                        int cpuqno = sc->cputovect[i];

                        KKASSERT(qno != 0);
                        sc->qmap[i][0] = qno + 0;       /* read lopri */
                        sc->qmap[i][1] = qno + 1;       /* read hipri */
                        sc->qmap[i][2] = qno + 2;       /* write lopri */
                        sc->qmap[i][3] = qno + 3;       /* write hipri */
                        if (i <= 0)
                                sc->subqueues[qno + 0].comqid = cpuqno;
                        if (i <= 1)
                                sc->subqueues[qno + 1].comqid = cpuqno;
                        if (i <= 2)
                                sc->subqueues[qno + 2].comqid = cpuqno;
                        if (i <= 3)
                                sc->subqueues[qno + 3].comqid = cpuqno;
                        /* do not increment qno */
                }
                sc->niosubqs = 6;
                sc->niocomqs = 3;
        } else if (sc->niosubqs >= 2) {
                /*
                 * We have enough to have separate read and write queues.
                 */
                kprintf("basic map\n");
                qno = 1;
                sc->dumpqno = 0;
                sc->eventqno = 0;
                for (i = 0; i < ncpus; ++i) {
                        int cpuqno = sc->cputovect[i];

                        KKASSERT(qno != 0);
                        sc->qmap[i][0] = qno + 0;       /* read lopri */
                        sc->qmap[i][1] = qno + 0;       /* read hipri */
                        sc->qmap[i][2] = qno + 1;       /* write lopri */
                        sc->qmap[i][3] = qno + 1;       /* write hipri */
                        if (i <= 0)
                                sc->subqueues[qno + 0].comqid = cpuqno;
                        if (i <= 1)
                                sc->subqueues[qno + 1].comqid = cpuqno;
                }
                sc->niosubqs = 2;
                sc->niocomqs = 1;
        } else {
                /*
                 * Minimal configuration, all cpus and I/O types use the
                 * same queue.  Sad day.
                 */
                kprintf("minimal map\n");
                sc->dumpqno = 0;
                sc->eventqno = 0;
                for (i = 0; i < ncpus; ++i) {
                        sc->qmap[i][0] = 1;
                        sc->qmap[i][1] = 1;
                        sc->qmap[i][2] = 1;
                        sc->qmap[i][3] = 1;
                }
                sc->subqueues[1].comqid = 1;
                sc->niosubqs = 1;
                sc->niocomqs = 1;
        }
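
        /*
         * Worked example for the optimal map above, assuming ncpus == 2:
         * the driver asks for 10 subqs / 4 comqs.  subq 1 (dumps) pairs
         * with comq 1 and subq 2 (async events) with comq 2, cpu0 gets
         * subqs 3-6 and cpu1 gets subqs 7-10, with each quad completing
         * on that cpu's interrupt vector comq (cputovect[]).  subq/comq 0
         * remain the admin queue throughout.
         */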

        /*
         * Create all I/O submission and completion queues.  The I/O
         * queues start at 1 and are inclusive of niosubqs and niocomqs.
         *
         * NOTE: Completion queues must be created before submission queues.
         *       That is, the completion queue specified when creating a
         *       submission queue must already exist.
         */
        error = 0;
        for (i = 1; i <= sc->niocomqs; ++i) {
                error += nvme_alloc_comqueue(sc, i);
                if (error) {
                        device_printf(sc->dev, "Unable to allocate comqs\n");
                        break;
                }
                error += nvme_create_comqueue(sc, i);
        }
        for (i = 1; i <= sc->niosubqs; ++i) {
                error += nvme_alloc_subqueue(sc, i);
                if (error) {
                        device_printf(sc->dev, "Unable to allocate subqs\n");
                        break;
                }
                error += nvme_create_subqueue(sc, i);
        }

        if (error) {
                device_printf(sc->dev, "Failed to initialize device!\n");
                sc->admin_func = nvme_admin_state_failed;
        } else {
                sc->admin_func = nvme_admin_state_identify_ns;
        }

        return 1;
}

/*
 * Identify available namespaces, iterate, and attach to disks.
 */
static
int
nvme_admin_state_identify_ns(nvme_softc_t *sc)
{
        nvme_request_t *req;
        nvme_nslist_data_t *rp;
        int status;
        uint32_t i;
        int j;          /* must be signed, the backward scan exits at -1 */

        /*
         * Identify Namespace List
         */
        req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
        req->cmd.identify.cns = NVME_CNS_ACT_NSLIST;
        req->cmd.identify.cntid = 0;
        bzero(req->info, sizeof(*req->info));
        nvme_submit_request(req);
        status = nvme_wait_request(req);
        /* XXX handle status */

        sc->nslist = req->info->nslist;
        nvme_put_request(req);

        /*
         * Identify each Namespace
         */
        rp = &sc->nslist;
        for (i = 0; i < sc->idctlr.ns_count; ++i) {
                nvme_softns_t *nsc;
                nvme_lba_fmt_data_t *lbafmt;

                if (rp->nids[i] == 0)
                        continue;

                req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
                req->cmd.identify.cns = NVME_CNS_ACT_NS;
                req->cmd.identify.cntid = 0;
                req->cmd.identify.head.nsid = rp->nids[i];
                bzero(req->info, sizeof(*req->info));
                nvme_submit_request(req);
                status = nvme_wait_request(req);
                if (status != 0) {
                        nvme_put_request(req);
                        continue;
                }

                /*
                 * Find the existing entry for this nsid, else pick a free
                 * slot, preferring index i and otherwise scanning from the
                 * top down.
                 */
                for (j = 0; j < NVME_MAX_NAMESPACES; ++j) {
                        if (sc->nscary[j] &&
                            sc->nscary[j]->nsid == rp->nids[i])
                                break;
                }
                if (j == NVME_MAX_NAMESPACES) {
                        j = i;
                        if (sc->nscary[j] != NULL) {
                                for (j = NVME_MAX_NAMESPACES - 1;
                                     j >= 0; --j) {
                                        if (sc->nscary[j] == NULL)
                                                break;
                                }
                        }
                }
                if (j < 0) {
                        device_printf(sc->dev, "not enough room in nscary "
                                      "for namespace %08x\n", rp->nids[i]);
                        nvme_put_request(req);
                        continue;
                }
                nsc = sc->nscary[j];
                if (nsc == NULL) {
                        nsc = kmalloc(sizeof(*nsc), M_NVME,
                                      M_WAITOK | M_ZERO);
                        nsc->unit = nvme_alloc_disk_unit();
                        sc->nscary[j] = nsc;
                }
                if (sc->nscmax <= j)
                        sc->nscmax = j + 1;
                nsc->sc = sc;
                nsc->nsid = rp->nids[i];
                nsc->state = NVME_NSC_STATE_UNATTACHED;
                nsc->idns = req->info->idns;
                bioq_init(&nsc->bioq);
                lockinit(&nsc->lk, "nvnsc", 0, 0);

                nvme_put_request(req);

                j = NVME_FLBAS_SEL_GET(nsc->idns.flbas);
                lbafmt = &nsc->idns.lba_fmt[j];
                nsc->blksize = 1 << lbafmt->sect_size;

                /*
                 * Attach the namespace
                 */
                nvme_disk_attach(nsc);
        }

        sc->admin_func = nvme_admin_state_operating;
        return 1;
}
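
/*
 * Note on the block size computation above: lba_fmt[].sect_size is a
 * log2 value, so a format with sect_size == 9 yields 512-byte sectors
 * (1 << 9) and one with sect_size == 12 yields 4096-byte sectors.
 */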
static
int
nvme_admin_state_operating(nvme_softc_t *sc)
{
        if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
                atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
                wakeup(&sc->admin_signal);
        }

        return 0;
}

static
int
nvme_admin_state_failed(nvme_softc_t *sc)
{
        if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
                atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
                wakeup(&sc->admin_signal);
        }

        return 0;
}