/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Administration thread
 *
 * - Handles resetting, features, iteration of namespaces, and disk
 *   attachments.  Most admin operations are serialized by the admin thread.
 *
 * - Ioctls, as well as any BIOs which require more sophisticated processing,
 *   are handed to this thread as well.
 *
 * - Can freeze/resume other queues for various purposes.
 */

#include "nvme.h"

static void nvme_admin_thread(void *arg);
static int nvme_admin_state_identify_ctlr(nvme_softc_t *sc);
static int nvme_admin_state_make_queues(nvme_softc_t *sc);
static int nvme_admin_state_identify_ns(nvme_softc_t *sc);
static int nvme_admin_state_operating(nvme_softc_t *sc);
static int nvme_admin_state_failed(nvme_softc_t *sc);

/*
 * Start the admin thread and block until it says it is running.
 */
int
nvme_start_admin_thread(nvme_softc_t *sc)
{
	int error, intr_flags;

	lockinit(&sc->admin_lk, "admlk", 0, 0);
	lockinit(&sc->ioctl_lk, "nvioc", 0, 0);
	sc->admin_signal = 0;

	intr_flags = INTR_MPSAFE;
	if (sc->nirqs == 1) {
		/* This interrupt processes data CQs too */
		intr_flags |= INTR_HIFREQ;
	}

	error = bus_setup_intr(sc->dev, sc->irq[0], intr_flags,
			       nvme_intr, &sc->comqueues[0],
			       &sc->irq_handle[0], NULL);
	if (error) {
		device_printf(sc->dev, "unable to install interrupt\n");
		return error;
	}
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	kthread_create(nvme_admin_thread, sc, &sc->admintd, "nvme_admin");
	while ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwbeg", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);

	return 0;
}

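/*
 * Summary of the admin_signal handshake used below (the ADMIN_SIG_*
 * flag definitions presumably live in nvme.h, the sole include above):
 *
 *	ADMIN_SIG_RUNNING  set by the admin thread when it starts,
 *			   cleared when it exits.
 *	ADMIN_SIG_PROBED   set once the initial namespace probe has
 *			   finished (even on failure).
 *	ADMIN_SIG_STOP	   requests the admin thread to shut down.
 *	ADMIN_SIG_REQUEUE  requests a requeue of deferred disk I/O.
 *
 * All waiters sleep on &sc->admin_signal under admin_lk and are woken
 * via wakeup() whenever a flag changes.
 */
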
/*
 * Stop the admin thread and block until it says it is done.
 */
void
nvme_stop_admin_thread(nvme_softc_t *sc)
{
	uint32_t i;

	atomic_set_int(&sc->admin_signal, ADMIN_SIG_STOP);

	/*
	 * We have to wait for the admin thread to finish its probe
	 * before shutting it down.  Break out if the admin thread
	 * never managed to even start.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	while ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		if ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
			break;
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	}
	lockmgr(&sc->admin_lk, LK_RELEASE);

	/*
	 * Disconnect our disks while the admin thread is still running,
	 * ensuring that the poll works even if interrupts are broken.
	 * Otherwise we could deadlock in the devfs core.
	 */
	for (i = 0; i < NVME_MAX_NAMESPACES; ++i) {
		nvme_softns_t *nsc;

		if ((nsc = sc->nscary[i]) != NULL) {
			nvme_disk_detach(nsc);
			kfree(nsc, M_NVME);
			sc->nscary[i] = NULL;
		}
	}

	/*
	 * Ask the admin thread to shut down.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	wakeup(&sc->admin_signal);
	while (sc->admin_signal & ADMIN_SIG_RUNNING)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);

	if (sc->irq_handle[0]) {
		bus_teardown_intr(sc->dev, sc->irq[0], sc->irq_handle[0]);
		sc->irq_handle[0] = NULL;
	}
	lockuninit(&sc->ioctl_lk);
	lockuninit(&sc->admin_lk);

	/*
	 * The thread might be running on another cpu; give it time to
	 * actually exit before returning, in case the caller is about to
	 * unload the module.  Otherwise we don't need this.
	 */
	nvme_os_sleep(1);
}

static void
nvme_admin_thread(void *arg)
{
	nvme_softc_t *sc = arg;
	uint32_t i;

	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	atomic_set_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);

	sc->admin_func = nvme_admin_state_identify_ctlr;

	while ((sc->admin_signal & ADMIN_SIG_STOP) == 0) {
		for (i = 0; i <= sc->niocomqs; ++i) {
			nvme_comqueue_t *comq = &sc->comqueues[i];

			if (comq->nqe == 0)	/* not configured */
				continue;

			lockmgr(&comq->lk, LK_EXCLUSIVE);
			nvme_poll_completions(comq, &comq->lk);
			lockmgr(&comq->lk, LK_RELEASE);
		}
		if (sc->admin_signal & ADMIN_SIG_REQUEUE) {
			atomic_clear_int(&sc->admin_signal, ADMIN_SIG_REQUEUE);
			nvme_disk_requeues(sc);
		}
		if (sc->admin_func(sc) == 0 &&
		    (sc->admin_signal & ADMIN_SIG_RUN_MASK) == 0) {
			lksleep(&sc->admin_signal, &sc->admin_lk, 0,
				"nvidle", hz);
		}
	}

	/*
	 * Cleanup state.
	 *
	 * Note that we actually issue delete queue commands here.  The NVMe
	 * spec says that for a normal shutdown the I/O queues should be
	 * deleted prior to issuing the shutdown in the CONFIG register.
	 */
	for (i = 1; i <= sc->niosubqs; ++i) {
		nvme_delete_subqueue(sc, i);
		nvme_free_subqueue(sc, i);
	}
	for (i = 1; i <= sc->niocomqs; ++i) {
		nvme_delete_comqueue(sc, i);
		nvme_free_comqueue(sc, i);
	}

	/*
	 * Signal that we are done.
	 */
	atomic_clear_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);
	lockmgr(&sc->admin_lk, LK_RELEASE);
}

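/*
 * Admin thread state machine, driven via sc->admin_func:
 *
 *	identify_ctlr -> make_queues -> identify_ns -> operating
 *	                      \-> failed (on queue setup error)
 *
 * A state returns non-zero to request another immediate pass, and
 * zero when the admin thread may sleep until more work arrives.
 */
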
/*
 * Identify the controller
 */
static int
nvme_admin_state_identify_ctlr(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ctlr_data_t *rp;
	int status;
	uint64_t mempgsize;
	char serial[20+16];
	char model[40+16];

	/*
	 * Identify Controller
	 */
	mempgsize = NVME_CAP_MEMPG_MIN_GET(sc->cap);

	req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
	req->cmd.identify.cns = NVME_CNS_CTLR;
	req->cmd.identify.cntid = 0;
	bzero(req->info, sizeof(*req->info));
	nvme_submit_request(req);
	status = nvme_wait_request(req);
	/* XXX handle status */

	sc->idctlr = req->info->idctlr;
	nvme_put_request(req);

	rp = &sc->idctlr;

	KKASSERT(sizeof(sc->idctlr.serialno) == 20);
	KKASSERT(sizeof(sc->idctlr.modelno) == 40);
	bzero(serial, sizeof(serial));
	bzero(model, sizeof(model));
	bcopy(rp->serialno, serial, sizeof(rp->serialno));
	bcopy(rp->modelno, model, sizeof(rp->modelno));
	string_cleanup(serial, 0);
	string_cleanup(model, 0);

	device_printf(sc->dev, "Model %s BaseSerial %s nscount=%d\n",
		      model, serial, rp->ns_count);

	sc->admin_func = nvme_admin_state_make_queues;

	return 1;
}

#define COMQFIXUP(msix, ncomqs)	((((msix) - 1) % ncomqs) + 1)

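/*
 * COMQFIXUP() folds a 1-based MSI-X vector number onto the 1-based
 * completion queue range, reserving queue 0 for the admin queue.
 * For example, with ncomqs = 4:
 *
 *	COMQFIXUP(1, 4) = 1
 *	COMQFIXUP(4, 4) = 4
 *	COMQFIXUP(5, 4) = 1	(wraps modulo ncomqs)
 */
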
/*
 * Request and create the I/O queues.  Figure out CPU mapping optimizations.
 */
static int
nvme_admin_state_make_queues(nvme_softc_t *sc)
{
	nvme_request_t *req;
	uint16_t niosubqs;
	uint16_t niocomqs;
	uint32_t i;
	uint16_t qno;
	int status;
	int error;

	/*
	 * Calculate how many I/O queues (non-inclusive of the admin queue)
	 * we want to have, up to 65535.  dw0 in the response returns the
	 * number of queues the controller gives us.  Submission and
	 * Completion queues are specified separately.
	 *
	 * This driver runs optimally with two submission queues per cpu
	 * (read and write) and one completion queue per cpu.
	 *
	 * +1 for dumps			XXX future
	 * +1 for async events		XXX future
	 *
	 * NOTE: Set one less than the #define because we use 1...N for I/O
	 *	 queues (queue 0 is used for the admin queue).  Easier this
	 *	 way.
	 */
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

	niosubqs = ncpus * 2 + 0;
	niocomqs = ncpus + 0;
	if (niosubqs >= NVME_MAX_QUEUES)
		niosubqs = NVME_MAX_QUEUES - 1;
	if (niocomqs >= NVME_MAX_QUEUES)
		niocomqs = NVME_MAX_QUEUES - 1;

	/*
	 * If there are insufficient MSI-X vectors or we use a normal
	 * interrupt, the completion queues are going to wind up being
	 * polled by a single admin interrupt.  Limit the number of
	 * completion queues in this case to something reasonable.
	 */
	if (sc->nirqs == 1 && niocomqs > 4) {
		niocomqs = 4;
		device_printf(sc->dev, "no MSI-X support, "
			      "limit comqs to %d\n", niocomqs);
	}
	device_printf(sc->dev, "Request %u/%u queues, ", niosubqs, niocomqs);

	req->cmd.setfeat.flags = NVME_FID_NUMQUEUES;
	req->cmd.setfeat.numqs.nsqr = niosubqs - 1;	/* 0's based 0=1 */
	req->cmd.setfeat.numqs.ncqr = niocomqs - 1;	/* 0's based 0=1 */

	nvme_submit_request(req);

	/*
	 * Get the response and set our operations mode.  Limit the returned
	 * queue counts to no more than we requested (some chipsets may
	 * return more than the requested number of queues while others
	 * will not).
	 */
	status = nvme_wait_request(req);
	/* XXX handle status */

	if (status == 0) {
		sc->niosubqs = 1 + (req->res.setfeat.dw0 & 0xFFFFU);
		sc->niocomqs = 1 + ((req->res.setfeat.dw0 >> 16) & 0xFFFFU);
		if (sc->niosubqs > niosubqs)
			sc->niosubqs = niosubqs;
		if (sc->niocomqs > niocomqs)
			sc->niocomqs = niocomqs;
	} else {
		sc->niosubqs = 0;
		sc->niocomqs = 0;
	}
	kprintf("Returns %u/%u queues, ", sc->niosubqs, sc->niocomqs);

	nvme_put_request(req);

	sc->dumpqno = 0;
	sc->eventqno = 0;

	if (sc->niosubqs >= ncpus * 2 + 0 && sc->niocomqs >= ncpus + 0) {
		/*
		 * If we got all the queues we wanted do a full-bore setup of
		 * qmap[cpu][type].
		 *
		 * Remember that subq 0 / comq 0 is the admin queue.
		 */
		kprintf("optimal map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], ncpus);

			KKASSERT(cpuqno != 0);
			sc->qmap[i][0] = qno + 0;
			sc->qmap[i][1] = qno + 1;
			sc->subqueues[qno + 0].comqid = cpuqno;
			sc->subqueues[qno + 1].comqid = cpuqno;
			qno += 2;
		}
		sc->niosubqs = ncpus * 2 + 0;
		sc->niocomqs = ncpus + 0;
	} else if (sc->niosubqs >= ncpus && sc->niocomqs >= ncpus) {
		/*
		 * We have enough to give each cpu its own submission
		 * and completion queue.
		 *
		 * Leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("nominal map 1:1 cpu\n");
		for (i = 0; i < ncpus; ++i) {
			qno = sc->cputovect[i];
			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno;
			sc->qmap[i][1] = qno;
			sc->subqueues[qno].comqid = COMQFIXUP(qno, ncpus);
		}
		sc->niosubqs = ncpus;
		sc->niocomqs = ncpus;
	} else if (sc->niosubqs >= 2 && sc->niocomqs >= 2) {
		/*
		 * Prioritize trying to distribute the available queues to
		 * cpus; do not separate read and write.
		 *
		 * Leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("rw-sep map (%d, %d)\n", sc->niosubqs, sc->niocomqs);
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], sc->niocomqs);
			int qno = COMQFIXUP((i + 1), sc->niosubqs);

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno;		/* read */
			sc->qmap[i][1] = qno;		/* write */
			sc->subqueues[qno].comqid = cpuqno;
			/* do not increment qno */
		}
#if 0
		sc->niosubqs = 2;
		sc->niocomqs = 2;
#endif
	} else if (sc->niosubqs >= 2) {
		/*
		 * We have enough to have separate read and write queues.
		 */
		kprintf("basic map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], 1);

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno + 0;	/* read */
			sc->qmap[i][1] = qno + 1;	/* write */
			if (i <= 0)
				sc->subqueues[qno + 0].comqid = cpuqno;
			if (i <= 1)
				sc->subqueues[qno + 1].comqid = cpuqno;
		}
		sc->niosubqs = 2;
		sc->niocomqs = 1;
	} else {
		/*
		 * Minimal configuration, all cpus and I/O types use the
		 * same queue.  Sad day.
		 */
		kprintf("minimal map\n");
		sc->dumpqno = 0;
		sc->eventqno = 0;
		for (i = 0; i < ncpus; ++i) {
			sc->qmap[i][0] = 1;
			sc->qmap[i][1] = 1;
		}
		sc->subqueues[1].comqid = 1;
		sc->niosubqs = 1;
		sc->niocomqs = 1;
	}

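	/*
	 * Worked example of the negotiation and mapping above (values
	 * assumed for illustration): with ncpus = 4 we request nsqr = 7
	 * and ncqr = 3 (0's based, i.e. 8 submission and 4 completion
	 * queues).  A controller granting everything returns
	 * dw0 = 0x00030007, which decodes to 1 + 0x0007 = 8 subqs and
	 * 1 + 0x0003 = 4 comqs, selecting the "optimal map" case.
	 */
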
	/*
	 * Create all I/O submission and completion queues.  The I/O
	 * queues start at 1 and are inclusive of niosubqs and niocomqs.
	 *
	 * NOTE: Completion queues must be created before submission queues.
	 *	 That is, the completion queue specified when creating a
	 *	 submission queue must already exist.
	 */
	error = 0;
	for (i = 1; i <= sc->niocomqs; ++i) {
		error += nvme_alloc_comqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate comqs\n");
			break;
		}
		error += nvme_create_comqueue(sc, i);
		if (error) {
			nvme_delete_comqueue(sc, i);
			nvme_free_comqueue(sc, i);
			break;
		}
	}
	for (i = 1; i <= sc->niosubqs; ++i) {
		error += nvme_alloc_subqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate subqs\n");
			break;
		}
		error += nvme_create_subqueue(sc, i);
		if (error) {
			nvme_delete_subqueue(sc, i);
			nvme_free_subqueue(sc, i);
			break;
		}
	}

	if (error) {
		device_printf(sc->dev, "Failed to initialize device!\n");
		sc->admin_func = nvme_admin_state_failed;
	} else {
		sc->admin_func = nvme_admin_state_identify_ns;
	}

	/*
	 * Disable interrupt coalescing.  It is basically worthless because
	 * setting the threshold has no effect when the time is set to 0,
	 * and the smallest time that can be set is 1 (== 100uS), which is
	 * too long.  Sequential performance is destroyed (on e.g. the
	 * Intel 750).  So kill it.
	 */
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);
	device_printf(sc->dev, "Interrupt coalescing disabled\n");

	req->cmd.setfeat.flags = NVME_FID_INTCOALESCE;
	req->cmd.setfeat.intcoal.thr = 0;
	req->cmd.setfeat.intcoal.time = 0;

	nvme_submit_request(req);
	status = nvme_wait_request(req);
	if (status) {
		device_printf(sc->dev,
			      "Interrupt coalesce failed status=%d\n",
			      status);
	}
	nvme_put_request(req);

	return 1;
}

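/*
 * Illustrative sketch (not compiled): how an I/O path might pick its
 * submission queue from the qmap[] built above.  The function below is
 * hypothetical; the real consumers live in the driver's disk layer.
 * qmap[cpu][0] and qmap[cpu][1] are simply the read and write
 * submission queue ids for that cpu.
 */
#if 0
static nvme_subqueue_t *
example_pick_subqueue(nvme_softc_t *sc, int cpuid, int iswrite)
{
	/* column 0 = read queue, column 1 = write queue */
	uint16_t qno = sc->qmap[cpuid][iswrite ? 1 : 0];

	return (&sc->subqueues[qno]);
}
#endif
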
/*
 * Identify available namespaces, iterate, and attach to disks.
 */
static int
nvme_admin_state_identify_ns(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ns_list_t *rp;
	int status;
	int i;
	int j;

	if (bootverbose) {
		if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE)
			device_printf(sc->dev,
				      "Namespace management supported\n");
		else
			device_printf(sc->dev,
				      "Namespace management not supported\n");
	}
#if 0
	/*
	 * Identify Controllers		TODO TODO TODO
	 */
	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ANY_CTLR_LIST;
		req->cmd.identify.cntid = 0;
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req);
		kprintf("nsquery status %08x\n", status);

#if 0
		for (i = 0; i < req->info->ctlrlist.idcount; ++i) {
			kprintf("CTLR %04x\n", req->info->ctlrlist.ctlrids[i]);
		}
#endif
		nvme_put_request(req);
	}
#endif

	rp = kmalloc(sizeof(*rp), M_NVME, M_WAITOK | M_ZERO);
	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
		/*
		 * Namespace management supported, query active namespaces.
		 */
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NSLIST;
		req->cmd.identify.cntid = 0;
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req);
		kprintf("nsquery status %08x\n", status);
		/* XXX handle status */

		cpu_lfence();
		*rp = req->info->nslist;
		nvme_put_request(req);
	} else {
		/*
		 * Namespace management not supported, assume nsids 1..N
		 * (note: the index is limited to 1024).
		 */
		for (i = 1; i <= (int)sc->idctlr.ns_count && i <= 1024; ++i)
			rp->nsids[i-1] = i;
	}

	/*
	 * Identify each Namespace
	 */
	for (i = 0; i < 1024; ++i) {
		nvme_softns_t *nsc;
		nvme_lba_fmt_data_t *lbafmt;

		if (rp->nsids[i] == 0)
			continue;

		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NS;
		req->cmd.identify.cntid = 0;
		req->cmd.identify.head.nsid = rp->nsids[i];
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req);
		if (status != 0) {
			kprintf("NS FAILED %08x\n", status);
			continue;
		}

		for (j = 0; j < NVME_MAX_NAMESPACES; ++j) {
			if (sc->nscary[j] &&
			    sc->nscary[j]->nsid == rp->nsids[i])
				break;
		}
		if (j == NVME_MAX_NAMESPACES) {
			j = i;
			if (sc->nscary[j] != NULL) {
				for (j = NVME_MAX_NAMESPACES - 1;
				     j >= 0; --j) {
					if (sc->nscary[j] == NULL)
						break;
				}
			}
		}
		if (j < 0) {
			device_printf(sc->dev, "not enough room in nscary for "
				      "namespace %08x\n", rp->nsids[i]);
			nvme_put_request(req);
			continue;
		}
		nsc = sc->nscary[j];
		if (nsc == NULL) {
			nsc = kmalloc(sizeof(*nsc), M_NVME, M_WAITOK | M_ZERO);
			nsc->unit = nvme_alloc_disk_unit();
			sc->nscary[j] = nsc;
		}
		if (sc->nscmax <= j)
			sc->nscmax = j + 1;
		nsc->sc = sc;
		nsc->nsid = rp->nsids[i];
		nsc->state = NVME_NSC_STATE_UNATTACHED;
		nsc->idns = req->info->idns;
		bioq_init(&nsc->bioq);
		lockinit(&nsc->lk, "nvnsc", 0, 0);

		nvme_put_request(req);

		j = NVME_FLBAS_SEL_GET(nsc->idns.flbas);
		lbafmt = &nsc->idns.lba_fmt[j];
		nsc->blksize = 1 << lbafmt->sect_size;
		/* sect_size is a log2 value; e.g. 9 = 512 byte sectors */

		/*
		 * Attach the namespace
		 */
		nvme_disk_attach(nsc);
	}
	kfree(rp, M_NVME);

	sc->admin_func = nvme_admin_state_operating;

	return 1;
}

static int
nvme_admin_state_operating(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}
	return 0;
}

static int
nvme_admin_state_failed(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}
	return 0;
}
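
/*
 * Both terminal states above latch ADMIN_SIG_PROBED so that anyone
 * waiting on the initial probe (e.g. nvme_stop_admin_thread()) can
 * proceed.  Returning 0 lets the admin thread idle until
 * ADMIN_SIG_STOP or other work (such as ADMIN_SIG_REQUEUE) arrives.
 */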