1 /* 2 * Copyright (c) 2016-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * Most low-level chip related functions (other than attachment) reside in 36 * this module. Most functions assume that the caller is already holding 37 * appropriate locks to prevent SMP collisions. 38 */ 39 40 #include "nvme.h" 41 42 MALLOC_DEFINE(M_NVME, "NVMe Driver", "NVME"); 43 44 /* 45 * DMA mapping callbacks. 46 */ 47 static 48 void 49 nvme_dmamem_saveseg(void *info, bus_dma_segment_t *segs, int nsegs, int error) 50 { 51 KKASSERT(error == 0); 52 KKASSERT(nsegs == 1); 53 *(bus_addr_t *)info = segs->ds_addr; 54 } 55 56 /* 57 * Low-level chip enable/disable. 58 */ 59 int 60 nvme_enable(nvme_softc_t *sc, int enable) 61 { 62 uint32_t reg; 63 int error = 0; 64 int base_ticks; 65 66 reg = nvme_read(sc, NVME_REG_CONFIG); 67 if (enable == 0 && (reg & NVME_CONFIG_EN)) { 68 /* 69 * Disable the chip so we can program it. 70 */ 71 reg &= ~NVME_CONFIG_EN; 72 nvme_write(sc, NVME_REG_CONFIG, reg); 73 } else if (enable && (reg & NVME_CONFIG_EN) == 0) { 74 /* 75 * Enable the chip once programmed. 76 */ 77 reg |= NVME_CONFIG_EN; 78 nvme_write(sc, NVME_REG_CONFIG, reg); 79 } 80 error = ENXIO; 81 base_ticks = ticks; 82 while ((int)(ticks - base_ticks) < sc->entimo) { 83 reg = nvme_read(sc, NVME_REG_STATUS); 84 if (enable == 0 && (reg & NVME_STATUS_RDY) == 0) { 85 error = 0; 86 break; 87 } 88 if (enable && (reg & NVME_STATUS_RDY)) { 89 error = 0; 90 break; 91 } 92 nvme_os_sleep(50); /* 50ms poll */ 93 } 94 95 /* 96 * Interrupt masking (only applicable when MSI-X not used, 3.1.3 and 97 * 3.1.4 state that these registers should not be accessed with MSI-X) 98 */ 99 if (error == 0 && sc->nirqs == 1) { 100 if (enable) { 101 nvme_write(sc, NVME_REG_INTSET, ~1); 102 nvme_write(sc, NVME_REG_INTCLR, 1); 103 } else { 104 nvme_write(sc, NVME_REG_INTSET, ~1); 105 } 106 } 107 108 if (error) { 109 device_printf(sc->dev, "Cannot %s device\n", 110 (enable ? "enable" : "disable")); 111 } else { 112 #if 0 113 kprintf("gratuitous 15 second sleep\n"); 114 nvme_os_sleep(15000); 115 kprintf("gratuitous 15 second sleep done\n"); 116 #endif 117 } 118 return error; 119 } 120 121 /* 122 * Allocate submission and completion queues. If qid is 0 we are allocating 123 * the ADMIN queues, otherwise we are allocating I/O queues. 124 */ 125 int 126 nvme_alloc_subqueue(nvme_softc_t *sc, uint16_t qid) 127 { 128 nvme_subqueue_t *queue = &sc->subqueues[qid]; 129 int error = 0; 130 131 /* 132 * For now implement the maximum queue size negotiated in the 133 * attach. 134 */ 135 lockinit(&queue->lk, "nvqlk", 0, 0); 136 queue->sc = sc; 137 queue->nqe = sc->maxqe; 138 queue->qid = qid; 139 queue->subq_doorbell_reg = NVME_REG_SUBQ_BELL(qid, sc->dstrd4); 140 141 /* 142 * dma memory for the submission queue 143 */ 144 if (error == 0) { 145 error = bus_dmamem_alloc(sc->sque_tag, (void **)&queue->ksubq, 146 BUS_DMA_ZERO, &queue->sque_map); 147 } 148 if (error == 0) { 149 error = bus_dmamap_load(sc->sque_tag, queue->sque_map, 150 queue->ksubq, 151 bus_dma_tag_getmaxsize(sc->sque_tag), 152 nvme_dmamem_saveseg, &queue->psubq, 153 0); 154 } 155 156 /* 157 * dma memory for enough PRPs to map MAXPHYS bytes of memory per 158 * request. A MAXPHYS buffer which begins partially straddling 159 * a page boundary can still be accomodated because we have an 160 * additional PRP entry in cmd.head. 161 */ 162 if (error == 0) { 163 error = bus_dmamem_alloc(sc->prps_tag, (void **)&queue->kprps, 164 BUS_DMA_ZERO, &queue->prps_map); 165 } 166 if (error == 0) { 167 error = bus_dmamap_load(sc->prps_tag, queue->prps_map, 168 queue->kprps, 169 bus_dma_tag_getmaxsize(sc->prps_tag), 170 nvme_dmamem_saveseg, &queue->pprps, 171 0); 172 } 173 174 /* 175 * dma memory for admin data 176 */ 177 if (qid == 0 && error == 0) { 178 error = bus_dmamem_alloc(sc->adm_tag, 179 (void **)&queue->kdatapgs, 180 BUS_DMA_ZERO, &queue->adm_map); 181 } 182 if (qid == 0 && error == 0) { 183 error = bus_dmamap_load(sc->adm_tag, queue->adm_map, 184 queue->kdatapgs, 185 bus_dma_tag_getmaxsize(sc->adm_tag), 186 nvme_dmamem_saveseg, &queue->pdatapgs, 187 0); 188 } 189 190 /* 191 * Driver request structures 192 */ 193 if (error == 0) { 194 nvme_request_t *req; 195 uint32_t i; 196 197 queue->reqary = kmalloc(sizeof(nvme_request_t) * queue->nqe, 198 M_NVME, M_WAITOK | M_ZERO); 199 for (i = 0; i < queue->nqe; ++i) { 200 req = &queue->reqary[i]; 201 if (i == 0) { 202 /* 203 * Set aside one request for dump operation 204 */ 205 queue->dump_req = req; 206 } else { 207 /* 208 * The rest go through the normal list 209 */ 210 req->next_avail = queue->first_avail; 211 queue->first_avail = req; 212 } 213 req->subq = queue; 214 req->comq = &sc->comqueues[queue->comqid]; 215 req->cmd_id = i; 216 if (qid == 0) { 217 req->info = &queue->kdatapgs[i]; 218 req->pinfo = queue->pdatapgs + 219 i * sizeof(nvme_admin_data_t); 220 } 221 } 222 } 223 224 /* 225 * Error handling 226 */ 227 if (error) 228 nvme_free_subqueue(sc, qid); 229 return error; 230 } 231 232 int 233 nvme_alloc_comqueue(nvme_softc_t *sc, uint16_t qid) 234 { 235 nvme_comqueue_t *queue = &sc->comqueues[qid]; 236 int error = 0; 237 238 /* 239 * For now implement the maximum queue size negotiated in the 240 * attach. 241 */ 242 lockinit(&queue->lk, "nvqlk", 0, 0); 243 queue->sc = sc; 244 queue->qid = qid; 245 queue->phase = NVME_COMQ_STATUS_PHASE; 246 queue->comq_doorbell_reg = NVME_REG_COMQ_BELL(qid, sc->dstrd4); 247 248 if (error == 0) { 249 error = bus_dmamem_alloc(sc->cque_tag, (void **)&queue->kcomq, 250 BUS_DMA_ZERO, &queue->cque_map); 251 } 252 if (error == 0) { 253 error = bus_dmamap_load(sc->cque_tag, queue->cque_map, 254 queue->kcomq, 255 bus_dma_tag_getmaxsize(sc->cque_tag), 256 nvme_dmamem_saveseg, &queue->pcomq, 257 0); 258 } 259 260 /* 261 * Set nqe last. The comq polling loop tests this field and we 262 * do not want it to spuriously assume that the comq is initialized 263 * until it actually is. 264 */ 265 if (error == 0) 266 queue->nqe = sc->maxqe; 267 268 if (error) 269 nvme_free_comqueue(sc, qid); 270 return error; 271 } 272 273 void 274 nvme_free_subqueue(nvme_softc_t *sc, uint16_t qid) 275 { 276 nvme_subqueue_t *queue = &sc->subqueues[qid]; 277 278 queue->first_avail = NULL; 279 if (queue->reqary) { 280 kfree(queue->reqary, M_NVME); 281 queue->reqary = NULL; 282 } 283 if (queue->ksubq) { 284 bus_dmamem_free(sc->sque_tag, queue->ksubq, queue->sque_map); 285 bus_dmamap_unload(sc->sque_tag, queue->sque_map); 286 bus_dmamap_destroy(sc->sque_tag, queue->sque_map); 287 } 288 if (queue->kprps) { 289 bus_dmamem_free(sc->prps_tag, queue->kprps, queue->prps_map); 290 bus_dmamap_unload(sc->prps_tag, queue->prps_map); 291 bus_dmamap_destroy(sc->prps_tag, queue->prps_map); 292 } 293 if (queue->kdatapgs) { 294 bus_dmamem_free(sc->adm_tag, queue->kdatapgs, queue->adm_map); 295 bus_dmamap_unload(sc->adm_tag, queue->adm_map); 296 bus_dmamap_destroy(sc->adm_tag, queue->adm_map); 297 } 298 bzero(queue, sizeof(*queue)); 299 } 300 301 void 302 nvme_free_comqueue(nvme_softc_t *sc, uint16_t qid) 303 { 304 nvme_comqueue_t *queue = &sc->comqueues[qid]; 305 306 /* 307 * Clear this field first so poll loops ignore the comq. 308 */ 309 queue->nqe = 0; 310 311 if (queue->kcomq) { 312 bus_dmamem_free(sc->cque_tag, queue->kcomq, queue->cque_map); 313 bus_dmamap_unload(sc->cque_tag, queue->cque_map); 314 bus_dmamap_destroy(sc->cque_tag, queue->cque_map); 315 } 316 bzero(queue, sizeof(*queue)); 317 } 318 319 /* 320 * ADMIN AND I/O REQUEST HANDLING 321 */ 322 323 /* 324 * Obtain a request and handle DMA mapping the supplied kernel buffer. 325 * Fields in cmd.head will be initialized and remaining fields will be zero'd. 326 * Caller is responsible for filling in remaining fields as appropriate. 327 * 328 * Caller must hold the queue lock. 329 */ 330 nvme_request_t * 331 nvme_get_admin_request(nvme_softc_t *sc, uint8_t opcode) 332 { 333 nvme_request_t *req; 334 335 req = nvme_get_request(&sc->subqueues[0], opcode, NULL, 0); 336 req->cmd.head.prp1 = req->pinfo; 337 req->callback = NULL; 338 339 return req; 340 } 341 342 /* 343 * ADMIN AND I/O REQUEST HANDLING 344 */ 345 346 static __inline 347 void 348 _nvme_fill_request(nvme_subqueue_t *queue, uint8_t opcode, 349 char *kva, size_t bytes, 350 nvme_request_t *req) 351 { 352 /* 353 * Fill-in basic fields and do the DMA mapping. 354 */ 355 req->next_avail = NULL; 356 KKASSERT(req->state == NVME_REQ_AVAIL); 357 req->state = NVME_REQ_ALLOCATED; 358 req->callback = NULL; 359 req->waiting = 0; 360 361 req->cmd.head.opcode = opcode; 362 req->cmd.head.flags = NVME_SUBQFLG_PRP | NVME_SUBQFLG_NORM; 363 req->cmd.head.cid = req->cmd_id; 364 req->cmd.head.nsid = 0; 365 req->cmd.head.mptr = 0; 366 req->cmd.head.prp1 = 0; 367 req->cmd.head.prp2 = 0; 368 req->cmd.dw10 = 0; 369 req->cmd.dw11 = 0; 370 req->cmd.dw12 = 0; 371 req->cmd.dw13 = 0; 372 req->cmd.dw14 = 0; 373 req->cmd.dw15 = 0; 374 375 if (kva) { 376 size_t count = 0; 377 size_t idx = 0; 378 vm_paddr_t paddr; 379 vm_paddr_t pprptab; 380 uint64_t *kprptab; 381 KKASSERT(bytes >= 0 && bytes <= MAXPHYS); 382 383 kprptab = queue->kprps + 384 (MAXPHYS / PAGE_SIZE) * req->cmd_id; 385 pprptab = queue->pprps + 386 (MAXPHYS / PAGE_SIZE) * req->cmd_id * 387 sizeof(uint64_t); 388 389 while (count < bytes) { 390 paddr = vtophys(kva + count); 391 if (idx == 0) { 392 KKASSERT((paddr & 3) == 0); 393 req->cmd.head.prp1 = paddr; 394 count += (((intptr_t)kva + PAGE_SIZE) & 395 ~(intptr_t)PAGE_MASK) - 396 (intptr_t)kva; 397 } else if (idx == 1 && count + PAGE_SIZE >= bytes) { 398 KKASSERT((paddr & PAGE_MASK) == 0); 399 req->cmd.head.prp2 = paddr; 400 count += PAGE_SIZE; 401 } else { 402 KKASSERT((paddr & PAGE_MASK) == 0); 403 /* if (idx == 1) -- not needed, just repeat */ 404 req->cmd.head.prp2 = pprptab; /* repeat */ 405 kprptab[idx - 1] = paddr; 406 count += PAGE_SIZE; 407 } 408 ++idx; 409 } 410 } 411 } 412 413 414 /* 415 * Obtain a request and handle DMA mapping the supplied kernel buffer. 416 * Fields in cmd.head will be initialized and remaining fields will be zero'd. 417 * Caller is responsible for filling in remaining fields as appropriate. 418 * 419 * May return NULL if no requests are available or if there is no room in 420 * the submission queue to handle it (should only be possible on an I/O queue, 421 * admin queue operations are managed). 422 * 423 * Caller should NOT hold the queue lock. 424 */ 425 nvme_request_t * 426 nvme_get_request(nvme_subqueue_t *queue, uint8_t opcode, 427 char *kva, size_t bytes) 428 { 429 nvme_request_t *req; 430 nvme_request_t *next; 431 432 /* 433 * No easy lockless way to pull a new request off. We have to check 434 * for a number of conditions and there may be multiple threads 435 * making this call simultaneously, which complicates matters even 436 * more. 437 */ 438 lockmgr(&queue->lk, LK_EXCLUSIVE); 439 440 /* 441 * Make sure the submission queue has room to accomodate the 442 * request. Requests can be completed out of order so the 443 * submission ring could still be full even though we have 444 * requests available. 445 */ 446 if ((queue->subq_tail + queue->unsubmitted + 1) % queue->nqe == 447 queue->subq_head) { 448 lockmgr(&queue->lk, LK_RELEASE); 449 KKASSERT(queue->qid != 0); 450 atomic_swap_int(&queue->signal_requeue, 1); 451 452 return NULL; 453 } 454 455 /* 456 * Pop the next available request off of the first_avail linked 457 * list. An atomic op must be used here because nvme_put_request() 458 * returns requests to the list without holding queue->lk. 459 */ 460 for (;;) { 461 req = queue->first_avail; 462 cpu_ccfence(); 463 if (req == NULL) { 464 lockmgr(&queue->lk, LK_RELEASE); 465 KKASSERT(queue->qid != 0); 466 atomic_swap_int(&queue->signal_requeue, 1); 467 468 return NULL; 469 } 470 next = req->next_avail; 471 if (atomic_cmpset_ptr(&queue->first_avail, req, next)) 472 break; 473 } 474 475 /* 476 * We have to keep track of unsubmitted requests in order to be 477 * able to properly check whether the ring is full or not (check 478 * is done at the top of this procedure, above). 479 */ 480 ++queue->unsubmitted; 481 lockmgr(&queue->lk, LK_RELEASE); 482 483 _nvme_fill_request(queue, opcode, kva, bytes, req); 484 485 return req; 486 } 487 488 /* 489 * dump path only, cannot block. Allow the lock to fail and bump 490 * queue->unsubmitted anyway. 491 */ 492 nvme_request_t * 493 nvme_get_dump_request(nvme_subqueue_t *queue, uint8_t opcode, 494 char *kva, size_t bytes) 495 { 496 nvme_request_t *req; 497 int error; 498 499 error = lockmgr(&queue->lk, LK_EXCLUSIVE | LK_NOWAIT); 500 req = queue->dump_req; 501 ++queue->unsubmitted; 502 if (error == 0) 503 lockmgr(&queue->lk, LK_RELEASE); 504 _nvme_fill_request(queue, opcode, kva, bytes, req); 505 506 return req; 507 } 508 509 /* 510 * Submit request for execution. This will doorbell the subq. 511 * 512 * Caller must hold the queue lock. 513 */ 514 void 515 nvme_submit_request(nvme_request_t *req) 516 { 517 nvme_subqueue_t *queue = req->subq; 518 nvme_allcmd_t *cmd; 519 520 cmd = &queue->ksubq[queue->subq_tail]; 521 --queue->unsubmitted; 522 if (++queue->subq_tail == queue->nqe) 523 queue->subq_tail = 0; 524 KKASSERT(queue->subq_tail != queue->subq_head); 525 *cmd = req->cmd; 526 cpu_sfence(); /* needed? */ 527 req->state = NVME_REQ_SUBMITTED; 528 nvme_write(queue->sc, queue->subq_doorbell_reg, queue->subq_tail); 529 } 530 531 /* 532 * Wait for a request to complete. 533 * 534 * Caller does not need to hold the queue lock. If it does, or if it 535 * holds some other lock, it should pass it in so it can be released across 536 * sleeps, else pass NULL. 537 */ 538 int 539 nvme_wait_request(nvme_request_t *req) 540 { 541 struct lock *lk; 542 int code; 543 544 req->waiting = 1; 545 if (req->state != NVME_REQ_COMPLETED) { 546 lk = &req->comq->lk; 547 cpu_lfence(); 548 lockmgr(lk, LK_EXCLUSIVE); 549 while (req->state == NVME_REQ_SUBMITTED) { 550 nvme_poll_completions(req->comq, lk); 551 if (req->state != NVME_REQ_SUBMITTED) 552 break; 553 lksleep(req, lk, 0, "nvwait", hz); 554 } 555 lockmgr(lk, LK_RELEASE); 556 KKASSERT(req->state == NVME_REQ_COMPLETED); 557 } 558 cpu_lfence(); 559 code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status); 560 561 return code; 562 } 563 564 /* 565 * dump path only, we cannot block, and the lock is allowed 566 * to fail. But still try to play nice with interrupt threads. 567 */ 568 int 569 nvme_poll_request(nvme_request_t *req) 570 { 571 struct lock *lk; 572 int code; 573 int didlock = 500; /* 500uS max */ 574 575 req->waiting = 1; 576 if (req->state != NVME_REQ_COMPLETED) { 577 lk = &req->comq->lk; 578 cpu_lfence(); 579 while (lockmgr(lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 580 if (--didlock == 0) 581 break; 582 tsc_delay(1000); /* 1uS */ 583 } 584 while (req->state == NVME_REQ_SUBMITTED) { 585 nvme_poll_completions(req->comq, lk); 586 if (req->state != NVME_REQ_SUBMITTED) 587 break; 588 lwkt_switch(); 589 } 590 if (didlock) 591 lockmgr(lk, LK_RELEASE); 592 KKASSERT(req->state == NVME_REQ_COMPLETED); 593 } 594 cpu_lfence(); 595 code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status); 596 597 return code; 598 } 599 600 /* 601 * Put request away, making it available for reuse. If this is an admin 602 * request its auxillary data page is also being released for reuse. 603 * 604 * Caller does NOT have to hold the queue lock. 605 */ 606 void 607 nvme_put_request(nvme_request_t *req) 608 { 609 nvme_subqueue_t *queue = req->subq; 610 nvme_request_t *next; 611 612 /* 613 * Insert on head for best cache reuse. 614 */ 615 KKASSERT(req->state == NVME_REQ_COMPLETED); 616 req->state = NVME_REQ_AVAIL; 617 for (;;) { 618 next = queue->first_avail; 619 cpu_ccfence(); 620 req->next_avail = next; 621 if (atomic_cmpset_ptr(&queue->first_avail, next, req)) 622 break; 623 } 624 625 /* 626 * If BIOs were deferred due to lack of request space signal the 627 * admin thread to requeue them. This is a bit messy and normally 628 * should not happen due to the large number of queue entries nvme 629 * usually has. Let it race for now (admin has a 1hz tick). 630 */ 631 if (atomic_swap_int(&queue->signal_requeue, 0)) { 632 atomic_set_int(&queue->sc->admin_signal, ADMIN_SIG_REQUEUE); 633 wakeup(&queue->sc->admin_signal); 634 } 635 } 636 637 /* 638 * dump path only. 639 */ 640 void 641 nvme_put_dump_request(nvme_request_t *req) 642 { 643 KKASSERT(req->state == NVME_REQ_COMPLETED); 644 req->state = NVME_REQ_AVAIL; 645 } 646 647 /* 648 * Poll for completions on queue, copy the 16-byte hw result entry 649 * into the request and poke the doorbell to update the controller's 650 * understanding of comq_head. 651 * 652 * If lk is non-NULL it will be passed to the callback which typically 653 * releases it temporarily when calling biodone() or doing other complex 654 * work on the result. 655 * 656 * Caller must usually hold comq->lk. 657 */ 658 void 659 nvme_poll_completions(nvme_comqueue_t *comq, struct lock *lk) 660 { 661 nvme_softc_t *sc = comq->sc; 662 nvme_request_t *req; 663 nvme_subqueue_t *subq; 664 nvme_allres_t *res; 665 #if 0 666 int didwork = 0; 667 #endif 668 669 KKASSERT(comq->comq_tail < comq->nqe); 670 cpu_lfence(); /* needed prior to first phase test */ 671 for (;;) { 672 /* 673 * WARNING! LOCK MAY HAVE BEEN TEMPORARILY LOST DURING LOOP. 674 */ 675 res = &comq->kcomq[comq->comq_tail]; 676 if ((res->tail.status ^ comq->phase) & NVME_COMQ_STATUS_PHASE) 677 break; 678 679 /* 680 * Process result on completion queue. 681 * 682 * Bump comq_tail, flip the phase detect when we roll-over. 683 * doorbell every 1/4 queue and at the end of the loop. 684 */ 685 if (++comq->comq_tail == comq->nqe) { 686 comq->comq_tail = 0; 687 comq->phase ^= NVME_COMQ_STATUS_PHASE; 688 } 689 690 /* 691 * WARNING! I imploded the chip by reusing a command id 692 * before it was discarded in the completion queue 693 * via the doorbell, so for now we always write 694 * the doorbell before marking the request as 695 * COMPLETED (it can be reused instantly upon 696 * being marked). 697 */ 698 #if 0 699 if (++didwork == (comq->nqe >> 2)) { 700 didwork = 0; 701 nvme_write(comq->sc, comq->comq_doorbell_reg, 702 comq->comq_tail); 703 } 704 #endif 705 cpu_lfence(); /* needed prior to content check */ 706 707 /* 708 * Locate the request and related submission queue. The 709 * request could be on a different queue. A submission 710 * queue can have only one completion queue, so we can 711 * update subq_head without locking the submission queue. 712 */ 713 subq = &sc->subqueues[res->tail.subq_id]; 714 subq->subq_head = res->tail.subq_head_ptr; 715 req = &subq->reqary[res->tail.cmd_id]; 716 717 /* 718 * Copy the fields and wakeup anyone waiting on req. 719 * The response field in the completion queue can be reused 720 * once we doorbell which is why we make a copy. 721 */ 722 KKASSERT(req->state == NVME_REQ_SUBMITTED && 723 req->comq == comq); 724 req->res = *res; 725 nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail); 726 cpu_sfence(); 727 req->state = NVME_REQ_COMPLETED; 728 if (req->callback) { 729 req->callback(req, lk); 730 } else if (req->waiting) { 731 wakeup(req); 732 } 733 } 734 #if 0 735 if (didwork) 736 nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail); 737 #endif 738 } 739 740 /* 741 * Core interrupt handler (called from dedicated interrupt thread, possibly 742 * preempts other threads). 743 * 744 * NOTE: For pin-based level interrupts, the chipset interrupt is cleared 745 * automatically once all the head doorbells are updated. However, 746 * most chipsets assume MSI-X will be used and MAY NOT IMPLEMENT 747 * pin-based interrupts properly. I found the BPX card, for example, 748 * is unable to clear a pin-based interrupt. 749 */ 750 void 751 nvme_intr(void *arg) 752 { 753 nvme_comqueue_t *comq = arg; 754 nvme_softc_t *sc; 755 int i; 756 int skip; 757 758 /* 759 * Process all completion queues associated with this vector. The 760 * interrupt is masked in the APIC. Do NOT mess with the NVMe 761 * masking registers because (1) We don't need to and it wastes time, 762 * and (2) We aren't supposed to touch them if using MSI-X anyway. 763 */ 764 sc = comq->sc; 765 if (sc->nirqs == 1) 766 skip = 1; 767 else 768 skip = sc->nirqs - 1; 769 770 for (i = comq->qid; i <= sc->niocomqs; i += skip) { 771 if (comq->nqe) { 772 lockmgr(&comq->lk, LK_EXCLUSIVE); 773 nvme_poll_completions(comq, &comq->lk); 774 lockmgr(&comq->lk, LK_RELEASE); 775 } 776 comq += skip; 777 } 778 } 779 780 /* 781 * ADMIN HELPER COMMAND ROLLUP FUNCTIONS 782 */ 783 /* 784 * Issue command to create a submission queue. 785 */ 786 int 787 nvme_create_subqueue(nvme_softc_t *sc, uint16_t qid) 788 { 789 nvme_request_t *req; 790 nvme_subqueue_t *subq = &sc->subqueues[qid]; 791 int status; 792 793 req = nvme_get_admin_request(sc, NVME_OP_CREATE_SUBQ); 794 req->cmd.head.prp1 = subq->psubq; 795 req->cmd.crsub.subq_id = qid; 796 req->cmd.crsub.subq_size = subq->nqe - 1; /* 0's based value */ 797 req->cmd.crsub.flags = NVME_CREATESUB_PC | NVME_CREATESUB_PRI_URG; 798 req->cmd.crsub.comq_id = subq->comqid; 799 800 nvme_submit_request(req); 801 status = nvme_wait_request(req); 802 nvme_put_request(req); 803 804 return status; 805 } 806 807 /* 808 * Issue command to create a completion queue. 809 */ 810 int 811 nvme_create_comqueue(nvme_softc_t *sc, uint16_t qid) 812 { 813 nvme_request_t *req; 814 nvme_comqueue_t *comq = &sc->comqueues[qid]; 815 int status; 816 int error; 817 uint16_t ivect; 818 819 error = 0; 820 if (sc->nirqs > 1) { 821 ivect = 1 + (qid - 1) % (sc->nirqs - 1); 822 if (qid && ivect == qid) { 823 error = bus_setup_intr(sc->dev, sc->irq[ivect], 824 INTR_MPSAFE | INTR_HIFREQ, 825 nvme_intr, 826 &sc->comqueues[ivect], 827 &sc->irq_handle[ivect], 828 NULL); 829 } 830 } else { 831 ivect = 0; 832 } 833 if (error) 834 return error; 835 836 req = nvme_get_admin_request(sc, NVME_OP_CREATE_COMQ); 837 req->cmd.head.prp1 = comq->pcomq; 838 req->cmd.crcom.comq_id = qid; 839 req->cmd.crcom.comq_size = comq->nqe - 1; /* 0's based value */ 840 req->cmd.crcom.ivect = ivect; 841 req->cmd.crcom.flags = NVME_CREATECOM_PC | NVME_CREATECOM_IEN; 842 843 nvme_submit_request(req); 844 status = nvme_wait_request(req); 845 nvme_put_request(req); 846 847 /* 848 * Ooops, create failed, undo the irq setup 849 */ 850 if (sc->nirqs > 1 && status) { 851 ivect = 1 + (qid - 1) % (sc->nirqs - 1); 852 if (qid && ivect == qid) { 853 bus_teardown_intr(sc->dev, 854 sc->irq[ivect], 855 sc->irq_handle[ivect]); 856 sc->irq_handle[ivect] = NULL; 857 } 858 } 859 860 return status; 861 } 862 863 /* 864 * Issue command to delete a submission queue. 865 */ 866 int 867 nvme_delete_subqueue(nvme_softc_t *sc, uint16_t qid) 868 { 869 nvme_request_t *req; 870 /*nvme_subqueue_t *subq = &sc->subqueues[qid];*/ 871 int status; 872 873 req = nvme_get_admin_request(sc, NVME_OP_DELETE_SUBQ); 874 req->cmd.head.prp1 = 0; 875 req->cmd.delete.qid = qid; 876 877 nvme_submit_request(req); 878 status = nvme_wait_request(req); 879 nvme_put_request(req); 880 881 return status; 882 } 883 884 /* 885 * Issue command to delete a completion queue. 886 */ 887 int 888 nvme_delete_comqueue(nvme_softc_t *sc, uint16_t qid) 889 { 890 nvme_request_t *req; 891 nvme_comqueue_t *comq = &sc->comqueues[qid]; 892 int status; 893 uint16_t ivect; 894 895 if (comq->sc == NULL) 896 return 0; 897 898 req = nvme_get_admin_request(sc, NVME_OP_DELETE_COMQ); 899 req->cmd.head.prp1 = 0; 900 req->cmd.delete.qid = qid; 901 902 nvme_submit_request(req); 903 status = nvme_wait_request(req); 904 nvme_put_request(req); 905 906 if (qid && sc->nirqs > 1) { 907 ivect = 1 + (qid - 1) % (sc->nirqs - 1); 908 if (ivect == qid && sc->irq_handle[ivect]) { 909 bus_teardown_intr(sc->dev, 910 sc->irq[ivect], 911 sc->irq_handle[ivect]); 912 sc->irq_handle[ivect] = NULL; 913 } 914 } 915 916 return status; 917 } 918 919 /* 920 * Issue friendly shutdown to controller. 921 */ 922 int 923 nvme_issue_shutdown(nvme_softc_t *sc, int dopoll) 924 { 925 uint32_t reg; 926 int base_ticks; 927 int error; 928 929 /* 930 * Put us in shutdown 931 */ 932 reg = nvme_read(sc, NVME_REG_CONFIG); 933 reg &= ~NVME_CONFIG_SHUT_MASK; 934 reg |= NVME_CONFIG_SHUT_NORM; 935 nvme_write(sc, NVME_REG_CONFIG, reg); 936 937 /* 938 * Wait up to 10 seconds for acknowlegement 939 */ 940 error = ENXIO; 941 base_ticks = ticks; 942 while ((int)(ticks - base_ticks) < 10 * 20) { 943 reg = nvme_read(sc, NVME_REG_STATUS); 944 if ((reg & NVME_STATUS_SHUT_MASK) & NVME_STATUS_SHUT_DONE) { 945 error = 0; 946 break; 947 } 948 if (dopoll == 0) 949 nvme_os_sleep(50); /* 50ms poll */ 950 } 951 if (error) 952 device_printf(sc->dev, "Unable to shutdown chip nicely\n"); 953 else 954 device_printf(sc->dev, "Normal chip shutdown succeeded\n"); 955 956 return error; 957 } 958 959 /* 960 * Make space-padded string serial and model numbers more readable. 961 */ 962 size_t 963 string_cleanup(char *str, int domiddle) 964 { 965 size_t i; 966 size_t j; 967 int atbeg = 1; 968 969 for (i = j = 0; str[i]; ++i) { 970 if ((str[i] == ' ' || str[i] == '\r') && 971 (atbeg || domiddle)) { 972 continue; 973 } else { 974 atbeg = 0; 975 } 976 str[j] = str[i]; 977 ++j; 978 } 979 while (domiddle == 0 && j > 0 && (str[j-1] == ' ' || str[j-1] == '\r')) 980 --j; 981 str[j] = 0; 982 if (domiddle == 0) { 983 for (j = 0; str[j]; ++j) { 984 if (str[j] == ' ') 985 str[j] = '_'; 986 } 987 } 988 989 return j; 990 } 991