/*
 * Copyright (c) 2016-2018 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Most low-level chip related functions (other than attachment) reside in
 * this module. Most functions assume that the caller is already holding
 * appropriate locks to prevent SMP collisions.
 */

#include "nvme.h"

MALLOC_DEFINE(M_NVME, "NVMe Driver", "NVME");

/*
 * DMA mapping callbacks.
 */
static
void
nvme_dmamem_saveseg(void *info, bus_dma_segment_t *segs, int nsegs, int error)
{
	KKASSERT(error == 0);
	KKASSERT(nsegs == 1);
	*(bus_addr_t *)info = segs->ds_addr;
}

/*
 * Low-level chip enable/disable.
 */
int
nvme_enable(nvme_softc_t *sc, int enable)
{
	uint32_t reg;
	int error = 0;
	int base_ticks;

	reg = nvme_read(sc, NVME_REG_CONFIG);
	if (enable == 0 && (reg & NVME_CONFIG_EN)) {
		/*
		 * Disable the chip so we can program it.
		 */
		reg &= ~NVME_CONFIG_EN;
		nvme_write(sc, NVME_REG_CONFIG, reg);
	} else if (enable && (reg & NVME_CONFIG_EN) == 0) {
		/*
		 * Enable the chip once programmed.
		 */
		reg |= NVME_CONFIG_EN;
		nvme_write(sc, NVME_REG_CONFIG, reg);
	}
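
	/*
	 * Wait for the controller's ready status (CSTS.RDY) to match the
	 * requested enable state, polling every 50ms for up to sc->entimo
	 * ticks.
	 */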
	error = ENXIO;
	base_ticks = ticks;
	while ((int)(ticks - base_ticks) < sc->entimo) {
		reg = nvme_read(sc, NVME_REG_STATUS);
		if (enable == 0 && (reg & NVME_STATUS_RDY) == 0) {
			error = 0;
			break;
		}
		if (enable && (reg & NVME_STATUS_RDY)) {
			error = 0;
			break;
		}
		nvme_os_sleep(50);	/* 50ms poll */
	}

	/*
	 * Interrupt masking (only applicable when MSI-X is not used; NVMe
	 * spec sections 3.1.3 and 3.1.4 state that these registers should
	 * not be accessed when MSI-X is in use).
	 */
	if (error == 0 && sc->nirqs == 1) {
		if (enable) {
			nvme_write(sc, NVME_REG_INTSET, ~1);
			nvme_write(sc, NVME_REG_INTCLR, 1);
		} else {
			nvme_write(sc, NVME_REG_INTSET, ~1);
		}
	}

	if (error) {
		device_printf(sc->dev, "Cannot %s device\n",
			      (enable ? "enable" : "disable"));
	} else {
#if 0
		kprintf("gratuitous 15 second sleep\n");
		nvme_os_sleep(15000);
		kprintf("gratuitous 15 second sleep done\n");
#endif
	}
	return error;
}

/*
 * Allocate submission and completion queues. If qid is 0 we are allocating
 * the ADMIN queues, otherwise we are allocating I/O queues.
 */
int
nvme_alloc_subqueue(nvme_softc_t *sc, uint16_t qid)
{
	nvme_subqueue_t *queue = &sc->subqueues[qid];
	int error = 0;

	/*
	 * For now implement the maximum queue size negotiated in the
	 * attach.
	 */
	lockinit(&queue->lk, "nvqlk", 0, 0);
	queue->sc = sc;
	queue->nqe = sc->maxqe;
	queue->qid = qid;
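
	/*
	 * Each queue has its own doorbell register; the register offset is
	 * computed from the queue id and the controller's doorbell stride
	 * (sc->dstrd4).
	 */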
	queue->subq_doorbell_reg = NVME_REG_SUBQ_BELL(qid, sc->dstrd4);

	/*
	 * dma memory for the submission queue
	 */
	if (error == 0) {
		error = bus_dmamem_alloc(sc->sque_tag, (void **)&queue->ksubq,
					 BUS_DMA_ZERO, &queue->sque_map);
	}
	if (error == 0) {
		error = bus_dmamap_load(sc->sque_tag, queue->sque_map,
					queue->ksubq,
					bus_dma_tag_getmaxsize(sc->sque_tag),
					nvme_dmamem_saveseg, &queue->psubq,
					0);
	}

	/*
	 * dma memory for enough PRPs to map MAXPHYS bytes of memory per
	 * request. A MAXPHYS buffer which begins partially straddling
	 * a page boundary can still be accommodated because we have an
	 * additional PRP entry in cmd.head.
	 */
	if (error == 0) {
		error = bus_dmamem_alloc(sc->prps_tag, (void **)&queue->kprps,
					 BUS_DMA_ZERO, &queue->prps_map);
	}
	if (error == 0) {
		error = bus_dmamap_load(sc->prps_tag, queue->prps_map,
					queue->kprps,
					bus_dma_tag_getmaxsize(sc->prps_tag),
					nvme_dmamem_saveseg, &queue->pprps,
					0);
	}

	/*
	 * dma memory for admin data
	 */
	if (qid == 0 && error == 0) {
		error = bus_dmamem_alloc(sc->adm_tag,
					 (void **)&queue->kdatapgs,
					 BUS_DMA_ZERO, &queue->adm_map);
	}
	if (qid == 0 && error == 0) {
		error = bus_dmamap_load(sc->adm_tag, queue->adm_map,
					queue->kdatapgs,
					bus_dma_tag_getmaxsize(sc->adm_tag),
					nvme_dmamem_saveseg, &queue->pdatapgs,
					0);
	}

	/*
	 * Driver request structures
	 */
	if (error == 0) {
		nvme_request_t *req;
		uint32_t i;

		queue->reqary = kmalloc(sizeof(nvme_request_t) * queue->nqe,
					M_NVME, M_WAITOK | M_ZERO);
		for (i = 0; i < queue->nqe; ++i) {
			req = &queue->reqary[i];
			if (i == 0) {
				/*
				 * Set aside one request for dump operation
				 */
				queue->dump_req = req;
			} else {
				/*
				 * The rest go through the normal list
				 */
				req->next_avail = queue->first_avail;
				queue->first_avail = req;
			}
			req->subq = queue;
			req->comq = &sc->comqueues[queue->comqid];
			req->cmd_id = i;
			if (qid == 0) {
				req->info = &queue->kdatapgs[i];
				req->pinfo = queue->pdatapgs +
					     i * sizeof(nvme_admin_data_t);
			}
		}
	}

	/*
	 * Error handling
	 */
	if (error)
		nvme_free_subqueue(sc, qid);
	return error;
}

int
nvme_alloc_comqueue(nvme_softc_t *sc, uint16_t qid)
{
	nvme_comqueue_t *queue = &sc->comqueues[qid];
	int error = 0;

	/*
	 * For now implement the maximum queue size negotiated in the
	 * attach.
	 */
	lockinit(&queue->lk, "nvqlk", 0, 0);
	queue->sc = sc;
	queue->qid = qid;
	queue->phase = NVME_COMQ_STATUS_PHASE;
	queue->comq_doorbell_reg = NVME_REG_COMQ_BELL(qid, sc->dstrd4);

	if (error == 0) {
		error = bus_dmamem_alloc(sc->cque_tag, (void **)&queue->kcomq,
					 BUS_DMA_ZERO, &queue->cque_map);
	}
	if (error == 0) {
		error = bus_dmamap_load(sc->cque_tag, queue->cque_map,
					queue->kcomq,
					bus_dma_tag_getmaxsize(sc->cque_tag),
					nvme_dmamem_saveseg, &queue->pcomq,
					0);
	}

	/*
	 * Set nqe last. The comq polling loop tests this field and we
	 * do not want it to spuriously assume that the comq is initialized
	 * until it actually is.
	 */
	if (error == 0)
		queue->nqe = sc->maxqe;

	if (error)
		nvme_free_comqueue(sc, qid);
	return error;
}

void
nvme_free_subqueue(nvme_softc_t *sc, uint16_t qid)
{
	nvme_subqueue_t *queue = &sc->subqueues[qid];

	queue->first_avail = NULL;
	if (queue->reqary) {
		kfree(queue->reqary, M_NVME);
		queue->reqary = NULL;
	}
	if (queue->ksubq) {
		bus_dmamem_free(sc->sque_tag, queue->ksubq, queue->sque_map);
		bus_dmamap_unload(sc->sque_tag, queue->sque_map);
		bus_dmamap_destroy(sc->sque_tag, queue->sque_map);
	}
	if (queue->kprps) {
		bus_dmamem_free(sc->prps_tag, queue->kprps, queue->prps_map);
		bus_dmamap_unload(sc->prps_tag, queue->prps_map);
		bus_dmamap_destroy(sc->prps_tag, queue->prps_map);
	}
	if (queue->kdatapgs) {
		bus_dmamem_free(sc->adm_tag, queue->kdatapgs, queue->adm_map);
		bus_dmamap_unload(sc->adm_tag, queue->adm_map);
		bus_dmamap_destroy(sc->adm_tag, queue->adm_map);
	}
	bzero(queue, sizeof(*queue));
}

void
nvme_free_comqueue(nvme_softc_t *sc, uint16_t qid)
{
	nvme_comqueue_t *queue = &sc->comqueues[qid];

	/*
	 * Clear this field first so poll loops ignore the comq.
	 */
	queue->nqe = 0;

	if (queue->kcomq) {
		bus_dmamem_free(sc->cque_tag, queue->kcomq, queue->cque_map);
		bus_dmamap_unload(sc->cque_tag, queue->cque_map);
		bus_dmamap_destroy(sc->cque_tag, queue->cque_map);
	}
	bzero(queue, sizeof(*queue));
}

/*
 * ADMIN AND I/O REQUEST HANDLING
 */

/*
 * Obtain a request and handle DMA mapping the supplied kernel buffer.
 * Fields in cmd.head will be initialized and remaining fields will be zeroed.
 * Caller is responsible for filling in remaining fields as appropriate.
 *
 * Caller should NOT hold the queue lock; nvme_get_request() acquires it.
 */
nvme_request_t *
nvme_get_admin_request(nvme_softc_t *sc, uint8_t opcode)
{
	nvme_request_t *req;

	req = nvme_get_request(&sc->subqueues[0], opcode, NULL, 0);
	req->cmd.head.prp1 = req->pinfo;
	req->callback = NULL;

	return req;
}

static __inline
void
_nvme_fill_request(nvme_subqueue_t *queue, uint8_t opcode,
		   char *kva, size_t bytes,
		   nvme_request_t *req)
{
	/*
	 * Fill-in basic fields and do the DMA mapping.
	 */
	req->next_avail = NULL;
	KKASSERT(req->state == NVME_REQ_AVAIL);
	req->state = NVME_REQ_ALLOCATED;
	req->callback = NULL;
	req->waiting = 0;

	req->cmd.head.opcode = opcode;
	req->cmd.head.flags = NVME_SUBQFLG_PRP | NVME_SUBQFLG_NORM;
	req->cmd.head.cid = req->cmd_id;
	req->cmd.head.nsid = 0;
	req->cmd.head.mptr = 0;
	req->cmd.head.prp1 = 0;
	req->cmd.head.prp2 = 0;
	req->cmd.dw10 = 0;
	req->cmd.dw11 = 0;
	req->cmd.dw12 = 0;
	req->cmd.dw13 = 0;
	req->cmd.dw14 = 0;
	req->cmd.dw15 = 0;

	if (kva) {
		size_t count = 0;
		size_t idx = 0;
		vm_paddr_t paddr;
		vm_paddr_t pprptab;
		uint64_t *kprptab;
		KKASSERT(bytes >= 0 && bytes <= MAXPHYS);

		kprptab = queue->kprps +
			  (MAXPHYS / PAGE_SIZE) * req->cmd_id;
		pprptab = queue->pprps +
			  (MAXPHYS / PAGE_SIZE) * req->cmd_id *
			  sizeof(uint64_t);

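		/*
		 * Build the PRP entries.  PRP1 covers the first, possibly
		 * unaligned, chunk up to the next page boundary.  If the
		 * remainder fits in a single additional page, PRP2 points
		 * at that page directly; otherwise PRP2 points at this
		 * request's PRP list (kprptab/pprptab) which holds the
		 * remaining page-aligned entries.
		 */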
		while (count < bytes) {
			paddr = vtophys(kva + count);
			if (idx == 0) {
				KKASSERT((paddr & 3) == 0);
				req->cmd.head.prp1 = paddr;
				count += (((intptr_t)kva + PAGE_SIZE) &
					  ~(intptr_t)PAGE_MASK) -
					 (intptr_t)kva;
			} else if (idx == 1 && count + PAGE_SIZE >= bytes) {
				KKASSERT((paddr & PAGE_MASK) == 0);
				req->cmd.head.prp2 = paddr;
				count += PAGE_SIZE;
			} else {
				KKASSERT((paddr & PAGE_MASK) == 0);
				/* if (idx == 1) -- not needed, just repeat */
				req->cmd.head.prp2 = pprptab; /* repeat */
				kprptab[idx - 1] = paddr;
				count += PAGE_SIZE;
			}
			++idx;
		}
	}
}

/*
 * Obtain a request and handle DMA mapping the supplied kernel buffer.
 * Fields in cmd.head will be initialized and remaining fields will be zeroed.
 * Caller is responsible for filling in remaining fields as appropriate.
 *
 * May return NULL if no requests are available or if there is no room in
 * the submission queue to handle it (should only be possible on an I/O queue,
 * admin queue operations are managed).
 *
 * Caller should NOT hold the queue lock.
 */
nvme_request_t *
nvme_get_request(nvme_subqueue_t *queue, uint8_t opcode,
		 char *kva, size_t bytes)
{
	nvme_request_t *req;
	nvme_request_t *next;

	/*
	 * No easy lockless way to pull a new request off. We have to check
	 * for a number of conditions and there may be multiple threads
	 * making this call simultaneously, which complicates matters even
	 * more.
	 */
	lockmgr(&queue->lk, LK_EXCLUSIVE);

	/*
	 * Make sure the submission queue has room to accommodate the
	 * request. Requests can be completed out of order so the
	 * submission ring could still be full even though we have
	 * requests available.
	 */
	if ((queue->subq_tail + queue->unsubmitted + 1) % queue->nqe ==
	    queue->subq_head) {
		lockmgr(&queue->lk, LK_RELEASE);
		KKASSERT(queue->qid != 0);
		atomic_swap_int(&queue->signal_requeue, 1);

		return NULL;
	}

	/*
	 * Pop the next available request off of the first_avail linked
	 * list. An atomic op must be used here because nvme_put_request()
	 * returns requests to the list without holding queue->lk.
	 */
	for (;;) {
		req = queue->first_avail;
		cpu_ccfence();
		if (req == NULL) {
			lockmgr(&queue->lk, LK_RELEASE);
			KKASSERT(queue->qid != 0);
			atomic_swap_int(&queue->signal_requeue, 1);

			return NULL;
		}
		next = req->next_avail;
		if (atomic_cmpset_ptr(&queue->first_avail, req, next))
			break;
	}

	/*
	 * We have to keep track of unsubmitted requests in order to be
	 * able to properly check whether the ring is full or not (check
	 * is done at the top of this procedure, above).
	 */
	++queue->unsubmitted;
	lockmgr(&queue->lk, LK_RELEASE);

	_nvme_fill_request(queue, opcode, kva, bytes, req);

	return req;
}

/*
 * dump path only, cannot block. Allow the lock to fail and bump
 * queue->unsubmitted anyway.
 */
nvme_request_t *
nvme_get_dump_request(nvme_subqueue_t *queue, uint8_t opcode,
		      char *kva, size_t bytes)
{
	nvme_request_t *req;
	int error;

	error = lockmgr(&queue->lk, LK_EXCLUSIVE | LK_NOWAIT);
	req = queue->dump_req;
	++queue->unsubmitted;
	if (error == 0)
		lockmgr(&queue->lk, LK_RELEASE);
	_nvme_fill_request(queue, opcode, kva, bytes, req);

	return req;
}

/*
 * Submit request for execution. This will doorbell the subq.
 *
 * Caller must hold the queue lock.
 */
void
nvme_submit_request(nvme_request_t *req)
{
	nvme_subqueue_t *queue = req->subq;
	nvme_allcmd_t *cmd;

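	/*
	 * Copy the command into the next submission queue slot, advance
	 * the tail index (wrapping at nqe), and ring the submission
	 * doorbell to hand the new tail to the controller.
	 */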
	cmd = &queue->ksubq[queue->subq_tail];
	--queue->unsubmitted;
	if (++queue->subq_tail == queue->nqe)
		queue->subq_tail = 0;
	KKASSERT(queue->subq_tail != queue->subq_head);
	*cmd = req->cmd;
	cpu_sfence();	/* needed? */
	req->state = NVME_REQ_SUBMITTED;
	nvme_write(queue->sc, queue->subq_doorbell_reg, queue->subq_tail);
}

/*
 * Wait for a request to complete.
 *
 * Caller does not need to hold the queue lock; the request's completion
 * queue lock is acquired internally while polling for the completion.
 */
int
nvme_wait_request(nvme_request_t *req)
{
	struct lock *lk;
	int code;

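	/*
	 * Flag the request so the completion path issues a wakeup(), then
	 * poll the completion queue ourselves under the comq lock,
	 * sleeping between polls until the request leaves SUBMITTED state.
	 */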
	req->waiting = 1;
	if (req->state != NVME_REQ_COMPLETED) {
		lk = &req->comq->lk;
		cpu_lfence();
		lockmgr(lk, LK_EXCLUSIVE);
		while (req->state == NVME_REQ_SUBMITTED) {
			nvme_poll_completions(req->comq, lk);
			if (req->state != NVME_REQ_SUBMITTED)
				break;
			lksleep(req, lk, 0, "nvwait", hz);
		}
		lockmgr(lk, LK_RELEASE);
		KKASSERT(req->state == NVME_REQ_COMPLETED);
	}
	cpu_lfence();
	code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);

	return code;
}

/*
 * dump path only, we cannot block, and the lock is allowed
 * to fail. But still try to play nice with interrupt threads.
 */
int
nvme_poll_request(nvme_request_t *req)
{
	struct lock *lk;
	int code;
	int didlock = 500;	/* 500uS max */

	req->waiting = 1;
	if (req->state != NVME_REQ_COMPLETED) {
		lk = &req->comq->lk;
		cpu_lfence();
		while (lockmgr(lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
			if (--didlock == 0)
				break;
			tsc_delay(1000);	/* 1uS */
		}
		while (req->state == NVME_REQ_SUBMITTED) {
			nvme_poll_completions(req->comq, lk);
			if (req->state != NVME_REQ_SUBMITTED)
				break;
			lwkt_switch();
		}
		if (didlock)
			lockmgr(lk, LK_RELEASE);
		KKASSERT(req->state == NVME_REQ_COMPLETED);
	}
	cpu_lfence();
	code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);

	return code;
}

/*
 * Put request away, making it available for reuse. If this is an admin
 * request its auxiliary data page is also being released for reuse.
 *
 * Caller does NOT have to hold the queue lock.
 */
void
nvme_put_request(nvme_request_t *req)
{
	nvme_subqueue_t *queue = req->subq;
	nvme_request_t *next;

	/*
	 * Insert on head for best cache reuse.
	 */
	KKASSERT(req->state == NVME_REQ_COMPLETED);
	req->state = NVME_REQ_AVAIL;
	for (;;) {
		next = queue->first_avail;
		cpu_ccfence();
		req->next_avail = next;
		if (atomic_cmpset_ptr(&queue->first_avail, next, req))
			break;
	}

	/*
	 * If BIOs were deferred due to lack of request space signal the
	 * admin thread to requeue them. This is a bit messy and normally
	 * should not happen due to the large number of queue entries nvme
	 * usually has. Let it race for now (admin has a 1hz tick).
	 */
	if (atomic_swap_int(&queue->signal_requeue, 0)) {
		atomic_set_int(&queue->sc->admin_signal, ADMIN_SIG_REQUEUE);
		wakeup(&queue->sc->admin_signal);
	}
}

/*
 * dump path only.
 */
void
nvme_put_dump_request(nvme_request_t *req)
{
	KKASSERT(req->state == NVME_REQ_COMPLETED);
	req->state = NVME_REQ_AVAIL;
}

/*
 * Poll for completions on queue, copy the 16-byte hw result entry
 * into the request and poke the doorbell to update the controller's
 * understanding of comq_head.
 *
 * If lk is non-NULL it will be passed to the callback which typically
 * releases it temporarily when calling biodone() or doing other complex
 * work on the result.
 *
 * Caller must usually hold comq->lk.
 */
void
nvme_poll_completions(nvme_comqueue_t *comq, struct lock *lk)
{
	nvme_softc_t *sc = comq->sc;
	nvme_request_t *req;
	nvme_subqueue_t *subq;
	nvme_allres_t *res;
#if 0
	int didwork = 0;
#endif

	KKASSERT(comq->comq_tail < comq->nqe);
	cpu_lfence();		/* needed prior to first phase test */
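
	/*
	 * A completion entry is valid when its phase bit matches the phase
	 * we currently expect; the expected phase flips each time the ring
	 * wraps (see below).
	 */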
	for (;;) {
		/*
		 * WARNING! LOCK MAY HAVE BEEN TEMPORARILY LOST DURING LOOP.
		 */
		res = &comq->kcomq[comq->comq_tail];
		if ((res->tail.status ^ comq->phase) & NVME_COMQ_STATUS_PHASE)
			break;

		/*
		 * Process result on completion queue.
		 *
		 * Bump comq_tail, flip the phase detect when we roll-over.
		 * doorbell every 1/4 queue and at the end of the loop.
		 */
		if (++comq->comq_tail == comq->nqe) {
			comq->comq_tail = 0;
			comq->phase ^= NVME_COMQ_STATUS_PHASE;
		}

		/*
		 * WARNING! I imploded the chip by reusing a command id
		 *	    before it was discarded in the completion queue
		 *	    via the doorbell, so for now we always write
		 *	    the doorbell before marking the request as
		 *	    COMPLETED (it can be reused instantly upon
		 *	    being marked).
		 */
#if 0
		if (++didwork == (comq->nqe >> 2)) {
			didwork = 0;
			nvme_write(comq->sc, comq->comq_doorbell_reg,
				   comq->comq_tail);
		}
#endif
		cpu_lfence();	/* needed prior to content check */

		/*
		 * Locate the request and related submission queue. The
		 * request could be on a different queue. A submission
		 * queue can have only one completion queue, so we can
		 * update subq_head without locking the submission queue.
		 */
		subq = &sc->subqueues[res->tail.subq_id];
		subq->subq_head = res->tail.subq_head_ptr;
		req = &subq->reqary[res->tail.cmd_id];

		/*
		 * Copy the fields and wakeup anyone waiting on req.
		 * The response field in the completion queue can be reused
		 * once we doorbell which is why we make a copy.
		 */
		KKASSERT(req->state == NVME_REQ_SUBMITTED &&
			 req->comq == comq);
		req->res = *res;
		nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
		cpu_sfence();
		req->state = NVME_REQ_COMPLETED;
		if (req->callback) {
			req->callback(req, lk);
		} else if (req->waiting) {
			wakeup(req);
		}
	}
#if 0
	if (didwork)
		nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
#endif
}

/*
 * Core interrupt handler (called from dedicated interrupt thread, possibly
 * preempts other threads).
 *
 * NOTE: For pin-based level interrupts, the chipset interrupt is cleared
 *	 automatically once all the head doorbells are updated. However,
 *	 most chipsets assume MSI-X will be used and MAY NOT IMPLEMENT
 *	 pin-based interrupts properly. I found the BPX card, for example,
 *	 is unable to clear a pin-based interrupt.
 */
void
nvme_intr(void *arg)
{
	nvme_comqueue_t *comq = arg;
	nvme_softc_t *sc;
	int i;
	int skip;

	/*
	 * Process all completion queues associated with this vector. The
	 * interrupt is masked in the APIC. Do NOT mess with the NVMe
	 * masking registers because (1) We don't need to and it wastes time,
	 * and (2) We aren't supposed to touch them if using MSI-X anyway.
	 */
	sc = comq->sc;
	if (sc->nirqs == 1)
		skip = 1;
	else
		skip = sc->nirqs - 1;

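	/*
	 * With multiple vectors the I/O completion queues are distributed
	 * round-robin across vectors 1..nirqs-1, so each handler walks the
	 * comqueues in strides of (nirqs - 1) starting at its own qid.
	 * With a single vector (skip == 1) every queue is checked.
	 */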
	for (i = comq->qid; i <= sc->niocomqs; i += skip) {
		if (comq->nqe) {
			lockmgr(&comq->lk, LK_EXCLUSIVE);
			nvme_poll_completions(comq, &comq->lk);
			lockmgr(&comq->lk, LK_RELEASE);
		}
		comq += skip;
	}
}

/*
 * ADMIN HELPER COMMAND ROLLUP FUNCTIONS
 */
/*
 * Issue command to create a submission queue.
 */
int
nvme_create_subqueue(nvme_softc_t *sc, uint16_t qid)
{
	nvme_request_t *req;
	nvme_subqueue_t *subq = &sc->subqueues[qid];
	int status;

	req = nvme_get_admin_request(sc, NVME_OP_CREATE_SUBQ);
	req->cmd.head.prp1 = subq->psubq;
	req->cmd.crsub.subq_id = qid;
	req->cmd.crsub.subq_size = subq->nqe - 1;	/* 0's based value */
	req->cmd.crsub.flags = NVME_CREATESUB_PC | NVME_CREATESUB_PRI_URG;
	req->cmd.crsub.comq_id = subq->comqid;

	nvme_submit_request(req);
	status = nvme_wait_request(req);
	nvme_put_request(req);

	return status;
}

/*
 * Issue command to create a completion queue.
 */
int
nvme_create_comqueue(nvme_softc_t *sc, uint16_t qid)
{
	nvme_request_t *req;
	nvme_comqueue_t *comq = &sc->comqueues[qid];
	int status;
	int error;
	uint16_t ivect;

	error = 0;
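
	/*
	 * When multiple MSI-X vectors are available, I/O completion queues
	 * are assigned to vectors round-robin (1 + (qid - 1) % (nirqs - 1)).
	 * The interrupt handler for a vector is installed the first time a
	 * queue maps onto it (ivect == qid).
	 */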
	if (sc->nirqs > 1) {
		ivect = 1 + (qid - 1) % (sc->nirqs - 1);
		if (qid && ivect == qid) {
			error = bus_setup_intr(sc->dev, sc->irq[ivect],
					       INTR_MPSAFE | INTR_HIFREQ,
					       nvme_intr,
					       &sc->comqueues[ivect],
					       &sc->irq_handle[ivect],
					       NULL);
		}
	} else {
		ivect = 0;
	}
	if (error)
		return error;

	req = nvme_get_admin_request(sc, NVME_OP_CREATE_COMQ);
	req->cmd.head.prp1 = comq->pcomq;
	req->cmd.crcom.comq_id = qid;
	req->cmd.crcom.comq_size = comq->nqe - 1;	/* 0's based value */
	req->cmd.crcom.ivect = ivect;
	req->cmd.crcom.flags = NVME_CREATECOM_PC | NVME_CREATECOM_IEN;

	nvme_submit_request(req);
	status = nvme_wait_request(req);
	nvme_put_request(req);

	/*
	 * Ooops, create failed, undo the irq setup
	 */
	if (sc->nirqs > 1 && status) {
		ivect = 1 + (qid - 1) % (sc->nirqs - 1);
		if (qid && ivect == qid) {
			bus_teardown_intr(sc->dev,
					  sc->irq[ivect],
					  sc->irq_handle[ivect]);
			sc->irq_handle[ivect] = NULL;
		}
	}

	return status;
}

/*
 * Issue command to delete a submission queue.
 */
int
nvme_delete_subqueue(nvme_softc_t *sc, uint16_t qid)
{
	nvme_request_t *req;
	/*nvme_subqueue_t *subq = &sc->subqueues[qid];*/
	int status;

	req = nvme_get_admin_request(sc, NVME_OP_DELETE_SUBQ);
	req->cmd.head.prp1 = 0;
	req->cmd.delete.qid = qid;

	nvme_submit_request(req);
	status = nvme_wait_request(req);
	nvme_put_request(req);

	return status;
}

/*
 * Issue command to delete a completion queue.
 */
int
nvme_delete_comqueue(nvme_softc_t *sc, uint16_t qid)
{
	nvme_request_t *req;
	nvme_comqueue_t *comq = &sc->comqueues[qid];
	int status;
	uint16_t ivect;

	if (comq->sc == NULL)
		return 0;

	req = nvme_get_admin_request(sc, NVME_OP_DELETE_COMQ);
	req->cmd.head.prp1 = 0;
	req->cmd.delete.qid = qid;

	nvme_submit_request(req);
	status = nvme_wait_request(req);
	nvme_put_request(req);

	if (qid && sc->nirqs > 1) {
		ivect = 1 + (qid - 1) % (sc->nirqs - 1);
		if (ivect == qid && sc->irq_handle[ivect]) {
			bus_teardown_intr(sc->dev,
					  sc->irq[ivect],
					  sc->irq_handle[ivect]);
			sc->irq_handle[ivect] = NULL;
		}
	}

	return status;
}

/*
 * Issue friendly shutdown to controller.
 */
int
nvme_issue_shutdown(nvme_softc_t *sc, int dopoll)
{
	uint32_t reg;
	int base_ticks;
	int error;

	/*
	 * Put us in shutdown
	 */
	reg = nvme_read(sc, NVME_REG_CONFIG);
	reg &= ~NVME_CONFIG_SHUT_MASK;
	reg |= NVME_CONFIG_SHUT_NORM;
	nvme_write(sc, NVME_REG_CONFIG, reg);

	/*
	 * Wait up to 10 seconds for acknowledgement
	 */
	error = ENXIO;
	base_ticks = ticks;
	while ((int)(ticks - base_ticks) < 10 * 20) {
		reg = nvme_read(sc, NVME_REG_STATUS);
		if ((reg & NVME_STATUS_SHUT_MASK) & NVME_STATUS_SHUT_DONE) {
			error = 0;
			break;
		}
		if (dopoll == 0)
			nvme_os_sleep(50);	/* 50ms poll */
	}
	if (error)
		device_printf(sc->dev, "Unable to shutdown chip nicely\n");
	else
		device_printf(sc->dev, "Normal chip shutdown succeeded\n");

	return error;
}

/*
 * Make space-padded string serial and model numbers more readable.
 */
size_t
string_cleanup(char *str, int domiddle)
{
	size_t i;
	size_t j;
	int atbeg = 1;

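	/*
	 * Copy the string over itself, dropping leading spaces and CRs
	 * (and interior ones too when domiddle is set).
	 */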
	for (i = j = 0; str[i]; ++i) {
		if ((str[i] == ' ' || str[i] == '\r') &&
		    (atbeg || domiddle)) {
			continue;
		} else {
			atbeg = 0;
		}
		str[j] = str[i];
		++j;
	}
	while (domiddle == 0 && j > 0 && (str[j-1] == ' ' || str[j-1] == '\r'))
		--j;
	str[j] = 0;
	if (domiddle == 0) {
		for (j = 0; str[j]; ++j) {
			if (str[j] == ' ')
				str[j] = '_';
		}
	}

	return j;
}