xref: /dragonfly/sys/dev/disk/nvme/nvme.c (revision 0066c2fb)
1 /*
2  * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Most low-level chip related functions (other than attachment) reside in
36  * this module.  Most functions assume that the caller is already holding
37  * appropriate locks to prevent SMP collisions.
38  */
39 
40 #include "nvme.h"
41 
42 MALLOC_DEFINE(M_NVME, "NVMe Driver", "NVME");
43 
44 /*
45  * DMA mapping callbacks.
46  */
47 static
48 void
49 nvme_dmamem_saveseg(void *info, bus_dma_segment_t *segs, int nsegs, int error)
50 {
51 	KKASSERT(error == 0);
52 	KKASSERT(nsegs == 1);
53 	*(bus_addr_t *)info = segs->ds_addr;
54 }
55 
56 /*
57  * Low-level chip enable/disable.
58  */
59 int
60 nvme_enable(nvme_softc_t *sc, int enable)
61 {
62 	uint32_t reg;
63 	int error = 0;
64 	int base_ticks;
65 
66 	reg = nvme_read(sc, NVME_REG_CONFIG);
67 	if (enable == 0 && (reg & NVME_CONFIG_EN)) {
68 		/*
69 		 * Disable the chip so we can program it.
70 		 */
71 		reg &= ~NVME_CONFIG_EN;
72 		nvme_write(sc, NVME_REG_CONFIG, reg);
73 	} else if (enable && (reg & NVME_CONFIG_EN) == 0) {
74 		/*
75 		 * Enable the chip once programmed.
76 		 */
77 		reg |= NVME_CONFIG_EN;
78 		nvme_write(sc, NVME_REG_CONFIG, reg);
79 	}
80 	error = ENXIO;
81 	base_ticks = ticks;
82 	while ((int)(ticks - base_ticks) < sc->entimo) {
83 		reg = nvme_read(sc, NVME_REG_STATUS);
84 		if (enable == 0 && (reg & NVME_STATUS_RDY) == 0) {
85 			error = 0;
86 			break;
87 		}
88 		if (enable && (reg & NVME_STATUS_RDY)) {
89 			error = 0;
90 			break;
91 		}
92 		nvme_os_sleep(50);	/* 50ms poll */
93 	}
94 
95 	/*
96 	 * Interrupt masking (only applicable when MSI-X is not used; NVMe spec
97 	 * sections 3.1.3 and 3.1.4 forbid touching these registers with MSI-X)
98 	 */
99 	if (error == 0 && sc->nirqs == 1) {
100 		if (enable) {
101 			nvme_write(sc, NVME_REG_INTSET, ~1);
102 			nvme_write(sc, NVME_REG_INTCLR, 1);
103 		} else {
104 			nvme_write(sc, NVME_REG_INTSET, ~1);
105 		}
106 	}
107 
108 	if (error) {
109 		device_printf(sc->dev, "Cannot %s device\n",
110 			      (enable ? "enable" : "disable"));
111 	} else {
112 #if 0
113 		kprintf("gratuitous 15 second sleep\n");
114 		nvme_os_sleep(15000);
115 		kprintf("gratuitous 15 second sleep done\n");
116 #endif
117 	}
118 	return error;
119 }
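
/*
 * Typical usage (illustrative): the attach code first calls
 * nvme_enable(sc, 0) to force EN off and wait for RDY to drop, programs
 * the admin queue and remaining CONFIG fields, and then calls
 * nvme_enable(sc, 1) to set EN and wait up to sc->entimo ticks for RDY.
 */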
120 
121 /*
122  * Allocate submission and completion queues.  If qid is 0 we are allocating
123  * the ADMIN queues, otherwise we are allocating I/O queues.
124  */
125 int
126 nvme_alloc_subqueue(nvme_softc_t *sc, uint16_t qid)
127 {
128 	nvme_subqueue_t *queue = &sc->subqueues[qid];
129 	int error = 0;
130 
131 	/*
132 	 * For now, use the maximum queue size negotiated during the
133 	 * attach.
134 	 */
135 	lockinit(&queue->lk, "nvqlk", 0, 0);
136 	queue->sc = sc;
137 	queue->nqe = sc->maxqe;
138 	queue->qid = qid;
139 	queue->subq_doorbell_reg = NVME_REG_SUBQ_BELL(qid, sc->dstrd4);
140 
141 	/*
142 	 * dma memory for the submission queue
143 	 */
144 	if (error == 0) {
145 		error = bus_dmamem_alloc(sc->sque_tag, (void **)&queue->ksubq,
146 					 BUS_DMA_ZERO, &queue->sque_map);
147 	}
148 	if (error == 0) {
149 		error = bus_dmamap_load(sc->sque_tag, queue->sque_map,
150 					queue->ksubq,
151 					bus_dma_tag_getmaxsize(sc->sque_tag),
152 					nvme_dmamem_saveseg, &queue->psubq,
153 					0);
154 	}
155 
156 	/*
157 	 * dma memory for enough PRPs to map MAXPHYS bytes of memory per
158 	 * request.  A MAXPHYS buffer that begins partially straddling
159 	 * a page boundary can still be accommodated because we have an
160 	 * additional PRP entry in cmd.head.
161 	 */
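	/*
	 * Layout example (illustrative): each request consumes
	 * MAXPHYS / PAGE_SIZE PRP slots out of this block, indexed by
	 * cmd_id when the PRP list is built in nvme_get_request().  With
	 * a typical 128KB MAXPHYS and 4KB pages that is 32 slots
	 * (256 bytes) per request.
	 */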
162 	if (error == 0) {
163 		error = bus_dmamem_alloc(sc->prps_tag, (void **)&queue->kprps,
164 					 BUS_DMA_ZERO, &queue->prps_map);
165 	}
166 	if (error == 0) {
167 		error = bus_dmamap_load(sc->prps_tag, queue->prps_map,
168 					queue->kprps,
169 					bus_dma_tag_getmaxsize(sc->prps_tag),
170 					nvme_dmamem_saveseg, &queue->pprps,
171 					0);
172 	}
173 
174 	/*
175 	 * dma memory for admin data
176 	 */
177 	if (qid == 0 && error == 0) {
178 		error = bus_dmamem_alloc(sc->adm_tag,
179 					 (void **)&queue->kdatapgs,
180 					 BUS_DMA_ZERO, &queue->adm_map);
181 	}
182 	if (qid == 0 && error == 0) {
183 		error = bus_dmamap_load(sc->adm_tag, queue->adm_map,
184 					queue->kdatapgs,
185 					bus_dma_tag_getmaxsize(sc->adm_tag),
186 					nvme_dmamem_saveseg, &queue->pdatapgs,
187 					0);
188 	}
189 
190 	/*
191 	 * Driver request structures
192 	 */
193 	if (error == 0) {
194 		nvme_request_t *req;
195 		uint32_t i;
196 
197 		queue->reqary = kmalloc(sizeof(nvme_request_t) * queue->nqe,
198 					M_NVME, M_WAITOK | M_ZERO);
199 		for (i = 0; i < queue->nqe; ++i) {
200 			req = &queue->reqary[i];
201 			req->next_avail = queue->first_avail;
202 			queue->first_avail = req;
203 			req->subq = queue;
204 			req->comq = &sc->comqueues[queue->comqid];
205 			req->cmd_id = i;
206 			if (qid == 0) {
207 				req->info = &queue->kdatapgs[i];
208 				req->pinfo = queue->pdatapgs +
209 					     i * sizeof(nvme_admin_data_t);
210 			}
211 		}
212 	}
213 
214 	/*
215 	 * Error handling
216 	 */
217 	if (error)
218 		nvme_free_subqueue(sc, qid);
219 	return error;
220 }
221 
222 int
223 nvme_alloc_comqueue(nvme_softc_t *sc, uint16_t qid)
224 {
225 	nvme_comqueue_t *queue = &sc->comqueues[qid];
226 	int error = 0;
227 
228 	/*
229 	 * For now, use the maximum queue size negotiated during the
230 	 * attach.
231 	 */
232 	lockinit(&queue->lk, "nvqlk", 0, 0);
233 	queue->sc = sc;
234 	queue->qid = qid;
235 	queue->phase = NVME_COMQ_STATUS_PHASE;
236 	queue->comq_doorbell_reg = NVME_REG_COMQ_BELL(qid, sc->dstrd4);
237 
238 	if (error == 0) {
239 		error = bus_dmamem_alloc(sc->cque_tag, (void **)&queue->kcomq,
240 					 BUS_DMA_ZERO, &queue->cque_map);
241 	}
242 	if (error == 0) {
243 		error = bus_dmamap_load(sc->cque_tag, queue->cque_map,
244 					queue->kcomq,
245 					bus_dma_tag_getmaxsize(sc->cque_tag),
246 					nvme_dmamem_saveseg, &queue->pcomq,
247 					0);
248 	}
249 
250 	/*
251 	 * Set nqe last.  The comq polling loop tests this field and we
252 	 * do not want it to spuriously assume that the comq is initialized
253 	 * until it actually is.
254 	 */
255 	if (error == 0)
256 		queue->nqe = sc->maxqe;
257 
258 	if (error)
259 		nvme_free_comqueue(sc, qid);
260 	return error;
261 }
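
/*
 * Ordering note (illustrative): because a submission queue names its
 * completion queue (cmd.crsub.comq_id), the usual I/O queue bring-up is
 * to allocate both queues here and then issue nvme_create_comqueue()
 * before nvme_create_subqueue() for the same qid.
 */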
262 
263 void
264 nvme_free_subqueue(nvme_softc_t *sc, uint16_t qid)
265 {
266 	nvme_subqueue_t *queue = &sc->subqueues[qid];
267 
268 	queue->first_avail = NULL;
269 	if (queue->reqary) {
270 		kfree(queue->reqary, M_NVME);
271 		queue->reqary = NULL;
272 	}
273 	if (queue->ksubq) {
274 		bus_dmamem_free(sc->sque_tag, queue->ksubq, queue->sque_map);
275 		bus_dmamap_unload(sc->sque_tag, queue->sque_map);
276 		bus_dmamap_destroy(sc->sque_tag, queue->sque_map);
277 	}
278 	if (queue->kprps) {
279 		bus_dmamem_free(sc->prps_tag, queue->kprps, queue->prps_map);
280 		bus_dmamap_unload(sc->prps_tag, queue->prps_map);
281 		bus_dmamap_destroy(sc->prps_tag, queue->prps_map);
282 	}
283 	if (queue->kdatapgs) {
284 		bus_dmamem_free(sc->adm_tag, queue->kdatapgs, queue->adm_map);
285 		bus_dmamap_unload(sc->adm_tag, queue->adm_map);
286 		bus_dmamap_destroy(sc->adm_tag, queue->adm_map);
287 	}
288 	bzero(queue, sizeof(*queue));
289 }
290 
291 void
292 nvme_free_comqueue(nvme_softc_t *sc, uint16_t qid)
293 {
294 	nvme_comqueue_t *queue = &sc->comqueues[qid];
295 
296 	/*
297 	 * Clear this field first so poll loops ignore the comq.
298 	 */
299 	queue->nqe = 0;
300 
301 	if (queue->kcomq) {
302 		bus_dmamem_free(sc->cque_tag, queue->kcomq, queue->cque_map);
303 		bus_dmamap_unload(sc->cque_tag, queue->cque_map);
304 		bus_dmamap_destroy(sc->cque_tag, queue->cque_map);
305 	}
306 	bzero(queue, sizeof(*queue));
307 }
308 
309 /*
310  * ADMIN AND I/O REQUEST HANDLING
311  */
312 
313 /*
314  * Obtain an ADMIN request.  No caller buffer is mapped; instead,
315  * cmd.head.prp1 is pointed at the request's dedicated admin data page.
316  * Remaining fields are zero'd and the caller fills them in as appropriate.
317  *
318  * Caller should NOT hold the queue lock (nvme_get_request() acquires it).
319  */
320 nvme_request_t *
321 nvme_get_admin_request(nvme_softc_t *sc, uint8_t opcode)
322 {
323 	nvme_request_t *req;
324 
325 	req = nvme_get_request(&sc->subqueues[0], opcode, NULL, 0);
326 	req->cmd.head.prp1 = req->pinfo;
327 	req->callback = NULL;
328 
329 	return req;
330 }
331 
336 /*
337  * Obtain a request and handle DMA mapping the supplied kernel buffer.
338  * Fields in cmd.head will be initialized and remaining fields will be zero'd.
339  * Caller is responsible for filling in remaining fields as appropriate.
340  *
341  * May return NULL if no requests are available or if there is no room in
342  * the submission queue to handle it (should only be possible on an I/O queue,
343  * admin queue operations are managed).
344  *
345  * Caller should NOT hold the queue lock.
346  */
347 nvme_request_t *
348 nvme_get_request(nvme_subqueue_t *queue, uint8_t opcode,
349 		 char *kva, size_t bytes)
350 {
351 	nvme_request_t *req;
352 	nvme_request_t *next;
353 
354 	/*
355 	 * No easy lockless way to pull a new request off.  We have to check
356 	 * for a number of conditions and there may be multiple threads
357 	 * making this call simultaneously, which complicates matters even
358 	 * more.
359 	 */
360 	lockmgr(&queue->lk, LK_EXCLUSIVE);
361 
362 	/*
363 	 * Make sure the submission queue has room to accommodate the
364 	 * request.  Requests can be completed out of order so the
365 	 * submission ring could still be full even though we have
366 	 * requests available.
367 	 */
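	/*
	 * Worked example (illustrative): with nqe == 8, subq_head == 2,
	 * subq_tail == 6 and three requests handed out but not yet
	 * submitted, (6 + 3 + 1) % 8 == 2 == subq_head, so handing out a
	 * fourth request could wrap the ring onto unconsumed entries once
	 * everything is submitted; we fail the allocation instead.
	 */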
368 	if ((queue->subq_tail + queue->unsubmitted + 1) % queue->nqe ==
369 	    queue->subq_head) {
370 		lockmgr(&queue->lk, LK_RELEASE);
371 		KKASSERT(queue->qid != 0);
372 		atomic_swap_int(&queue->signal_requeue, 1);
373 
374 		return NULL;
375 	}
376 
377 	/*
378 	 * Pop the next available request off of the first_avail linked
379 	 * list.  An atomic op must be used here because nvme_put_request()
380 	 * returns requests to the list without holding queue->lk.
381 	 */
382 	for (;;) {
383 		req = queue->first_avail;
384 		cpu_ccfence();
385 		if (req == NULL) {
386 			lockmgr(&queue->lk, LK_RELEASE);
387 			KKASSERT(queue->qid != 0);
388 			atomic_swap_int(&queue->signal_requeue, 1);
389 
390 			return NULL;
391 		}
392 		next = req->next_avail;
393 		if (atomic_cmpset_ptr(&queue->first_avail, req, next))
394 			break;
395 	}
396 
397 	/*
398 	 * We have to keep track of unsubmitted requests in order to be
399 	 * able to properly check whether the ring is full or not (check
400 	 * is done at the top of this procedure, above).
401 	 */
402 	++queue->unsubmitted;
403 	lockmgr(&queue->lk, LK_RELEASE);
404 
405 	/*
406 	 * Fill in basic fields and do the DMA mapping.
407 	 */
408 	req->next_avail = NULL;
409 	KKASSERT(req->state == NVME_REQ_AVAIL);
410 	req->state = NVME_REQ_ALLOCATED;
411 	req->callback = NULL;
412 	req->waiting = 0;
413 
414 	req->cmd.head.opcode = opcode;
415 	req->cmd.head.flags = NVME_SUBQFLG_PRP | NVME_SUBQFLG_NORM;
416 	req->cmd.head.cid = req->cmd_id;
417 	req->cmd.head.nsid = 0;
418 	req->cmd.head.mptr = 0;
419 	req->cmd.head.prp1 = 0;
420 	req->cmd.head.prp2 = 0;
421 	req->cmd.dw10 = 0;
422 	req->cmd.dw11 = 0;
423 	req->cmd.dw12 = 0;
424 	req->cmd.dw13 = 0;
425 	req->cmd.dw14 = 0;
426 	req->cmd.dw15 = 0;
427 
428 	if (kva) {
429 		size_t count = 0;
430 		size_t idx = 0;
431 		vm_paddr_t paddr;
432 		vm_paddr_t pprptab;
433 		uint64_t *kprptab;
434 		KKASSERT(bytes >= 0 && bytes <= MAXPHYS);
435 
436 		kprptab = queue->kprps +
437 			  (MAXPHYS / PAGE_SIZE) * req->cmd_id;
438 		pprptab = queue->pprps +
439 			  (MAXPHYS / PAGE_SIZE) * req->cmd_id *
440 			  sizeof(uint64_t);
441 
442 		while (count < bytes) {
443 			paddr = vtophys(kva + count);
444 			if (idx == 0) {
445 				KKASSERT((paddr & 3) == 0);
446 				req->cmd.head.prp1 = paddr;
447 				count += (((intptr_t)kva + PAGE_SIZE) &
448 					  ~(intptr_t)PAGE_MASK) -
449 					 (intptr_t)kva;
450 			} else if (idx == 1 && count + PAGE_SIZE >= bytes) {
451 				KKASSERT((paddr & PAGE_MASK) == 0);
452 				req->cmd.head.prp2 = paddr;
453 				count += PAGE_SIZE;
454 			} else {
455 				KKASSERT((paddr & PAGE_MASK) == 0);
456 				/* if (idx == 1) -- not needed, just repeat */
457 				req->cmd.head.prp2 = pprptab; /* repeat */
458 				kprptab[idx - 1] = paddr;
459 				count += PAGE_SIZE;
460 			}
461 			++idx;
462 		}
463 	}
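
	/*
	 * PRP layout example (illustrative): a 16KB transfer whose kva
	 * begins 512 bytes into a page is mapped by the loop above as
	 *
	 *	prp1	     = physical address of the partial first page
	 *	prp2	     = pprptab (this request's PRP list)
	 *	kprptab[0-3] = the four remaining page-aligned pages
	 *
	 * Only when a transfer fits entirely within two pages is prp2
	 * used directly as the second page address instead of the list.
	 */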
464 	return req;
465 }
466 
467 /*
468  * Submit request for execution.  This will doorbell the subq.
469  *
470  * Caller must hold the queue lock.
471  */
472 void
473 nvme_submit_request(nvme_request_t *req)
474 {
475 	nvme_subqueue_t *queue = req->subq;
476 	nvme_allcmd_t *cmd;
477 
478 	cmd = &queue->ksubq[queue->subq_tail];
479 	--queue->unsubmitted;
480 	if (++queue->subq_tail == queue->nqe)
481 		queue->subq_tail = 0;
482 	KKASSERT(queue->subq_tail != queue->subq_head);
483 	*cmd = req->cmd;
484 	cpu_sfence();	/* needed? */
485 	req->state = NVME_REQ_SUBMITTED;
486 	nvme_write(queue->sc, queue->subq_doorbell_reg, queue->subq_tail);
487 }
488 
489 /*
490  * Wait for a request to complete.
491  *
492  * Caller does not need to hold the queue lock.  If it does, or if it
493  * Caller does not need to hold any queue lock; the completion queue
494  * lock is acquired internally while polling and is released across
495  * the sleep between polls.
496 int
497 nvme_wait_request(nvme_request_t *req, int ticks)
498 {
499 	struct lock *lk;
500 	int code;
501 
502 	req->waiting = 1;
503 	if (req->state != NVME_REQ_COMPLETED) {
504 		lk = &req->comq->lk;
505 		cpu_lfence();
506 		lockmgr(lk, LK_EXCLUSIVE);
507 		while (req->state == NVME_REQ_SUBMITTED) {
508 			nvme_poll_completions(req->comq, lk);
509 			if (req->state != NVME_REQ_SUBMITTED)
510 				break;
511 			lksleep(req, lk, 0, "nvwait", hz);
512 		}
513 		lockmgr(lk, LK_RELEASE);
514 		KKASSERT(req->state == NVME_REQ_COMPLETED);
515 	}
516 	cpu_lfence();
517 	code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
518 
519 	return code;
520 }
521 
522 /*
523  * Put request away, making it available for reuse.  If this is an admin
524  * request, its auxiliary data page is also released for reuse.
525  *
526  * Caller does NOT have to hold the queue lock.
527  */
528 void
529 nvme_put_request(nvme_request_t *req)
530 {
531 	nvme_subqueue_t *queue = req->subq;
532 	nvme_request_t *next;
533 
534 	/*
535 	 * Insert on head for best cache reuse.
536 	 */
537 	KKASSERT(req->state == NVME_REQ_COMPLETED);
538 	req->state = NVME_REQ_AVAIL;
539 	for (;;) {
540 		next = queue->first_avail;
541 		cpu_ccfence();
542 		req->next_avail = next;
543 		if (atomic_cmpset_ptr(&queue->first_avail, next, req))
544 			break;
545 	}
546 
547 	/*
548 	 * If BIOs were deferred due to lack of request space signal the
549 	 * admin thread to requeue them.  This is a bit messy and normally
550 	 * should not happen due to the large number of queue entries nvme
551 	 * usually has.  Let it race for now (admin has a 1hz tick).
552 	 */
553 	if (atomic_swap_int(&queue->signal_requeue, 0)) {
554 		atomic_set_int(&queue->sc->admin_signal, ADMIN_SIG_REQUEUE);
555 		wakeup(&queue->sc->admin_signal);
556 	}
557 }
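
#if 0
/*
 * Illustrative request life cycle (sketch only, not compiled).  The
 * opcode, namespace id, 512-byte LBA format, and the helper name itself
 * are hypothetical; see the admin helpers below for real callers of the
 * same get/submit/wait/put sequence.
 */
static int
nvme_example_io(nvme_subqueue_t *subq, uint8_t opcode,
		char *kva, size_t bytes, uint64_t lba)
{
	nvme_request_t *req;
	int status;

	req = nvme_get_request(subq, opcode, kva, bytes);
	if (req == NULL)		/* ring full, caller must requeue */
		return EWOULDBLOCK;
	req->cmd.head.nsid = 1;			/* hypothetical namespace */
	req->cmd.dw10 = (uint32_t)lba;		/* starting LBA (low) */
	req->cmd.dw11 = (uint32_t)(lba >> 32);	/* starting LBA (high) */
	req->cmd.dw12 = (uint32_t)(bytes / 512) - 1;	/* 0's based count */

	lockmgr(&subq->lk, LK_EXCLUSIVE);
	nvme_submit_request(req);		/* doorbells the subq */
	lockmgr(&subq->lk, LK_RELEASE);

	status = nvme_wait_request(req, hz);
	nvme_put_request(req);			/* make available for reuse */

	return (status ? EIO : 0);
}
#endif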
558 
559 /*
560  * Poll for completions on queue, copy the 16-byte hw result entry
561  * into the request and poke the doorbell to update the controller's
562  * understanding of comq_head.
563  *
564  * If lk is non-NULL it will be passed to the callback which typically
565  * releases it temporarily when calling biodone() or doing other complex
566  * work on the result.
567  *
568  * Caller must usually hold comq->lk.
569  */
570 void
571 nvme_poll_completions(nvme_comqueue_t *comq, struct lock *lk)
572 {
573 	nvme_softc_t *sc = comq->sc;
574 	nvme_request_t *req;
575 	nvme_subqueue_t *subq;
576 	nvme_allres_t *res;
577 #if 0
578 	int didwork = 0;
579 #endif
580 
581 	KKASSERT(comq->comq_tail < comq->nqe);
582 	cpu_lfence();		/* needed prior to first phase test */
583 	for (;;) {
584 		/*
585 		 * WARNING! LOCK MAY HAVE BEEN TEMPORARILY LOST DURING LOOP.
586 		 */
587 		res = &comq->kcomq[comq->comq_tail];
588 		if ((res->tail.status ^ comq->phase) & NVME_COMQ_STATUS_PHASE)
589 			break;
590 
591 		/*
592 		 * Process result on completion queue.
593 		 *
594 		 * Bump comq_tail, flip the phase detect when we roll-over.
595 		 * doorbell every 1/4 queue and at the end of the loop.
596 		 */
597 		if (++comq->comq_tail == comq->nqe) {
598 			comq->comq_tail = 0;
599 			comq->phase ^= NVME_COMQ_STATUS_PHASE;
600 		}
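
		/*
		 * Phase example (illustrative): comq->phase starts equal to
		 * NVME_COMQ_STATUS_PHASE, so on the first pass entries the
		 * controller has written (phase bit set) are consumed and
		 * still-zero entries terminate the loop above.  After
		 * comq_tail wraps, comq->phase flips and the meanings
		 * reverse, so entries left over from the previous pass are
		 * never re-processed.
		 */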
601 
602 		/*
603 		 * WARNING! I imploded the chip by reusing a command id
604 		 *	    before it was discarded in the completion queue
605 		 *	    via the doorbell, so for now we always write
606 		 *	    the doorbell before marking the request as
607 		 *	    COMPLETED (it can be reused instantly upon
608 		 *	    being marked).
609 		 */
610 #if 0
611 		if (++didwork == (comq->nqe >> 2)) {
612 			didwork = 0;
613 			nvme_write(comq->sc, comq->comq_doorbell_reg,
614 				   comq->comq_tail);
615 		}
616 #endif
617 		cpu_lfence();	/* needed prior to content check */
618 
619 		/*
620 		 * Locate the request and related submission queue.  The
621 		 * request could be on a different queue.  A submission
622 		 * queue can have only one completion queue, so we can
623 		 * update subq_head without locking the submission queue.
624 		 */
625 		subq = &sc->subqueues[res->tail.subq_id];
626 		subq->subq_head = res->tail.subq_head_ptr;
627 		req = &subq->reqary[res->tail.cmd_id];
628 
629 		/*
630 		 * Copy the fields and wakeup anyone waiting on req.
631 		 * The response field in the completion queue can be reused
632 		 * once we doorbell which is why we make a copy.
633 		 */
634 		KKASSERT(req->state == NVME_REQ_SUBMITTED &&
635 			 req->comq == comq);
636 		req->res = *res;
637 		nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
638 		cpu_sfence();
639 		req->state = NVME_REQ_COMPLETED;
640 		if (req->callback) {
641 			req->callback(req, lk);
642 		} else if (req->waiting) {
643 			wakeup(req);
644 		}
645 	}
646 #if 0
647 	if (didwork)
648 		nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
649 #endif
650 }
651 
652 /*
653  * Core interrupt handler (called from dedicated interrupt thread, possibly
654  * preempts other threads).
655  *
656  * NOTE: For pin-based level interrupts, the chipset interrupt is cleared
657  *	 automatically once all the head doorbells are updated.  However,
658  *	 most chipsets assume MSI-X will be used and MAY NOT IMPLEMENT
659  *	 pin-based interrupts properly.  I found the BPX card, for example,
660  *	 is unable to clear a pin-based interrupt.
661  */
662 void
663 nvme_intr(void *arg)
664 {
665 	nvme_comqueue_t *comq = arg;
666 	nvme_softc_t *sc;
667 	int i;
668 	int skip;
669 
670 	/*
671 	 * Process all completion queues associated with this vector.  The
672 	 * interrupt is masked in the APIC.  Do NOT mess with the NVMe
673 	 * masking registers because (1) We don't need to and it wastes time,
674 	 * and (2) We aren't supposed to touch them if using MSI-X anyway.
675 	 */
676 	sc = comq->sc;
677 	if (sc->nirqs == 1)
678 		skip = 1;
679 	else
680 		skip = sc->nirqs - 1;
681 
682 	for (i = comq->qid; i <= sc->niocomqs; i += skip) {
683 		if (comq->nqe) {
684 			lockmgr(&comq->lk, LK_EXCLUSIVE);
685 			nvme_poll_completions(comq, &comq->lk);
686 			lockmgr(&comq->lk, LK_RELEASE);
687 		}
688 		comq += skip;
689 	}
690 }
691 
692 /*
693  * ADMIN HELPER COMMAND ROLLUP FUNCTIONS
694  */
695 /*
696  * Issue command to create a submission queue.
697  */
698 int
699 nvme_create_subqueue(nvme_softc_t *sc, uint16_t qid)
700 {
701 	nvme_request_t *req;
702 	nvme_subqueue_t *subq = &sc->subqueues[qid];
703 	int status;
704 
705 	req = nvme_get_admin_request(sc, NVME_OP_CREATE_SUBQ);
706 	req->cmd.head.prp1 = subq->psubq;
707 	req->cmd.crsub.subq_id = qid;
708 	req->cmd.crsub.subq_size = subq->nqe - 1;	/* 0's based value */
709 	req->cmd.crsub.flags = NVME_CREATESUB_PC | NVME_CREATESUB_PRI_URG;
710 	req->cmd.crsub.comq_id = subq->comqid;
711 
712 	nvme_submit_request(req);
713 	status = nvme_wait_request(req, hz);
714 	nvme_put_request(req);
715 
716 	return status;
717 }
718 
719 /*
720  * Issue command to create a completion queue.
721  */
722 int
723 nvme_create_comqueue(nvme_softc_t *sc, uint16_t qid)
724 {
725 	nvme_request_t *req;
726 	nvme_comqueue_t *comq = &sc->comqueues[qid];
727 	int status;
728 	int error;
729 	uint16_t ivect;
730 
731 	error = 0;
732 	if (sc->nirqs > 1) {
733 		ivect = 1 + (qid - 1) % (sc->nirqs - 1);
734 		if (qid && ivect == qid) {
735 			error = bus_setup_intr(sc->dev, sc->irq[ivect],
736 						INTR_MPSAFE | INTR_HIFREQ,
737 						nvme_intr,
738 						&sc->comqueues[ivect],
739 						&sc->irq_handle[ivect],
740 						NULL);
741 		}
742 	} else {
743 		ivect = 0;
744 	}
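	/*
	 * Distribution example (illustrative): with sc->nirqs == 4 (one
	 * admin vector plus three I/O vectors) and eight I/O completion
	 * queues, the formula above assigns comqueues 1,4,7 to vector 1,
	 * 2,5,8 to vector 2, and 3,6 to vector 3, which is exactly the
	 * set that nvme_intr() walks for each vector via its skip stride.
	 */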
745 	if (error)
746 		return error;
747 
748 	req = nvme_get_admin_request(sc, NVME_OP_CREATE_COMQ);
749 	req->cmd.head.prp1 = comq->pcomq;
750 	req->cmd.crcom.comq_id = qid;
751 	req->cmd.crcom.comq_size = comq->nqe - 1;	/* 0's based value */
752 	req->cmd.crcom.ivect = ivect;
753 	req->cmd.crcom.flags = NVME_CREATECOM_PC | NVME_CREATECOM_IEN;
754 
755 	nvme_submit_request(req);
756 	status = nvme_wait_request(req, hz);
757 	nvme_put_request(req);
758 
759 	return status;
760 }
761 
762 /*
763  * Issue command to delete a submission queue.
764  */
765 int
766 nvme_delete_subqueue(nvme_softc_t *sc, uint16_t qid)
767 {
768 	nvme_request_t *req;
769 	/*nvme_subqueue_t *subq = &sc->subqueues[qid];*/
770 	int status;
771 
772 	req = nvme_get_admin_request(sc, NVME_OP_DELETE_SUBQ);
773 	req->cmd.head.prp1 = 0;
774 	req->cmd.delete.qid = qid;
775 
776 	nvme_submit_request(req);
777 	status = nvme_wait_request(req, hz);
778 	nvme_put_request(req);
779 
780 	return status;
781 }
782 
783 /*
784  * Issue command to delete a completion queue.
785  */
786 int
787 nvme_delete_comqueue(nvme_softc_t *sc, uint16_t qid)
788 {
789 	nvme_request_t *req;
790 	/*nvme_comqueue_t *comq = &sc->comqueues[qid];*/
791 	int status;
792 	uint16_t ivect;
793 
794 	req = nvme_get_admin_request(sc, NVME_OP_DELETE_COMQ);
795 	req->cmd.head.prp1 = 0;
796 	req->cmd.delete.qid = qid;
797 
798 	nvme_submit_request(req);
799 	status = nvme_wait_request(req, hz);
800 	nvme_put_request(req);
801 
802 	if (qid && sc->nirqs > 1) {
803 		ivect = 1 + (qid - 1) % (sc->nirqs - 1);
804 		if (ivect == qid) {
805 			bus_teardown_intr(sc->dev,
806 					  sc->irq[ivect],
807 					  sc->irq_handle[ivect]);
808 		}
809 	}
810 
811 	return status;
812 }
813 
814 /*
815  * Issue friendly shutdown to controller.
816  */
817 int
818 nvme_issue_shutdown(nvme_softc_t *sc)
819 {
820 	uint32_t reg;
821 	int base_ticks;
822 	int error;
823 
824 	/*
825 	 * Put us in shutdown
826 	 */
827 	reg = nvme_read(sc, NVME_REG_CONFIG);
828 	reg &= ~NVME_CONFIG_SHUT_MASK;
829 	reg |= NVME_CONFIG_SHUT_NORM;
830 	nvme_write(sc, NVME_REG_CONFIG, reg);
831 
832 	/*
833 	 * Wait up to 10 seconds for acknowledgement
834 	 */
835 	error = ENXIO;
836 	base_ticks = ticks;
837 	while ((int)(ticks - base_ticks) < 10 * 20) {
838 		reg = nvme_read(sc, NVME_REG_STATUS);
839 		if ((reg & NVME_STATUS_SHUT_MASK) == NVME_STATUS_SHUT_DONE) {
840 			error = 0;
841 			break;
842 		}
843 		nvme_os_sleep(50);	/* 50ms poll */
844 	}
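	/*
	 * Encoding note (per the NVMe spec): the shutdown status field is
	 * two bits -- 00b no shutdown, 01b shutdown processing, 10b
	 * shutdown complete -- hence the equality test against
	 * NVME_STATUS_SHUT_DONE above.
	 */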
845 	if (error)
846 		device_printf(sc->dev, "Unable to shutdown chip nicely\n");
847 	else
848 		device_printf(sc->dev, "Normal chip shutdown succeeded\n");
849 
850 	return error;
851 }
852 
853 /*
854  * Make space-padded string serial and model numbers more readable.
855  * Make space-padded serial and model number strings more readable.
856 size_t
857 string_cleanup(char *str, int domiddle)
858 {
859 	size_t i;
860 	size_t j;
861 	int atbeg = 1;
862 
863 	for (i = j = 0; str[i]; ++i) {
864 		if ((str[i] == ' ' || str[i] == '\r') &&
865 		    (atbeg || domiddle)) {
866 			continue;
867 		} else {
868 			atbeg = 0;
869 		}
870 		str[j] = str[i];
871 		++j;
872 	}
873 	while (domiddle == 0 && j > 0 && (str[j-1] == ' ' || str[j-1] == '\r'))
874 		--j;
875 	str[j] = 0;
876 	if (domiddle == 0) {
877 		for (j = 0; str[j]; ++j) {
878 			if (str[j] == ' ')
879 				str[j] = '_';
880 		}
881 	}
882 
883 	return j;
884 }
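
/*
 * Example (illustrative): for a controller-padded model string,
 *
 *	string_cleanup("  Example NVMe Model   ", 0) -> "Example_NVMe_Model"
 *	string_cleanup("  Example NVMe Model   ", 1) -> "ExampleNVMeModel"
 *
 * The return value is the length of the cleaned-up string.
 */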
885