xref: /dragonfly/sys/dev/disk/nvme/nvme.c (revision 7ff0fc30)
1 /*
2  * Copyright (c) 2016-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Most low-level chip related functions (other than attachment) reside in
36  * this module.  Most functions assume that the caller is already holding
37  * appropriate locks to prevent SMP collisions.
38  */
39 
40 #include "nvme.h"
41 
42 MALLOC_DEFINE(M_NVME, "NVMe Driver", "NVME");
43 
44 /*
45  * DMA mapping callbacks.
46  */
47 static
48 void
49 nvme_dmamem_saveseg(void *info, bus_dma_segment_t *segs, int nsegs, int error)
50 {
51 	KKASSERT(error == 0);
52 	KKASSERT(nsegs == 1);
53 	*(bus_addr_t *)info = segs->ds_addr;
54 }
55 
56 /*
57  * Low-level chip enable/disable.
58  */
59 int
60 nvme_enable(nvme_softc_t *sc, int enable)
61 {
62 	uint32_t reg;
63 	int error = 0;
64 	int base_ticks;
65 
66 	reg = nvme_read(sc, NVME_REG_CONFIG);
67 	if (enable == 0 && (reg & NVME_CONFIG_EN)) {
68 		/*
69 		 * Disable the chip so we can program it.
70 		 */
71 		reg &= ~NVME_CONFIG_EN;
72 		nvme_write(sc, NVME_REG_CONFIG, reg);
73 	} else if (enable && (reg & NVME_CONFIG_EN) == 0) {
74 		/*
75 		 * Enable the chip once programmed.
76 		 */
77 		reg |= NVME_CONFIG_EN;
78 		nvme_write(sc, NVME_REG_CONFIG, reg);
79 	}
80 	error = ENXIO;
81 	base_ticks = ticks;
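	/*
	 * Wait for the controller to acknowledge the transition.  Per the
	 * NVMe spec, CSTS.RDY tracks CC.EN: it is set once an enable has
	 * taken effect and cleared once a disable/reset has completed.
	 */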
82 	while ((int)(ticks - base_ticks) < sc->entimo) {
83 		reg = nvme_read(sc, NVME_REG_STATUS);
84 		if (enable == 0 && (reg & NVME_STATUS_RDY) == 0) {
85 			error = 0;
86 			break;
87 		}
88 		if (enable && (reg & NVME_STATUS_RDY)) {
89 			error = 0;
90 			break;
91 		}
92 		nvme_os_sleep(50);	/* 50ms poll */
93 	}
94 
95 	/*
96 	 * Interrupt masking, only applicable when MSI-X is not used (NVMe spec
97 	 * 3.1.3 and 3.1.4: these registers must not be accessed with MSI-X).
98 	 */
99 	if (error == 0 && sc->nirqs == 1) {
100 		if (enable) {
101 			nvme_write(sc, NVME_REG_INTSET, ~1);
102 			nvme_write(sc, NVME_REG_INTCLR, 1);
103 		} else {
104 			nvme_write(sc, NVME_REG_INTSET, ~1);
105 		}
106 	}
107 
108 	if (error) {
109 		device_printf(sc->dev, "Cannot %s device\n",
110 			      (enable ? "enable" : "disable"));
111 	} else {
112 #if 0
113 		kprintf("gratuitous 15 second sleep\n");
114 		nvme_os_sleep(15000);
115 		kprintf("gratuitous 15 second sleep done\n");
116 #endif
117 	}
118 	return error;
119 }
120 
121 /*
122  * Allocate submission and completion queues.  If qid is 0 we are allocating
123  * the ADMIN queues, otherwise we are allocating I/O queues.
124  */
125 int
126 nvme_alloc_subqueue(nvme_softc_t *sc, uint16_t qid)
127 {
128 	nvme_subqueue_t *queue = &sc->subqueues[qid];
129 	int error = 0;
130 
131 	/*
132 	 * For now, use the maximum queue size negotiated during the
133 	 * attach.
134 	 */
135 	lockinit(&queue->lk, "nvqlk", 0, 0);
136 	queue->sc = sc;
137 	queue->nqe = sc->maxqe;
138 	queue->qid = qid;
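	/*
	 * Each queue has its own doorbell register; the spacing between
	 * doorbells is determined by the controller's doorbell stride.
	 */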
139 	queue->subq_doorbell_reg = NVME_REG_SUBQ_BELL(qid, sc->dstrd4);
140 
141 	/*
142 	 * dma memory for the submission queue
143 	 */
144 	if (error == 0) {
145 		error = bus_dmamem_alloc(sc->sque_tag, (void **)&queue->ksubq,
146 					 BUS_DMA_ZERO, &queue->sque_map);
147 	}
148 	if (error == 0) {
149 		error = bus_dmamap_load(sc->sque_tag, queue->sque_map,
150 					queue->ksubq,
151 					bus_dma_tag_getmaxsize(sc->sque_tag),
152 					nvme_dmamem_saveseg, &queue->psubq,
153 					0);
154 	}
155 
156 	/*
157 	 * dma memory for enough PRPs to map MAXPHYS bytes of memory per
158 	 * request.  A MAXPHYS buffer whose start is not page-aligned can
159 	 * still be accommodated because we have an
160 	 * additional PRP entry in cmd.head.
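	 * (Illustrative arithmetic, assuming 4KB pages and a 128KB MAXPHYS:
	 * a misaligned transfer can touch up to 33 pages, so prp1 covers the
	 * partial first page and the remaining 32 page-aligned entries go
	 * into this per-request PRP table.)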
161 	 */
162 	if (error == 0) {
163 		error = bus_dmamem_alloc(sc->prps_tag, (void **)&queue->kprps,
164 					 BUS_DMA_ZERO, &queue->prps_map);
165 	}
166 	if (error == 0) {
167 		error = bus_dmamap_load(sc->prps_tag, queue->prps_map,
168 					queue->kprps,
169 					bus_dma_tag_getmaxsize(sc->prps_tag),
170 					nvme_dmamem_saveseg, &queue->pprps,
171 					0);
172 	}
173 
174 	/*
175 	 * dma memory for admin data
176 	 */
177 	if (qid == 0 && error == 0) {
178 		error = bus_dmamem_alloc(sc->adm_tag,
179 					 (void **)&queue->kdatapgs,
180 					 BUS_DMA_ZERO, &queue->adm_map);
181 	}
182 	if (qid == 0 && error == 0) {
183 		error = bus_dmamap_load(sc->adm_tag, queue->adm_map,
184 					queue->kdatapgs,
185 					bus_dma_tag_getmaxsize(sc->adm_tag),
186 					nvme_dmamem_saveseg, &queue->pdatapgs,
187 					0);
188 	}
189 
190 	/*
191 	 * Driver request structures
192 	 */
193 	if (error == 0) {
194 		nvme_request_t *req;
195 		uint32_t i;
196 
197 		queue->reqary = kmalloc(sizeof(nvme_request_t) * queue->nqe,
198 					M_NVME, M_WAITOK | M_ZERO);
199 		for (i = 0; i < queue->nqe; ++i) {
200 			req = &queue->reqary[i];
201 			if (i == 0) {
202 				/*
203 				 * Set aside one request for dump operation
204 				 */
205 				queue->dump_req = req;
206 			} else {
207 				/*
208 				 * The rest go through the normal list
209 				 */
210 				req->next_avail = queue->first_avail;
211 				queue->first_avail = req;
212 			}
213 			req->subq = queue;
214 			req->comq = &sc->comqueues[queue->comqid];
215 			req->cmd_id = i;
216 			if (qid == 0) {
217 				req->info = &queue->kdatapgs[i];
218 				req->pinfo = queue->pdatapgs +
219 					     i * sizeof(nvme_admin_data_t);
220 			}
221 		}
222 	}
223 
224 	/*
225 	 * Error handling
226 	 */
227 	if (error)
228 		nvme_free_subqueue(sc, qid);
229 	return error;
230 }
231 
232 int
233 nvme_alloc_comqueue(nvme_softc_t *sc, uint16_t qid)
234 {
235 	nvme_comqueue_t *queue = &sc->comqueues[qid];
236 	int error = 0;
237 
238 	/*
239 	 * For now, use the maximum queue size negotiated during the
240 	 * attach.
241 	 */
242 	lockinit(&queue->lk, "nvqlk", 0, 0);
243 	queue->sc = sc;
244 	queue->qid = qid;
245 	queue->phase = NVME_COMQ_STATUS_PHASE;
246 	queue->comq_doorbell_reg = NVME_REG_COMQ_BELL(qid, sc->dstrd4);
247 
248 	if (error == 0) {
249 		error = bus_dmamem_alloc(sc->cque_tag, (void **)&queue->kcomq,
250 					 BUS_DMA_ZERO, &queue->cque_map);
251 	}
252 	if (error == 0) {
253 		error = bus_dmamap_load(sc->cque_tag, queue->cque_map,
254 					queue->kcomq,
255 					bus_dma_tag_getmaxsize(sc->cque_tag),
256 					nvme_dmamem_saveseg, &queue->pcomq,
257 					0);
258 	}
259 
260 	/*
261 	 * Set nqe last.  The comq polling loop tests this field and we
262 	 * do not want it to spuriously assume that the comq is initialized
263 	 * until it actually is.
264 	 */
265 	if (error == 0)
266 		queue->nqe = sc->maxqe;
267 
268 	if (error)
269 		nvme_free_comqueue(sc, qid);
270 	return error;
271 }
272 
273 void
274 nvme_free_subqueue(nvme_softc_t *sc, uint16_t qid)
275 {
276 	nvme_subqueue_t *queue = &sc->subqueues[qid];
277 
278 	queue->first_avail = NULL;
279 	if (queue->reqary) {
280 		kfree(queue->reqary, M_NVME);
281 		queue->reqary = NULL;
282 	}
283 	if (queue->ksubq) {
284 		bus_dmamem_free(sc->sque_tag, queue->ksubq, queue->sque_map);
285 		bus_dmamap_unload(sc->sque_tag, queue->sque_map);
286 		bus_dmamap_destroy(sc->sque_tag, queue->sque_map);
287 	}
288 	if (queue->kprps) {
289 		bus_dmamem_free(sc->prps_tag, queue->kprps, queue->prps_map);
290 		bus_dmamap_unload(sc->prps_tag, queue->prps_map);
291 		bus_dmamap_destroy(sc->prps_tag, queue->prps_map);
292 	}
293 	if (queue->kdatapgs) {
294 		bus_dmamem_free(sc->adm_tag, queue->kdatapgs, queue->adm_map);
295 		bus_dmamap_unload(sc->adm_tag, queue->adm_map);
296 		bus_dmamap_destroy(sc->adm_tag, queue->adm_map);
297 	}
298 	bzero(queue, sizeof(*queue));
299 }
300 
301 void
302 nvme_free_comqueue(nvme_softc_t *sc, uint16_t qid)
303 {
304 	nvme_comqueue_t *queue = &sc->comqueues[qid];
305 
306 	/*
307 	 * Clear this field first so poll loops ignore the comq.
308 	 */
309 	queue->nqe = 0;
310 
311 	if (queue->kcomq) {
312 		bus_dmamem_free(sc->cque_tag, queue->kcomq, queue->cque_map);
313 		bus_dmamap_unload(sc->cque_tag, queue->cque_map);
314 		bus_dmamap_destroy(sc->cque_tag, queue->cque_map);
315 	}
316 	bzero(queue, sizeof(*queue));
317 }
318 
319 /*
320  * ADMIN AND I/O REQUEST HANDLING
321  */
322 
323 /*
324  * Obtain an ADMIN queue request with prp1 pointing at the request's
325  * pre-allocated admin data page.  Fields in cmd.head will be initialized
326  * and remaining fields will be zero'd.  Caller fills in the rest.
327  *
328  * Caller must NOT hold the queue lock (nvme_get_request() acquires it).
329  */
330 nvme_request_t *
331 nvme_get_admin_request(nvme_softc_t *sc, uint8_t opcode)
332 {
333 	nvme_request_t *req;
334 
335 	req = nvme_get_request(&sc->subqueues[0], opcode, NULL, 0);
336 	req->cmd.head.prp1 = req->pinfo;
337 	req->callback = NULL;
338 
339 	return req;
340 }
341 
346 static __inline
347 void
348 _nvme_fill_request(nvme_subqueue_t *queue, uint8_t opcode,
349 		   char *kva, size_t bytes,
350 		   nvme_request_t *req)
351 {
352 	/*
353 	 * Fill-in basic fields and do the DMA mapping.
354 	 */
355 	req->next_avail = NULL;
356 	KKASSERT(req->state == NVME_REQ_AVAIL);
357 	req->state = NVME_REQ_ALLOCATED;
358 	req->callback = NULL;
359 	req->waiting = 0;
360 
361 	req->cmd.head.opcode = opcode;
362 	req->cmd.head.flags = NVME_SUBQFLG_PRP | NVME_SUBQFLG_NORM;
363 	req->cmd.head.cid = req->cmd_id;
364 	req->cmd.head.nsid = 0;
365 	req->cmd.head.mptr = 0;
366 	req->cmd.head.prp1 = 0;
367 	req->cmd.head.prp2 = 0;
368 	req->cmd.dw10 = 0;
369 	req->cmd.dw11 = 0;
370 	req->cmd.dw12 = 0;
371 	req->cmd.dw13 = 0;
372 	req->cmd.dw14 = 0;
373 	req->cmd.dw15 = 0;
374 
375 	if (kva) {
376 		size_t count = 0;
377 		size_t idx = 0;
378 		vm_paddr_t paddr;
379 		vm_paddr_t pprptab;
380 		uint64_t *kprptab;
381 		KKASSERT(bytes >= 0 && bytes <= MAXPHYS);
382 
383 		kprptab = queue->kprps +
384 			  (MAXPHYS / PAGE_SIZE) * req->cmd_id;
385 		pprptab = queue->pprps +
386 			  (MAXPHYS / PAGE_SIZE) * req->cmd_id *
387 			  sizeof(uint64_t);
388 
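		/*
		 * Build the PRPs: prp1 takes the (possibly unaligned) first
		 * chunk up to the end of its page.  If the remainder fits in
		 * a single additional page, prp2 points directly at that
		 * page; otherwise prp2 points at the per-request PRP table
		 * and the remaining page-aligned entries are filled in there.
		 */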
389 		while (count < bytes) {
390 			paddr = vtophys(kva + count);
391 			if (idx == 0) {
392 				KKASSERT((paddr & 3) == 0);
393 				req->cmd.head.prp1 = paddr;
394 				count += (((intptr_t)kva + PAGE_SIZE) &
395 					  ~(intptr_t)PAGE_MASK) -
396 					 (intptr_t)kva;
397 			} else if (idx == 1 && count + PAGE_SIZE >= bytes) {
398 				KKASSERT((paddr & PAGE_MASK) == 0);
399 				req->cmd.head.prp2 = paddr;
400 				count += PAGE_SIZE;
401 			} else {
402 				KKASSERT((paddr & PAGE_MASK) == 0);
403 				/* if (idx == 1) -- not needed, just repeat */
404 				req->cmd.head.prp2 = pprptab; /* repeat */
405 				kprptab[idx - 1] = paddr;
406 				count += PAGE_SIZE;
407 			}
408 			++idx;
409 		}
410 	}
411 }
412 
413 
414 /*
415  * Obtain a request and handle DMA mapping the supplied kernel buffer.
416  * Fields in cmd.head will be initialized and remaining fields will be zero'd.
417  * Caller is responsible for filling in remaining fields as appropriate.
418  *
419  * May return NULL if no requests are available or if there is no room in
420  * the submission queue to handle it (this should only happen on an I/O
421  * queue; admin queue operations are managed).
422  *
423  * Caller should NOT hold the queue lock.
424  */
425 nvme_request_t *
426 nvme_get_request(nvme_subqueue_t *queue, uint8_t opcode,
427 		 char *kva, size_t bytes)
428 {
429 	nvme_request_t *req;
430 	nvme_request_t *next;
431 
432 	/*
433 	 * No easy lockless way to pull a new request off.  We have to check
434 	 * for a number of conditions and there may be multiple threads
435 	 * making this call simultaneously, which complicates matters even
436 	 * more.
437 	 */
438 	lockmgr(&queue->lk, LK_EXCLUSIVE);
439 
440 	/*
441 	 * Make sure the submission queue has room to accommodate the
442 	 * request.  Requests can be completed out of order so the
443 	 * submission ring could still be full even though we have
444 	 * requests available.
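	 * The ring is full when the tail, advanced past every request that
	 * has been handed out but not yet submitted plus the one we are
	 * about to hand out, would land back on the head.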
445 	 */
446 	if ((queue->subq_tail + queue->unsubmitted + 1) % queue->nqe ==
447 	    queue->subq_head) {
448 		lockmgr(&queue->lk, LK_RELEASE);
449 		KKASSERT(queue->qid != 0);
450 		atomic_swap_int(&queue->signal_requeue, 1);
451 
452 		return NULL;
453 	}
454 
455 	/*
456 	 * Pop the next available request off of the first_avail linked
457 	 * list.  An atomic op must be used here because nvme_put_request()
458 	 * returns requests to the list without holding queue->lk.
459 	 */
460 	for (;;) {
461 		req = queue->first_avail;
462 		cpu_ccfence();
463 		if (req == NULL) {
464 			lockmgr(&queue->lk, LK_RELEASE);
465 			KKASSERT(queue->qid != 0);
466 			atomic_swap_int(&queue->signal_requeue, 1);
467 
468 			return NULL;
469 		}
470 		next = req->next_avail;
471 		if (atomic_cmpset_ptr(&queue->first_avail, req, next))
472 			break;
473 	}
474 
475 	/*
476 	 * We have to keep track of unsubmitted requests in order to be
477 	 * able to properly check whether the ring is full or not (check
478 	 * is done at the top of this procedure, above).
479 	 */
480 	++queue->unsubmitted;
481 	lockmgr(&queue->lk, LK_RELEASE);
482 
483 	_nvme_fill_request(queue, opcode, kva, bytes, req);
484 
485 	return req;
486 }
487 
488 /*
489  * dump path only, cannot block.  Allow the lock to fail and bump
490  * queue->unsubmitted anyway.
491  */
492 nvme_request_t *
493 nvme_get_dump_request(nvme_subqueue_t *queue, uint8_t opcode,
494 		 char *kva, size_t bytes)
495 {
496 	nvme_request_t *req;
497 	int error;
498 
499 	error = lockmgr(&queue->lk, LK_EXCLUSIVE | LK_NOWAIT);
500 	req = queue->dump_req;
501 	++queue->unsubmitted;
502 	if (error == 0)
503 		lockmgr(&queue->lk, LK_RELEASE);
504 	_nvme_fill_request(queue, opcode, kva, bytes, req);
505 
506 	return req;
507 }
508 
509 /*
510  * Submit request for execution.  This will doorbell the subq.
511  *
512  * Caller must hold the queue lock.
513  */
514 void
515 nvme_submit_request(nvme_request_t *req)
516 {
517 	nvme_subqueue_t *queue = req->subq;
518 	nvme_allcmd_t *cmd;
519 
520 	cmd = &queue->ksubq[queue->subq_tail];
521 	--queue->unsubmitted;
522 	if (++queue->subq_tail == queue->nqe)
523 		queue->subq_tail = 0;
524 	KKASSERT(queue->subq_tail != queue->subq_head);
525 	*cmd = req->cmd;
526 	cpu_sfence();	/* needed? */
527 	req->state = NVME_REQ_SUBMITTED;
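	/*
	 * Writing the new tail index to the submission doorbell tells the
	 * controller that entries up to (but not including) subq_tail are
	 * ready to be fetched.
	 */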
528 	nvme_write(queue->sc, queue->subq_doorbell_reg, queue->subq_tail);
529 }
530 
531 /*
532  * Wait for a request to complete.
533  *
534  * Caller does not need to hold the queue lock.  The request's completion
535  * queue lock is acquired internally and is released across the sleep by
536  * lksleep().
537  */
538 int
539 nvme_wait_request(nvme_request_t *req)
540 {
541 	struct lock *lk;
542 	int code;
543 
544 	req->waiting = 1;
545 	if (req->state != NVME_REQ_COMPLETED) {
546 		lk = &req->comq->lk;
547 		cpu_lfence();
548 		lockmgr(lk, LK_EXCLUSIVE);
549 		while (req->state == NVME_REQ_SUBMITTED) {
550 			nvme_poll_completions(req->comq, lk);
551 			if (req->state != NVME_REQ_SUBMITTED)
552 				break;
553 			lksleep(req, lk, 0, "nvwait", hz);
554 		}
555 		lockmgr(lk, LK_RELEASE);
556 		KKASSERT(req->state == NVME_REQ_COMPLETED);
557 	}
558 	cpu_lfence();
559 	code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
560 
561 	return code;
562 }
563 
564 /*
565  * dump path only, we cannot block, and the lock is allowed
566  * to fail.  But still try to play nice with interrupt threads.
567  */
568 int
569 nvme_poll_request(nvme_request_t *req)
570 {
571 	struct lock *lk;
572 	int code;
573 	int didlock = 500;	/* 500uS max */
574 
575 	req->waiting = 1;
576 	if (req->state != NVME_REQ_COMPLETED) {
577 		lk = &req->comq->lk;
578 		cpu_lfence();
579 		while (lockmgr(lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
580 			if (--didlock == 0)
581 				break;
582 			tsc_delay(1000);	/* 1uS */
583 		}
584 		while (req->state == NVME_REQ_SUBMITTED) {
585 			nvme_poll_completions(req->comq, lk);
586 			if (req->state != NVME_REQ_SUBMITTED)
587 				break;
588 			lwkt_switch();
589 		}
590 		if (didlock)
591 			lockmgr(lk, LK_RELEASE);
592 		KKASSERT(req->state == NVME_REQ_COMPLETED);
593 	}
594 	cpu_lfence();
595 	code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
596 
597 	return code;
598 }
599 
600 /*
601  * Put request away, making it available for reuse.  If this is an admin
602  * request, its auxiliary data page is also released for reuse.
603  *
604  * Caller does NOT have to hold the queue lock.
605  */
606 void
607 nvme_put_request(nvme_request_t *req)
608 {
609 	nvme_subqueue_t *queue = req->subq;
610 	nvme_request_t *next;
611 
612 	/*
613 	 * Insert on head for best cache reuse.
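	 * This is a lock-free LIFO push: re-read first_avail and retry the
	 * compare-and-set until no other cpu races an update in between.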
614 	 */
615 	KKASSERT(req->state == NVME_REQ_COMPLETED);
616 	req->state = NVME_REQ_AVAIL;
617 	for (;;) {
618 		next = queue->first_avail;
619 		cpu_ccfence();
620 		req->next_avail = next;
621 		if (atomic_cmpset_ptr(&queue->first_avail, next, req))
622 			break;
623 	}
624 
625 	/*
626 	 * If BIOs were deferred due to lack of request space, signal the
627 	 * admin thread to requeue them.  This is a bit messy and normally
628 	 * should not happen due to the large number of queue entries NVMe
629 	 * usually has.  Let it race for now (the admin thread has a 1hz tick).
630 	 */
631 	if (atomic_swap_int(&queue->signal_requeue, 0)) {
632 		atomic_set_int(&queue->sc->admin_signal, ADMIN_SIG_REQUEUE);
633 		wakeup(&queue->sc->admin_signal);
634 	}
635 }
636 
637 /*
638  * dump path only.
639  */
640 void
641 nvme_put_dump_request(nvme_request_t *req)
642 {
643 	KKASSERT(req->state == NVME_REQ_COMPLETED);
644 	req->state = NVME_REQ_AVAIL;
645 }
646 
647 /*
648  * Poll for completions on queue, copy the 16-byte hw result entry
649  * into the request and poke the doorbell to update the controller's
650  * understanding of comq_head.
651  *
652  * If lk is non-NULL it will be passed to the callback which typically
653  * releases it temporarily when calling biodone() or doing other complex
654  * work on the result.
655  *
656  * Caller must usually hold comq->lk.
657  */
658 void
659 nvme_poll_completions(nvme_comqueue_t *comq, struct lock *lk)
660 {
661 	nvme_softc_t *sc = comq->sc;
662 	nvme_request_t *req;
663 	nvme_subqueue_t *subq;
664 	nvme_allres_t *res;
665 #if 0
666 	int didwork = 0;
667 #endif
668 
669 	KKASSERT(comq->comq_tail < comq->nqe);
670 	cpu_lfence();		/* needed prior to first phase test */
671 	for (;;) {
672 		/*
673 		 * WARNING! LOCK MAY HAVE BEEN TEMPORARILY LOST DURING LOOP.
674 		 */
675 		res = &comq->kcomq[comq->comq_tail];
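		/*
		 * The controller writes each new entry with the current
		 * phase bit, which inverts every time the ring wraps, so an
		 * entry whose phase does not match the expected phase has
		 * not been written yet and we stop.
		 */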
676 		if ((res->tail.status ^ comq->phase) & NVME_COMQ_STATUS_PHASE)
677 			break;
678 
679 		/*
680 		 * Process result on completion queue.
681 		 *
682 		 * Bump comq_tail and flip the phase detect when we roll over.
683 		 * The batched doorbell updates below are disabled (see WARNING).
684 		 */
685 		if (++comq->comq_tail == comq->nqe) {
686 			comq->comq_tail = 0;
687 			comq->phase ^= NVME_COMQ_STATUS_PHASE;
688 		}
689 
690 		/*
691 		 * WARNING! I imploded the chip by reusing a command id
692 		 *	    before it was discarded in the completion queue
693 		 *	    via the doorbell, so for now we always write
694 		 *	    the doorbell before marking the request as
695 		 *	    COMPLETED (it can be reused instantly upon
696 		 *	    being marked).
697 		 */
698 #if 0
699 		if (++didwork == (comq->nqe >> 2)) {
700 			didwork = 0;
701 			nvme_write(comq->sc, comq->comq_doorbell_reg,
702 				   comq->comq_tail);
703 		}
704 #endif
705 		cpu_lfence();	/* needed prior to content check */
706 
707 		/*
708 		 * Locate the request and related submission queue.  The
709 		 * request could be on a different queue.  A submission
710 		 * queue can have only one completion queue, so we can
711 		 * update subq_head without locking the submission queue.
712 		 */
713 		subq = &sc->subqueues[res->tail.subq_id];
714 		subq->subq_head = res->tail.subq_head_ptr;
715 		req = &subq->reqary[res->tail.cmd_id];
716 
717 		/*
718 		 * Copy the fields and wakeup anyone waiting on req.
719 		 * The response field in the completion queue can be reused
720 		 * once we doorbell which is why we make a copy.
721 		 */
722 		KKASSERT(req->state == NVME_REQ_SUBMITTED &&
723 			 req->comq == comq);
724 		req->res = *res;
725 		nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
726 		cpu_sfence();
727 		req->state = NVME_REQ_COMPLETED;
728 		if (req->callback) {
729 			req->callback(req, lk);
730 		} else if (req->waiting) {
731 			wakeup(req);
732 		}
733 	}
734 #if 0
735 	if (didwork)
736 		nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
737 #endif
738 }
739 
740 /*
741  * Core interrupt handler (called from dedicated interrupt thread, possibly
742  * preempts other threads).
743  *
744  * NOTE: For pin-based level interrupts, the chipset interrupt is cleared
745  *	 automatically once all the head doorbells are updated.  However,
746  *	 most chipsets assume MSI-X will be used and MAY NOT IMPLEMENT
747  *	 pin-based interrupts properly.  I found the BPX card, for example,
748  *	 is unable to clear a pin-based interrupt.
749  */
750 void
751 nvme_intr(void *arg)
752 {
753 	nvme_comqueue_t *comq = arg;
754 	nvme_softc_t *sc;
755 	int i;
756 	int skip;
757 
758 	/*
759 	 * Process all completion queues associated with this vector.  The
760 	 * interrupt is masked in the APIC.  Do NOT mess with the NVMe
761 	 * masking registers because (1) We don't need to and it wastes time,
762 	 * and (2) We aren't supposed to touch them if using MSI-X anyway.
763 	 */
764 	sc = comq->sc;
765 	if (sc->nirqs == 1)
766 		skip = 1;
767 	else
768 		skip = sc->nirqs - 1;
769 
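	/*
	 * Completion queues were assigned round-robin to vectors in
	 * nvme_create_comqueue(), so each vector walks every skip'th comq
	 * starting at its own qid.
	 */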
770 	for (i = comq->qid; i <= sc->niocomqs; i += skip) {
771 		if (comq->nqe) {
772 			lockmgr(&comq->lk, LK_EXCLUSIVE);
773 			nvme_poll_completions(comq, &comq->lk);
774 			lockmgr(&comq->lk, LK_RELEASE);
775 		}
776 		comq += skip;
777 	}
778 }
779 
780 /*
781  * ADMIN HELPER COMMAND ROLLUP FUNCTIONS
782  */
783 /*
784  * Issue command to create a submission queue.
785  */
786 int
787 nvme_create_subqueue(nvme_softc_t *sc, uint16_t qid)
788 {
789 	nvme_request_t *req;
790 	nvme_subqueue_t *subq = &sc->subqueues[qid];
791 	int status;
792 
793 	req = nvme_get_admin_request(sc, NVME_OP_CREATE_SUBQ);
794 	req->cmd.head.prp1 = subq->psubq;
795 	req->cmd.crsub.subq_id = qid;
796 	req->cmd.crsub.subq_size = subq->nqe - 1;	/* 0's based value */
797 	req->cmd.crsub.flags = NVME_CREATESUB_PC | NVME_CREATESUB_PRI_URG;
798 	req->cmd.crsub.comq_id = subq->comqid;
799 
800 	nvme_submit_request(req);
801 	status = nvme_wait_request(req);
802 	nvme_put_request(req);
803 
804 	return status;
805 }
806 
807 /*
808  * Issue command to create a completion queue.
809  */
810 int
811 nvme_create_comqueue(nvme_softc_t *sc, uint16_t qid)
812 {
813 	nvme_request_t *req;
814 	nvme_comqueue_t *comq = &sc->comqueues[qid];
815 	int status;
816 	int error;
817 	uint16_t ivect;
818 
819 	error = 0;
820 	if (sc->nirqs > 1) {
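		/*
		 * Round-robin I/O queues onto MSI-X vectors 1..nirqs-1,
		 * leaving vector 0 for the admin queue.  The handler is
		 * installed the first time a vector is used, which is when
		 * the queue id equals the vector number.
		 */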
821 		ivect = 1 + (qid - 1) % (sc->nirqs - 1);
822 		if (qid && ivect == qid) {
823 			error = bus_setup_intr(sc->dev, sc->irq[ivect],
824 						INTR_MPSAFE | INTR_HIFREQ,
825 						nvme_intr,
826 						&sc->comqueues[ivect],
827 						&sc->irq_handle[ivect],
828 						NULL);
829 		}
830 	} else {
831 		ivect = 0;
832 	}
833 	if (error)
834 		return error;
835 
836 	req = nvme_get_admin_request(sc, NVME_OP_CREATE_COMQ);
837 	req->cmd.head.prp1 = comq->pcomq;
838 	req->cmd.crcom.comq_id = qid;
839 	req->cmd.crcom.comq_size = comq->nqe - 1;	/* 0's based value */
840 	req->cmd.crcom.ivect = ivect;
841 	req->cmd.crcom.flags = NVME_CREATECOM_PC | NVME_CREATECOM_IEN;
842 
843 	nvme_submit_request(req);
844 	status = nvme_wait_request(req);
845 	nvme_put_request(req);
846 
847 	/*
848 	 * Oops, the create failed; undo the IRQ setup
849 	 */
850 	if (sc->nirqs > 1 && status) {
851 		ivect = 1 + (qid - 1) % (sc->nirqs - 1);
852 		if (qid && ivect == qid) {
853 			bus_teardown_intr(sc->dev,
854 					  sc->irq[ivect],
855 					  sc->irq_handle[ivect]);
856 			sc->irq_handle[ivect] = NULL;
857 		}
858 	}
859 
860 	return status;
861 }
862 
863 /*
864  * Issue command to delete a submission queue.
865  */
866 int
867 nvme_delete_subqueue(nvme_softc_t *sc, uint16_t qid)
868 {
869 	nvme_request_t *req;
870 	/*nvme_subqueue_t *subq = &sc->subqueues[qid];*/
871 	int status;
872 
873 	req = nvme_get_admin_request(sc, NVME_OP_DELETE_SUBQ);
874 	req->cmd.head.prp1 = 0;
875 	req->cmd.delete.qid = qid;
876 
877 	nvme_submit_request(req);
878 	status = nvme_wait_request(req);
879 	nvme_put_request(req);
880 
881 	return status;
882 }
883 
884 /*
885  * Issue command to delete a completion queue.
886  */
887 int
888 nvme_delete_comqueue(nvme_softc_t *sc, uint16_t qid)
889 {
890 	nvme_request_t *req;
891 	nvme_comqueue_t *comq = &sc->comqueues[qid];
892 	int status;
893 	uint16_t ivect;
894 
895 	if (comq->sc == NULL)
896 		return 0;
897 
898 	req = nvme_get_admin_request(sc, NVME_OP_DELETE_COMQ);
899 	req->cmd.head.prp1 = 0;
900 	req->cmd.delete.qid = qid;
901 
902 	nvme_submit_request(req);
903 	status = nvme_wait_request(req);
904 	nvme_put_request(req);
905 
906 	if (qid && sc->nirqs > 1) {
907 		ivect = 1 + (qid - 1) % (sc->nirqs - 1);
908 		if (ivect == qid && sc->irq_handle[ivect]) {
909 			bus_teardown_intr(sc->dev,
910 					  sc->irq[ivect],
911 					  sc->irq_handle[ivect]);
912 			sc->irq_handle[ivect] = NULL;
913 		}
914 	}
915 
916 	return status;
917 }
918 
919 /*
920  * Issue friendly shutdown to controller.
921  */
922 int
923 nvme_issue_shutdown(nvme_softc_t *sc, int dopoll)
924 {
925 	uint32_t reg;
926 	int base_ticks;
927 	int error;
928 
929 	/*
930 	 * Put us in shutdown
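	 * by requesting a normal shutdown via the shutdown-notification bits
	 * in the config register; the controller reports progress in the
	 * status register's shutdown-state field.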
931 	 */
932 	reg = nvme_read(sc, NVME_REG_CONFIG);
933 	reg &= ~NVME_CONFIG_SHUT_MASK;
934 	reg |= NVME_CONFIG_SHUT_NORM;
935 	nvme_write(sc, NVME_REG_CONFIG, reg);
936 
937 	/*
938 	 * Wait up to 10 seconds for acknowledgement
939 	 */
940 	error = ENXIO;
941 	base_ticks = ticks;
942 	while ((int)(ticks - base_ticks) < 10 * 20) {
943 		reg = nvme_read(sc, NVME_REG_STATUS);
944 		if ((reg & NVME_STATUS_SHUT_MASK) & NVME_STATUS_SHUT_DONE) {
945 			error = 0;
946 			break;
947 		}
948 		if (dopoll == 0)
949 			nvme_os_sleep(50);	/* 50ms poll */
950 	}
951 	if (error)
952 		device_printf(sc->dev, "Unable to shutdown chip nicely\n");
953 	else
954 		device_printf(sc->dev, "Normal chip shutdown succeeded\n");
955 
956 	return error;
957 }
958 
959 /*
960  * Make space-padded serial and model number strings more readable.
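 * For example (hypothetical input), "  FooCorp X1   " becomes
 * "FooCorp_X1" when domiddle is 0, and "FooCorpX1" when domiddle is
 * non-zero.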
961  */
962 size_t
963 string_cleanup(char *str, int domiddle)
964 {
965 	size_t i;
966 	size_t j;
967 	int atbeg = 1;
968 
969 	for (i = j = 0; str[i]; ++i) {
970 		if ((str[i] == ' ' || str[i] == '\r') &&
971 		    (atbeg || domiddle)) {
972 			continue;
973 		} else {
974 			atbeg = 0;
975 		}
976 		str[j] = str[i];
977 		++j;
978 	}
979 	while (domiddle == 0 && j > 0 && (str[j-1] == ' ' || str[j-1] == '\r'))
980 		--j;
981 	str[j] = 0;
982 	if (domiddle == 0) {
983 		for (j = 0; str[j]; ++j) {
984 			if (str[j] == ' ')
985 				str[j] = '_';
986 		}
987 	}
988 
989 	return j;
990 }
991