/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Administration thread
 *
 * - Handles resetting, features, iteration of namespaces, and disk
 *   attachments.  Most admin operations are serialized by the admin thread.
 *
 * - Ioctls, as well as any BIOs which require more sophisticated
 *   processing, are handed to this thread.
 *
 * - Can freeze/resume other queues for various purposes.
 */

#include "nvme.h"

static void nvme_admin_thread(void *arg);
static int nvme_admin_state_identify_ctlr(nvme_softc_t *sc);
static int nvme_admin_state_make_queues(nvme_softc_t *sc);
static int nvme_admin_state_identify_ns(nvme_softc_t *sc);
static int nvme_admin_state_operating(nvme_softc_t *sc);
static int nvme_admin_state_failed(nvme_softc_t *sc);

/*
 * Start the admin thread and block until it says it is running.
 */
int
nvme_start_admin_thread(nvme_softc_t *sc)
{
	int error;

	lockinit(&sc->admin_lk, "admlk", 0, 0);
	sc->admin_signal = 0;

	error = bus_setup_intr(sc->dev, sc->irq[0], INTR_MPSAFE,
			       nvme_intr, &sc->comqueues[0],
			       &sc->irq_handle[0], NULL);
	if (error) {
		device_printf(sc->dev, "unable to install interrupt\n");
		return error;
	}
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	kthread_create(nvme_admin_thread, sc, &sc->admintd, "nvme_admin");
	while ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwbeg", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);

	return 0;
}

/*
 * Stop the admin thread and block until it says it is done.
 */
void
nvme_stop_admin_thread(nvme_softc_t *sc)
{
	uint32_t i;

	atomic_set_int(&sc->admin_signal, ADMIN_SIG_STOP);

	/*
	 * We have to wait for the admin thread to finish its probe
	 * before shutting it down.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	while ((sc->admin_signal & ADMIN_SIG_PROBED) == 0)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);

	/*
	 * Disconnect our disks while the admin thread is still running,
	 * ensuring that the poll works even if interrupts are broken.
	 * Otherwise we could deadlock in the devfs core.
	 */
	for (i = 0; i < NVME_MAX_NAMESPACES; ++i) {
		nvme_softns_t *nsc;

		if ((nsc = sc->nscary[i]) != NULL) {
			nvme_disk_detach(nsc);

			kfree(nsc, M_NVME);
			sc->nscary[i] = NULL;
		}
	}

	/*
	 * Ask the admin thread to shut down.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	wakeup(&sc->admin_signal);
	while (sc->admin_signal & ADMIN_SIG_RUNNING)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);
	if (sc->irq_handle[0]) {
		bus_teardown_intr(sc->dev, sc->irq[0], sc->irq_handle[0]);
		sc->irq_handle[0] = NULL;
	}
	lockuninit(&sc->admin_lk);

	/*
	 * The thread might be running on another cpu; give it time to
	 * actually exit before returning in case the caller is about to
	 * unload the module.  Otherwise this is not needed.
	 */
	nvme_os_sleep(1);
}

static
void
nvme_admin_thread(void *arg)
{
	nvme_softc_t *sc = arg;
	uint32_t i;

	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	atomic_set_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);

	sc->admin_func = nvme_admin_state_identify_ctlr;

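	/*
	 * Main loop: poll every configured completion queue, service any
	 * requeued disk I/O, and run the current state function.  The
	 * lksleep() below times out after hz ticks, so completions still
	 * get polled roughly once a second even if interrupts are not
	 * being delivered.
	 */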
	while ((sc->admin_signal & ADMIN_SIG_STOP) == 0) {
		for (i = 0; i <= sc->niocomqs; ++i) {
			nvme_comqueue_t *comq = &sc->comqueues[i];

			if (comq->nqe == 0)	/* not configured */
				continue;

			lockmgr(&comq->lk, LK_EXCLUSIVE);
			nvme_poll_completions(comq, &comq->lk);
			lockmgr(&comq->lk, LK_RELEASE);
		}
		if (sc->admin_signal & ADMIN_SIG_REQUEUE) {
			atomic_clear_int(&sc->admin_signal, ADMIN_SIG_REQUEUE);
			nvme_disk_requeues(sc);
		}
		if (sc->admin_func(sc) == 0 &&
		    (sc->admin_signal & ADMIN_SIG_RUN_MASK) == 0) {
			lksleep(&sc->admin_signal, &sc->admin_lk, 0,
				"nvidle", hz);
		}
	}

	/*
	 * Cleanup state.
	 *
	 * Note that we actually issue delete queue commands here.  The NVME
	 * spec says that for a normal shutdown the I/O queues should be
	 * deleted prior to issuing the shutdown in the CONFIG register.
	 */
	for (i = 1; i <= sc->niosubqs; ++i) {
		nvme_delete_subqueue(sc, i);
		nvme_free_subqueue(sc, i);
	}
	for (i = 1; i <= sc->niocomqs; ++i) {
		nvme_delete_comqueue(sc, i);
		nvme_free_comqueue(sc, i);
	}

	/*
	 * Signal that we are done.
	 */
	atomic_clear_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);
	lockmgr(&sc->admin_lk, LK_RELEASE);
}

/*
 * Identify the controller
 */
static
int
nvme_admin_state_identify_ctlr(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ctlr_data_t *rp;
	int status;
	uint64_t mempgsize;
	char serial[20+16];
	char model[40+16];

	/*
	 * Identify Controller
	 */
	mempgsize = NVME_CAP_MEMPG_MIN_GET(sc->cap);

	req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
	req->cmd.identify.cns = NVME_CNS_CTLR;
	req->cmd.identify.cntid = 0;
	bzero(req->info, sizeof(*req->info));
	nvme_submit_request(req);
	status = nvme_wait_request(req);
	/* XXX handle status */

	sc->idctlr = req->info->idctlr;
	nvme_put_request(req);

	rp = &sc->idctlr;

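	/*
	 * The controller's serial number and model number are fixed-width,
	 * space-padded ASCII fields (20 and 40 bytes) with no NUL
	 * terminator, so copy them into oversized zeroed buffers and strip
	 * the padding before printing.
	 */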
	KKASSERT(sizeof(sc->idctlr.serialno) == 20);
	KKASSERT(sizeof(sc->idctlr.modelno) == 40);
	bzero(serial, sizeof(serial));
	bzero(model, sizeof(model));
	bcopy(rp->serialno, serial, sizeof(rp->serialno));
	bcopy(rp->modelno, model, sizeof(rp->modelno));
	string_cleanup(serial, 0);
	string_cleanup(model, 0);

	device_printf(sc->dev, "Model %s BaseSerial %s nscount=%d\n",
		      model, serial, rp->ns_count);

	sc->admin_func = nvme_admin_state_make_queues;

	return 1;
}

/*
 * Request and create the I/O queues.  Figure out CPU mapping optimizations.
 */
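/*
 * Depending on how many queues the controller actually grants, fall back
 * through progressively simpler layouts: fully separated per-cpu priority
 * queues, one queue per cpu, shared priority-separated queues, a simple
 * read/write split, or a single shared queue.
 */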
static
int
nvme_admin_state_make_queues(nvme_softc_t *sc)
{
	nvme_request_t *req;
	uint16_t niosubqs;
	uint16_t niocomqs;
	uint32_t i;
	uint16_t qno;
	int status;
	int error;

	/*
	 * Calculate how many I/O queues (non-inclusive of admin queue)
	 * we want to have, up to 65535.  dw0 in the response returns the
	 * number of queues the controller gives us.  Submission and
	 * Completion queues are specified separately.
	 *
	 * This driver runs optimally with 4 submission queues and one
	 * completion queue per cpu (rdhipri, rdlopri, wrhipri, wrlopri),
	 * plus:
	 *
	 * +1 for dumps
	 * +1 for async events
	 */
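	/*
	 * For example, with ncpus = 4 this asks for 4*4+2 = 18 submission
	 * queues and 4+2 = 6 completion queues, clamped to NVME_MAX_QUEUES.
	 */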
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

	niosubqs = ncpus * 4 + 2;
	niocomqs = ncpus + 2;
	if (niosubqs > NVME_MAX_QUEUES)
		niosubqs = NVME_MAX_QUEUES;
	if (niocomqs > NVME_MAX_QUEUES)
		niocomqs = NVME_MAX_QUEUES;
	device_printf(sc->dev, "Request %u/%u queues, ", niosubqs, niocomqs);

	req->cmd.setfeat.flags = NVME_FID_NUMQUEUES;
	req->cmd.setfeat.numqs.nsqr = niosubqs - 1;	/* 0's based 0=1 */
	req->cmd.setfeat.numqs.ncqr = niocomqs - 1;	/* 0's based 0=1 */

	nvme_submit_request(req);

	/*
	 * Get the response and set our operating mode.
	 */
	status = nvme_wait_request(req);
	/* XXX handle status */

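	/*
	 * On success dw0 of the completion holds the 0's based counts the
	 * controller actually allocated: submission queues in bits 15:0,
	 * completion queues in bits 31:16.
	 */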
	if (status == 0) {
		sc->niosubqs = 1 + (req->res.setfeat.dw0 & 0xFFFFU);
		sc->niocomqs = 1 + ((req->res.setfeat.dw0 >> 16) & 0xFFFFU);
	} else {
		sc->niosubqs = 0;
		sc->niocomqs = 0;
	}
	kprintf("Returns %u/%u queues, ", sc->niosubqs, sc->niocomqs);

	nvme_put_request(req);

	if (sc->niosubqs >= ncpus * 4 + 2 && sc->niocomqs >= ncpus + 2) {
		/*
		 * If we got all the queues we wanted, do a full-bore setup
		 * of qmap[cpu][type].
		 *
		 * Remember that subq 0 / comq 0 is the admin queue.
		 */
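		/*
		 * Layout: subqueue 1 is reserved for dumps and subqueue 2
		 * for async events; each cpu then gets four consecutive
		 * subqueues of its own, all completing on that cpu's
		 * vector-assigned completion queue (cputovect[i]).
		 */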
		kprintf("optimal map\n");
		sc->dumpqno = 1;
		sc->eventqno = 2;
		sc->subqueues[1].comqid = 1;
		sc->subqueues[2].comqid = 2;
		qno = 3;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = sc->cputovect[i];

			KKASSERT(cpuqno != 0);
			sc->qmap[i][0] = qno + 0;
			sc->qmap[i][1] = qno + 1;
			sc->qmap[i][2] = qno + 2;
			sc->qmap[i][3] = qno + 3;
			sc->subqueues[qno + 0].comqid = cpuqno;
			sc->subqueues[qno + 1].comqid = cpuqno;
			sc->subqueues[qno + 2].comqid = cpuqno;
			sc->subqueues[qno + 3].comqid = cpuqno;
			qno += 4;
		}
		sc->niosubqs = ncpus * 4 + 2;
		sc->niocomqs = ncpus + 2;
	} else if (sc->niosubqs >= ncpus && sc->niocomqs >= ncpus) {
		/*
		 * We have enough to give each cpu its own submission
		 * and completion queue.
		 *
		 * leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("nominal map 1:1 cpu\n");
		sc->dumpqno = 0;
		sc->eventqno = 0;
		for (i = 0; i < ncpus; ++i) {
			qno = sc->cputovect[i];
			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno;
			sc->qmap[i][1] = qno;
			sc->qmap[i][2] = qno;
			sc->qmap[i][3] = qno;
			sc->subqueues[qno].comqid = qno;
		}
		sc->niosubqs = ncpus;
		sc->niocomqs = ncpus;
	} else if (sc->niosubqs >= 4 && sc->niocomqs >= 2) {
		/*
		 * We have enough queues to separate and prioritize reads
		 * and writes, but all cpus have to share the same submission
		 * queues.  Completion queues are split up between cpus
		 * as much as possible.
		 *
		 * leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("rw-sep map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = sc->cputovect[i];

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno + 0;	/* read lopri */
			sc->qmap[i][1] = qno + 1;	/* read hipri */
			sc->qmap[i][2] = qno + 2;	/* write lopri */
			sc->qmap[i][3] = qno + 3;	/* write hipri */
			if (i <= 0)
				sc->subqueues[qno + 0].comqid = cpuqno;
			if (i <= 1)
				sc->subqueues[qno + 1].comqid = cpuqno;
			if (i <= 2)
				sc->subqueues[qno + 2].comqid = cpuqno;
			if (i <= 3)
				sc->subqueues[qno + 3].comqid = cpuqno;
			/* do not increment qno */
		}
		sc->niosubqs = 6;
		sc->niocomqs = 3;
	} else if (sc->niosubqs >= 2) {
		/*
		 * We have enough to have separate read and write queues.
		 */
		kprintf("basic map\n");
		qno = 1;
		sc->dumpqno = 0;
		sc->eventqno = 0;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = sc->cputovect[i];

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno + 0;	/* read lopri */
			sc->qmap[i][1] = qno + 0;	/* read hi pri */
			sc->qmap[i][2] = qno + 1;	/* write lopri */
			sc->qmap[i][3] = qno + 1;	/* write hi pri */
			if (i <= 0)
				sc->subqueues[qno + 0].comqid = cpuqno;
			if (i <= 1)
				sc->subqueues[qno + 1].comqid = cpuqno;
		}
		sc->niosubqs = 2;
		sc->niocomqs = 1;
	} else {
		/*
		 * Minimal configuration: all cpus and I/O types use the
		 * same queue.  Sad day.
		 */
		kprintf("minimal map\n");
		sc->dumpqno = 0;
		sc->eventqno = 0;
		for (i = 0; i < ncpus; ++i) {
			sc->qmap[i][0] = 1;
			sc->qmap[i][1] = 1;
			sc->qmap[i][2] = 1;
			sc->qmap[i][3] = 1;
		}
		sc->subqueues[1].comqid = 1;
		sc->niosubqs = 1;
		sc->niocomqs = 1;
	}

	/*
	 * Create all I/O submission and completion queues.  The I/O
	 * queues start at 1 and are inclusive of niosubqs and niocomqs.
	 *
	 * NOTE: Completion queues must be created before submission queues.
	 *	 That is, the completion queue specified when creating a
	 *	 submission queue must already exist.
	 */
	error = 0;
	for (i = 1; i <= sc->niocomqs; ++i) {
		error += nvme_alloc_comqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate comqs\n");
			break;
		}
		error += nvme_create_comqueue(sc, i);
	}
	for (i = 1; i <= sc->niosubqs; ++i) {
		error += nvme_alloc_subqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate subqs\n");
			break;
		}
		error += nvme_create_subqueue(sc, i);
	}

	if (error) {
		device_printf(sc->dev, "Failed to initialize device!\n");
		sc->admin_func = nvme_admin_state_failed;
	} else {
		sc->admin_func = nvme_admin_state_identify_ns;
	}

	return 1;
}

/*
 * Identify available namespaces, iterate, and attach to disks.
 */
static
int
nvme_admin_state_identify_ns(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_nslist_data_t *rp;
	int status;
	uint32_t i;
	int j;

	/*
	 * Identify Namespace List
	 */
	req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
	req->cmd.identify.cns = NVME_CNS_ACT_NSLIST;
	req->cmd.identify.cntid = 0;
	bzero(req->info, sizeof(*req->info));
	nvme_submit_request(req);
	status = nvme_wait_request(req);
	/* XXX handle status */

	sc->nslist = req->info->nslist;
	nvme_put_request(req);

	/*
	 * Identify each Namespace
	 */
	rp = &sc->nslist;
	for (i = 0; i < sc->idctlr.ns_count; ++i) {
		nvme_softns_t *nsc;
		nvme_lba_fmt_data_t *lbafmt;

		if (rp->nids[i] == 0)
			continue;

		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NS;
		req->cmd.identify.cntid = 0;
		req->cmd.identify.head.nsid = rp->nids[i];
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req);
		if (status != 0) {
			nvme_put_request(req);
			continue;
		}

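		/*
		 * Find the nscary[] slot already associated with this
		 * nsid, or pick a free one: prefer index i, otherwise
		 * scan downward from the top for an unused entry.
		 */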
		for (j = 0; j < NVME_MAX_NAMESPACES; ++j) {
			if (sc->nscary[j] &&
			    sc->nscary[j]->nsid == rp->nids[i])
				break;
		}
		if (j == NVME_MAX_NAMESPACES) {
			j = i;
			if (sc->nscary[j] != NULL) {
				for (j = NVME_MAX_NAMESPACES - 1; j >= 0; --j) {
					if (sc->nscary[j] == NULL)
						break;
				}
			}
		}
		if (j < 0) {
			device_printf(sc->dev, "not enough room in nscary for "
					       "namespace %08x\n", rp->nids[i]);
			nvme_put_request(req);
			continue;
		}
		nsc = sc->nscary[j];
		if (nsc == NULL) {
			nsc = kmalloc(sizeof(*nsc), M_NVME, M_WAITOK | M_ZERO);
			nsc->unit = nvme_alloc_disk_unit();
			sc->nscary[j] = nsc;
		}
		if (sc->nscmax <= j)
			sc->nscmax = j + 1;
		nsc->sc = sc;
		nsc->nsid = rp->nids[i];
		nsc->state = NVME_NSC_STATE_UNATTACHED;
		nsc->idns = req->info->idns;
		bioq_init(&nsc->bioq);
		lockinit(&nsc->lk, "nvnsc", 0, 0);

		nvme_put_request(req);

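		/*
		 * FLBAS selects which of the namespace's LBA formats is in
		 * effect; sect_size is the log2 of the sector size, so the
		 * block size is a power of two.
		 */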
		j = NVME_FLBAS_SEL_GET(nsc->idns.flbas);
		lbafmt = &nsc->idns.lba_fmt[j];
		nsc->blksize = 1 << lbafmt->sect_size;

		/*
		 * Attach the namespace
		 */
		nvme_disk_attach(nsc);
	}

	sc->admin_func = nvme_admin_state_operating;
	return 1;
}

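/*
 * Normal operating state.  Flag the probe phase as complete so a pending
 * nvme_stop_admin_thread() can proceed, then idle.
 */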
static
int
nvme_admin_state_operating(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}

	return 0;
}

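/*
 * Failed state.  Also flags the probe phase as complete so a detach does
 * not block forever waiting for ADMIN_SIG_PROBED.
 */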
static
int
nvme_admin_state_failed(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}

	return 0;
}