xref: /freebsd/sys/dev/nvmf/host/nvmf.c (revision b985c9ca)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_disconnect_task(void *arg, int pending);

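/*
 * Completion plumbing for synchronous commands.  nvmf_complete() and
 * nvmf_io_complete() are used as the capsule and data-transfer
 * callbacks; each records its result in the shared
 * nvmf_completion_status under a pool mutex and wakes any thread
 * sleeping in nvmf_wait_for_reply().  A typical caller (see
 * nvmf_read_property() below) looks like:
 *
 *	nvmf_status_init(&status);
 *	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete,
 *	    &status, M_WAITOK))
 *		return (ECONNABORTED);
 *	nvmf_wait_for_reply(&status);
 */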
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

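/*
 * On Fabrics there are no memory-mapped controller registers;
 * properties such as CAP, VS, and CC are read and written with Fabrics
 * Property Get/Set commands on the admin queue.  These helpers issue
 * those commands synchronously.
 */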
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

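/*
 * Request a normal controller shutdown by setting CC.SHN.  This is
 * best effort: failures are only reported via device_printf(), and the
 * function does not poll CSTS.SHST for shutdown completion.
 */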
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

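/*
 * KeepAlive handling for an association with a non-zero KATO.  The RX
 * timer (nvmf_check_keep_alive) disconnects if no traffic was received
 * from the controller during the interval, and the TX timer
 * (nvmf_send_keep_alive) runs at half that interval, sending a
 * KeepAlive command unless other outgoing traffic already covered it.
 */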
static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

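/*
 * Build driver instance variables from a handoff structure supplied by
 * userland (presumably via the nvmf control device once an association
 * has been negotiated).  The controller data and per-queue parameters
 * are copied in with copyin() and validated before a new device is
 * created or reconnected.
 */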
int
nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
{
	size_t len;
	u_int i;
	int error;

	memset(ivars, 0, sizeof(*ivars));

	if (!hh->admin.admin || hh->num_io_queues < 1)
		return (EINVAL);

	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
	if (error != 0)
		goto out;
	nvme_controller_data_swapbytes(ivars->cdata);

	len = hh->num_io_queues * sizeof(*ivars->io_params);
	ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
	error = copyin(hh->io, ivars->io_params, len);
	if (error != 0)
		goto out;
	for (i = 0; i < hh->num_io_queues; i++) {
		if (ivars->io_params[i].admin) {
			error = EINVAL;
			goto out;
		}

		/* Require all I/O queues to be the same size. */
		if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
			error = EINVAL;
			goto out;
		}
	}

	ivars->hh = hh;
	return (0);

out:
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
	return (error);
}

void
nvmf_free_ivars(struct nvmf_ivars *ivars)
{
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
}

static int
nvmf_probe(device_t dev)
{
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	char desc[260];

	if (ivars == NULL)
		return (ENXIO);

	snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
	device_set_desc_copy(dev, desc);
	return (BUS_PROBE_DEFAULT);
}

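/*
 * Create the admin and I/O queue pairs for an association from the
 * handed-off queue parameters and, if a KeepAlive timeout was
 * negotiated, start the KeepAlive timers.
 */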
static int
nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
{
	char name[16];

	/* Set up the admin queue. */
	sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
	    "admin queue");
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Set up I/O queues. */
	sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = ivars->hh->num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
		    &ivars->io_params[i], name);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i + 1);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (ivars->hh->kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	return (0);
}

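/*
 * Fetch one chunk (up to nitems(nslist->ns) entries) of the active
 * namespace list starting after *nsidp, issue Identify Namespace for
 * each returned NSID, and instantiate every valid namespace.  On
 * success *nsidp is advanced to the next NSID to scan from, or set to
 * 0 once the list has been exhausted.
 */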
static bool
nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		if (sc->ns[nsid - 1] != NULL) {
			device_printf(sc->dev,
			    "duplicate namespace %u in active namespace list\n",
			    nsid);
			return (false);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		/*
		 * As in nvme_ns_construct, a size of zero indicates an
		 * invalid namespace.
		 */
		nvme_namespace_data_swapbytes(data);
		if (data->nsze == 0) {
			device_printf(sc->dev,
			    "ignoring active namespace %u with zero size\n",
			    nsid);
			continue;
		}

		sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

		nvmf_sim_rescan_ns(sc, nsid);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= 0xfffffffd)
		*nsidp = 0;
	else
		*nsidp = nsid + 1;
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

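/*
 * Attach sequence: establish the queue pairs, fetch the CAP and VS
 * properties, derive the maximum transfer size from MDTS, register the
 * SIM, start Asynchronous Event Requests, enumerate namespaces, and
 * finally create the per-controller character device used by the
 * passthrough and reconnect ioctls below.
 */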
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	uint64_t val;
	u_int i;
	int error;

	if (ivars == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = ivars->hh->trtype;
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	/* Claim the cdata pointer from ivars. */
	sc->cdata = ivars->cdata;
	ivars->cdata = NULL;

	nvmf_init_aer(sc);

	/* TODO: Multiqueue support. */
	sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;

	error = nvmf_establish_connection(sc, ivars);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (error);
}

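/*
 * Tear down the current association after a fatal error.  This is
 * called from the KeepAlive timeout above and, presumably, by
 * transports when a queue pair fails.  The real work happens in
 * nvmf_disconnect_task() on a taskqueue since it may sleep; the device
 * then sits idle with namespace consumers quiesced until userland
 * re-establishes an association via NVMF_RECONNECT_HOST.
 */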
void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * Transport error occurred during attach (nvmf_add_namespaces).
		 * Shut down the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shut down the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

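/*
 * Handle NVMF_RECONNECT_HOST: adopt a freshly negotiated set of queue
 * pairs from userland for an existing device.  The new association
 * must use the same transport type and the same subsystem NQN as the
 * original one.
 */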
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
{
	struct nvmf_ivars ivars;
	u_int i;
	int error;

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != hh->trtype) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		return (EINVAL);
	}

	error = nvmf_init_ivars(&ivars, hh);
	if (error != 0)
		return (error);

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
	    sizeof(ivars.cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, &ivars);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvmf_free_ivars(&ivars);
	return (error);
}

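/*
 * Detach mirrors attach in reverse: destroy the character device and
 * SIM, tear down namespaces, stop the KeepAlive timers, request a
 * controller shutdown if an association is still active, and release
 * the queue pairs and remaining resources.
 */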
static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (0);
}

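/*
 * Refresh the state of a single namespace, presumably in response to
 * an asynchronous event reporting a namespace attribute change:
 * re-issue Identify Namespace and create, update, or destroy the local
 * namespace to match, then ask the SIM to rescan it.
 */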
void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;
	struct nvmf_namespace *ns;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	free(data, M_NVMF);

	nvmf_sim_rescan_ns(sc, nsid);
}

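/*
 * Execute an NVMe passthrough command on behalf of an ioctl caller.
 * User data is bounced through a kernel buffer wrapped in a memdesc
 * and attached to the capsule, so the transfer is capped at
 * sc->max_xfer_size.  Admin commands use the admin queue; others are
 * spread across the I/O queues via nvmf_select_io_queue().
 */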
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

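/*
 * ioctls on the controller character device: NVME_PASSTHROUGH_CMD
 * submits an admin command via nvmf_passthrough_cmd(), NVME_GET_NSID
 * and NVME_GET_MAX_XFER_SIZE report identification and transfer
 * limits, NVMF_RECONNECT_PARAMS exports the controller ID and
 * subsystem NQN needed to renegotiate an association, and
 * NVMF_RECONNECT_HOST hands off the new association to the driver.
 */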
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_reconnect_params *rp;
	struct nvmf_handoff_host *hh;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strncpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVMF_RECONNECT_PARAMS:
		rp = (struct nvmf_reconnect_params *)arg;
		if ((sc->cdata->fcatt & 1) == 0)
			rp->cntlid = NVMF_CNTLID_DYNAMIC;
		else
			rp->cntlid = sc->cdata->ctrlr_id;
		memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
		return (0);
	case NVMF_RECONNECT_HOST:
		hh = (struct nvmf_handoff_host *)arg;
		return (nvmf_reconnect_host(sc, hh));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,     nvmf_probe),
	DEVMETHOD(device_attach,    nvmf_attach),
	DEVMETHOD(device_detach,    nvmf_detach),
#if 0
	DEVMETHOD(device_shutdown,  nvmf_shutdown),
#endif
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);