/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"

static void nvme_disk_callback(nvme_request_t *req, struct lock *lk);
static int nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay);

static d_open_t nvme_open;
static d_close_t nvme_close;
static d_ioctl_t nvme_ioctl;
static d_strategy_t nvme_strategy;
static d_dump_t nvme_dump;

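/*
 * Device switch for the namespace disk device.
 *
 * NOTE: Rough flag summary (as understood here, not authoritative):
 *	 D_DISK marks a disk-style device, D_MPSAFE avoids the MP lock,
 *	 D_CANFREE advertises BUF_CMD_FREEBLKS (trim) support,
 *	 D_TRACKCLOSE delivers every close so the opencnt accounting
 *	 below stays balanced, and D_KVABIO indicates we use the KVABIO
 *	 API (buffer KVA may not be synchronized to the current cpu,
 *	 see the WARNING above nvme_strategy_core()).
 */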
static struct dev_ops nvme_ops = {
	{ "nvme", 0, D_DISK | D_MPSAFE | D_CANFREE | D_TRACKCLOSE | D_KVABIO },
	.d_open =       nvme_open,
	.d_close =      nvme_close,
	.d_read =       physread,
	.d_dump =       nvme_dump,
	.d_write =      physwrite,
	.d_ioctl =      nvme_ioctl,
	.d_strategy =   nvme_strategy,
};

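/*
 * debug.nvme_sync_delay enables an optional synchronous-completion
 * optimization.  When non-zero, nvme_strategy() submits the I/O, spins
 * for this many microseconds, and polls the completion queue; if the
 * device has already finished, the bio is completed without waiting
 * for the normal interrupt/callback path.  See the HACK OPTIMIZATIONS
 * block in nvme_strategy_core().
 */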
static int nvme_sync_delay = 0;
SYSCTL_INT(_debug, OID_AUTO, nvme_sync_delay, CTLFLAG_RW, &nvme_sync_delay, 0,
	   "Enable synchronous delay/completion-check, uS");

/*
 * Attach a namespace as a disk, making the disk available to the system.
 */
void
nvme_disk_attach(nvme_softns_t *nsc)
{
	nvme_softc_t *sc;
	struct disk_info info;
	char serial[20+16];
	size_t len;
	uint64_t cap_gb;

	sc = nsc->sc;
	devstat_add_entry(&nsc->stats, "nvme", nsc->unit, nsc->blksize,
			  DEVSTAT_NO_ORDERED_TAGS,
			  DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
			  DEVSTAT_PRIORITY_OTHER);
	nsc->cdev = disk_create(nsc->unit, &nsc->disk, &nvme_ops);
	nsc->cdev->si_drv1 = nsc;
	nsc->cdev->si_iosize_max = MAXPHYS;	/* XXX */
	disk_setdisktype(&nsc->disk, "ssd");

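	/*
	 * Synthesize a nominal disk geometry.  NVMe namespaces have no
	 * real CHS geometry, so this exists only to satisfy the disk
	 * layer and legacy consumers of disk_info.
	 */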
	bzero(&info, sizeof(info));
	info.d_media_blksize = nsc->blksize;
	info.d_media_blocks = nsc->idns.size;
	info.d_secpertrack = 1024;
	info.d_nheads = 1;
	info.d_secpercyl = info.d_secpertrack * info.d_nheads;
	info.d_ncylinders = (u_int)(info.d_media_blocks / info.d_secpercyl);

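	/*
	 * Build a serial number for the namespace.  The controller's
	 * identify serial is fixed-width and space-padded, so clean it
	 * up and append the namespace id to keep the string unique per
	 * namespace.
	 */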
	KKASSERT(sizeof(sc->idctlr.serialno) == 20);
	bzero(serial, sizeof(serial));
	bcopy(sc->idctlr.serialno, serial, sizeof(sc->idctlr.serialno));
	len = string_cleanup(serial, 1);

	ksnprintf(serial + len, sizeof(serial) - len, "-%u", nsc->nsid);

	info.d_serialno = serial;

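	/*
	 * Report capacity.  idns.size is in LBAs, so dividing by the
	 * number of LBAs per gigabyte (1GB / blksize) yields whole GBs.
	 * This assumes blksize divides 1GB evenly, which holds for the
	 * usual 512 and 4096 byte sector sizes.
	 */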
	cap_gb = nsc->idns.size / (1024 * 1024 * 1024 / nsc->blksize);
	device_printf(sc->dev,
		"Disk nvme%d ns=%u "
		"blksize=%u lbacnt=%ju cap=%juGB serno=%s\n",
		nsc->unit, nsc->nsid,
		nsc->blksize, nsc->idns.size, cap_gb, serial);

	disk_setdiskinfo(&nsc->disk, &info);
	/* serial is copied and does not have to be persistent */
}

void
nvme_disk_detach(nvme_softns_t *nsc)
{
	if (nsc->cdev) {
		disk_destroy(&nsc->disk);
		devstat_remove_entry(&nsc->stats);
	}
}

static
int
nvme_open(struct dev_open_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;
	nvme_softc_t *sc = nsc->sc;

	if (sc->flags & NVME_SC_UNLOADING)
		return ENXIO;

	atomic_add_long(&sc->opencnt, 1);

	return 0;
}

static
int
nvme_close(struct dev_close_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;
	nvme_softc_t *sc = nsc->sc;

	atomic_add_long(&sc->opencnt, -1);

	return 0;
}

static int
nvme_ioctl(struct dev_ioctl_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;
	nvme_softc_t *sc = nsc->sc;
	int error;

	switch(ap->a_cmd) {
	case NVMEIOCGETLOG:
		error = nvme_getlog_ioctl(sc, (void *)ap->a_data);
		break;
	default:
		error = ENOIOCTL;
		break;
	}
	return error;
}

static int
nvme_strategy(struct dev_strategy_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;

	nvme_strategy_core(nsc, ap->a_bio, nvme_sync_delay);

	return 0;
}

/*
 * Called from the admin thread to re-dispatch requeued BIOs.  We must
 * call nvme_strategy_core() with delay = 0 to disable the synchronous
 * optimizations, otherwise we can deadlock the admin thread.
 */
void
nvme_disk_requeues(nvme_softc_t *sc)
{
	nvme_softns_t *nsc;
	struct bio *bio;
	int i;

	for (i = 0; i < sc->nscmax; ++i) {
		nsc = sc->nscary[i];
		if (nsc == NULL || nsc->sc == NULL)
			continue;
		if (bioq_first(&nsc->bioq)) {
			lockmgr(&nsc->lk, LK_EXCLUSIVE);
			while ((bio = bioq_first(&nsc->bioq)) != NULL) {
				bioq_remove(&nsc->bioq, bio);
				lockmgr(&nsc->lk, LK_RELEASE);
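				/*
				 * If no request structure was available,
				 * nvme_strategy_core() already requeued
				 * the bio itself; move on to the next
				 * namespace instead of spinning here.
				 */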
				if (nvme_strategy_core(nsc, bio, 0))
					goto next;
				lockmgr(&nsc->lk, LK_EXCLUSIVE);
			}
			lockmgr(&nsc->lk, LK_RELEASE);
		}
next:
		;
	}
}


/*
 * Returns non-zero if no requests are available.
 *
 * WARNING! We are using the KVABIO API and must not access memory
 *	    through bp->b_data without first calling bkvasync(bp).
 */
static int
nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay)
{
	nvme_softc_t *sc = nsc->sc;
	struct buf *bp = bio->bio_buf;
	uint64_t nlba;
	uint64_t secno;
	nvme_subqueue_t *subq;
	nvme_request_t *req;
	int nobytes;

	/*
	 * Calculate sector/extent
	 */
	secno = bio->bio_offset / nsc->blksize;
	nlba = bp->b_bcount / nsc->blksize;

	devstat_start_transaction(&nsc->stats);

	subq = NULL;
	req = NULL;
	nobytes = 0;

	/*
	 * Convert bio to low-level request
	 */
	switch (bp->b_cmd) {
	case BUF_CMD_READ:
		if (nlba == 0) {
			nobytes = 1;
			break;
		}
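		/*
		 * The qmap selects a per-cpu submission queue; reads and
		 * writes are mapped to separate queues (NVME_QMAP_RD vs
		 * NVME_QMAP_WR).
		 */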
		subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_RD]];
		/* get_request does not need the subq lock */
		req = nvme_get_request(subq, NVME_IOCMD_READ,
				       bp->b_data, nlba * nsc->blksize);
		if (req == NULL)
			goto requeue;

		req->cmd.read.head.nsid = nsc->nsid;
		req->cmd.read.start_lba = secno;
		req->cmd.read.count_lba = nlba - 1;	/* 0's based */
		req->cmd.read.ioflags = 0; /* NVME_IOFLG_LR, NVME_IOFLG_FUA */
		req->cmd.read.dsm = 0;	   /* NVME_DSM_INCOMPRESSIBLE */
					   /* NVME_DSM_SEQREQ */
		break;
	case BUF_CMD_WRITE:
		if (nlba == 0) {
			nobytes = 1;
			break;
		}
		subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
		/* get_request does not need the subq lock */
		req = nvme_get_request(subq, NVME_IOCMD_WRITE,
				       bp->b_data, nlba * nsc->blksize);
		if (req == NULL)
			goto requeue;
		req->cmd.write.head.nsid = nsc->nsid;
		req->cmd.write.start_lba = secno;
		req->cmd.write.count_lba = nlba - 1;	/* 0's based */
		break;
	case BUF_CMD_FREEBLKS:
		if (nlba == 0) {
			nobytes = 1;
			break;
		}
		if (nlba > 65536) {
			/* will cause INVAL error */
			break;
		}
		subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
		/* get_request does not need the subq lock */
		req = nvme_get_request(subq, NVME_IOCMD_WRITEZ, NULL, 0);
		if (req == NULL)
			goto requeue;
		req->cmd.writez.head.nsid = nsc->nsid;
		req->cmd.writez.start_lba = secno;
		req->cmd.writez.count_lba = nlba - 1;	/* 0's based */
		req->cmd.read.ioflags = 0; /* NVME_IOFLG_LR, NVME_IOFLG_FUA */
		req->cmd.read.dsm = 0;	   /* NVME_DSM_INCOMPRESSIBLE */
					   /* NVME_DSM_SEQREQ */
		break;
	case BUF_CMD_FLUSH:
		subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
		/* get_request does not need the subq lock */
		req = nvme_get_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
		if (req == NULL)
			goto requeue;
		req->cmd.flush.head.nsid = nsc->nsid;
		break;
	default:
		break;
	}

	/*
	 * Submit the request
	 */
	if (req) {
		nvme_comqueue_t *comq;

		/* HACK OPTIMIZATIONS - TODO NEEDS WORK */

		/*
		 * Prevent callback from occurring if the synchronous
		 * delay optimization is enabled.
		 *
		 * NOTE: subq lock does not protect the I/O (completion
		 *	 only needs the comq lock).
		 */
		if (delay == 0)
			req->callback = nvme_disk_callback;
		req->nsc = nsc;
		req->bio = bio;
		BUF_KERNPROC(bp);		/* do before submit */
		lockmgr(&subq->lk, LK_EXCLUSIVE);
		nvme_submit_request(req);	/* needs subq lock */
		lockmgr(&subq->lk, LK_RELEASE);
		if (delay) {
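			/*
			 * Synchronous optimization: spin for the requested
			 * number of microseconds, then poll the completion
			 * queue directly.  If the device already finished,
			 * complete the bio here and avoid interrupt and
			 * callback latency; otherwise fall back to normal
			 * asynchronous completion.
			 */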
			comq = req->comq;
			DELAY(delay);		/* XXX */
			lockmgr(&comq->lk, LK_EXCLUSIVE);
			nvme_poll_completions(comq, &comq->lk);
			if (req->state == NVME_REQ_SUBMITTED) {
				/*
				 * Didn't finish, do it the slow way
				 * (restore async completion).
				 */
				req->callback = nvme_disk_callback;
				lockmgr(&comq->lk, LK_RELEASE);
			} else {
				/*
				 * Jeeze, that was fast.
				 */
				nvme_disk_callback(req, &comq->lk);
				lockmgr(&comq->lk, LK_RELEASE);
			}
		} /* else async completion */
	} else if (nobytes) {
		devstat_end_transaction_buf(&nsc->stats, bp);
		biodone(bio);
	} else {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		devstat_end_transaction_buf(&nsc->stats, bp);
		biodone(bio);
	}
	return 0;

	/*
	 * No requests were available, requeue the bio.
	 *
	 * The nvme_get_request() call armed the requeue signal but
	 * it is possible that it was picked up too quickly.  If it
	 * was, signal the admin thread ourselves.  This case will occur
	 * relatively rarely and only under heavy I/O conditions so we
	 * don't have to be entirely efficient about dealing with it.
	 */
requeue:
	BUF_KERNPROC(bp);
	lockmgr(&nsc->lk, LK_EXCLUSIVE);
	bioqdisksort(&nsc->bioq, bio);
	lockmgr(&nsc->lk, LK_RELEASE);
	if (atomic_swap_int(&subq->signal_requeue, 1) == 0) {
		atomic_swap_int(&subq->signal_requeue, 0);
		atomic_set_int(&subq->sc->admin_signal, ADMIN_SIG_REQUEUE);
		wakeup(&subq->sc->admin_signal);
	}
	return 1;
}

static
void
nvme_disk_callback(nvme_request_t *req, struct lock *lk)
{
	nvme_softns_t *nsc = req->nsc;
	struct bio *bio;
	struct buf *bp;
	int status;

	status = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
	bio = req->bio;
	bp = bio->bio_buf;

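	/*
	 * Drop the comq lock (if held) while disposing of the request
	 * and running biodone().  Presumably we do not want to hold the
	 * completion-queue lock across biodone(), which can do a fair
	 * amount of work and initiate further I/O; the lock is
	 * re-acquired before returning to the poller.
	 */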
	if (lk)					/* comq lock */
		lockmgr(lk, LK_RELEASE);
	nvme_put_request(req);			/* does not need subq lock */
	devstat_end_transaction_buf(&nsc->stats, bp);
	if (status) {
		bp->b_error = EIO;
		bp->b_flags |= B_ERROR;
		biodone(bio);
	} else {
		bp->b_resid = 0;
		biodone(bio);
	}
	if (lk)					/* comq lock */
		lockmgr(lk, LK_EXCLUSIVE);
}

int
nvme_alloc_disk_unit(void)
{
	static int unit_counter = 0;
	int unit;

	unit = atomic_fetchadd_int(&unit_counter, 1);

	return unit;
}

static int
nvme_dump(struct dev_dump_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;
	nvme_softc_t *sc = nsc->sc;
	uint64_t nlba;
	uint64_t secno;
	nvme_subqueue_t *subq;
	nvme_comqueue_t *comq;
	nvme_request_t *req;
	int didlock;

	/*
	 * Calculate sector/extent
	 */
	secno = ap->a_offset / nsc->blksize;
	nlba = ap->a_length / nsc->blksize;

	subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];

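	/*
	 * A zero-length dump request marks the end of the dump: issue a
	 * FLUSH instead of a WRITE, and shut the controller down below.
	 */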
	if (nlba) {
		/*
		 * Issue a WRITE
		 *
		 * get_request does not need the subq lock.
		 */
		req = nvme_get_dump_request(subq, NVME_IOCMD_WRITE,
				       ap->a_virtual, nlba * nsc->blksize);
		req->cmd.write.head.nsid = nsc->nsid;
		req->cmd.write.start_lba = secno;
		req->cmd.write.count_lba = nlba - 1;	/* 0's based */
	} else {
		/*
		 * Issue a FLUSH
		 *
		 * get_request does not need the subq lock.
		 */
		req = nvme_get_dump_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
		req->cmd.flush.head.nsid = nsc->nsid;
	}

	/*
	 * No completion callback; we poll for completion synchronously
	 * below via nvme_poll_request().
	 */
	req->callback = NULL;
	req->nsc = nsc;

	/*
	 * 500 x 1uS poll wait on lock.  We might be the idle thread, so
	 * we can't safely block during a dump.
	 */
	didlock = 500;
	while (lockmgr(&subq->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		if (--didlock == 0)
			break;
		tsc_delay(1000);	/* 1uS */
		lwkt_switch();
	}
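	/*
	 * If the lock could not be obtained after ~500 tries, didlock is
	 * zero and we submit anyway (and skip the release below); in a
	 * panic/dump context there is little better we can do.
	 */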
	nvme_submit_request(req);	/* needs subq lock */
	if (didlock)
		lockmgr(&subq->lk, LK_RELEASE);

	comq = req->comq;
	nvme_poll_request(req);
	nvme_put_dump_request(req);		/* does not need subq lock */

	/*
	 * Shut the nvme controller down nicely when we finish the dump.
	 * We should do this whether we are in a panic or not because,
	 * frankly, the dump has overwritten swap space and the system is
	 * probably no longer stable.
	 */
	if (nlba == 0)
		nvme_issue_shutdown(sc, 1);
	return 0;
}