/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"

static void nvme_disk_callback(nvme_request_t *req, struct lock *lk);
static int nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay);

static d_open_t nvme_open;
static d_close_t nvme_close;
static d_ioctl_t nvme_ioctl;
static d_strategy_t nvme_strategy;
static d_dump_t nvme_dump;

static struct dev_ops nvme_ops = {
        { "nvme", 0, D_DISK | D_MPSAFE | D_CANFREE | D_TRACKCLOSE | D_KVABIO },
        .d_open =       nvme_open,
        .d_close =      nvme_close,
        .d_read =       physread,
        .d_dump =       nvme_dump,
        .d_write =      physwrite,
        .d_ioctl =      nvme_ioctl,
        .d_strategy =   nvme_strategy,
};

static int nvme_sync_delay = 0;
SYSCTL_INT(_debug, OID_AUTO, nvme_sync_delay, CTLFLAG_RW, &nvme_sync_delay, 0,
           "Enable synchronous delay/completion-check, uS");

/*
 * Attach a namespace as a disk, making the disk available to the system.
 */
void
nvme_disk_attach(nvme_softns_t *nsc)
{
        nvme_softc_t *sc;
        struct disk_info info;
        char serial[20+16];
        size_t len;
        uint64_t cap_gb;

        sc = nsc->sc;
        devstat_add_entry(&nsc->stats, "nvme", nsc->unit, nsc->blksize,
                          DEVSTAT_NO_ORDERED_TAGS,
                          DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
                          DEVSTAT_PRIORITY_OTHER);
        nsc->cdev = disk_create(nsc->unit, &nsc->disk, &nvme_ops);
        nsc->cdev->si_drv1 = nsc;
        nsc->cdev->si_iosize_max = MAXPHYS;     /* XXX */
        disk_setdisktype(&nsc->disk, "ssd");

        bzero(&info, sizeof(info));
        info.d_media_blksize = nsc->blksize;
        info.d_media_blocks = nsc->idns.size;
        info.d_secpertrack = 1024;
        info.d_nheads = 1;
        info.d_secpercyl = info.d_secpertrack * info.d_nheads;
        info.d_ncylinders = (u_int)(info.d_media_blocks / info.d_secpercyl);

        KKASSERT(sizeof(sc->idctlr.serialno) == 20);
        bzero(serial, sizeof(serial));
        bcopy(sc->idctlr.serialno, serial, sizeof(sc->idctlr.serialno));
        len = string_cleanup(serial, 1);

        ksnprintf(serial + len, sizeof(serial) - len, "-%u", nsc->nsid);

        info.d_serialno = serial;

        cap_gb = nsc->idns.size / (1024 * 1024 * 1024 / nsc->blksize);
        device_printf(sc->dev,
                "Disk nvme%d ns=%u "
                "blksize=%u lbacnt=%ju cap=%juGB serno=%s\n",
                nsc->unit, nsc->nsid,
                nsc->blksize, nsc->idns.size, cap_gb, serial);

        disk_setdiskinfo(&nsc->disk, &info);
        /* serial is copied and does not have to be persistent */
}

void
nvme_disk_detach(nvme_softns_t *nsc)
{
        if (nsc->cdev) {
                disk_destroy(&nsc->disk);
                devstat_remove_entry(&nsc->stats);
        }
}

static
int
nvme_open(struct dev_open_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;

        if (sc->flags & NVME_SC_UNLOADING)
                return ENXIO;

        atomic_add_long(&sc->opencnt, 1);

        return 0;
}

static
int
nvme_close(struct dev_close_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;

        atomic_add_long(&sc->opencnt, -1);

        return 0;
}

static int
nvme_ioctl(struct dev_ioctl_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;
        int error;

        switch(ap->a_cmd) {
        case NVMEIOCGETLOG:
                error = nvme_getlog_ioctl(sc, (void *)ap->a_data);
                break;
        default:
                error = ENOIOCTL;
                break;
        }
        return error;
}

static int
nvme_strategy(struct dev_strategy_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;

        nvme_strategy_core(nsc, ap->a_bio, nvme_sync_delay);

        return 0;
}

/*
 * Called from admin thread to requeue BIOs.  We must call
 * nvme_strategy_core() with delay = 0 to disable synchronous
 * optimizations to avoid deadlocking the admin thread.
 */
void
nvme_disk_requeues(nvme_softc_t *sc)
{
        nvme_softns_t *nsc;
        struct bio *bio;
        int i;

        for (i = 0; i < sc->nscmax; ++i) {
                nsc = sc->nscary[i];
                if (nsc == NULL || nsc->sc == NULL)
                        continue;
                if (bioq_first(&nsc->bioq)) {
                        lockmgr(&nsc->lk, LK_EXCLUSIVE);
                        while ((bio = bioq_first(&nsc->bioq)) != NULL) {
                                bioq_remove(&nsc->bioq, bio);
                                lockmgr(&nsc->lk, LK_RELEASE);
                                if (nvme_strategy_core(nsc, bio, 0))
                                        goto next;
                                lockmgr(&nsc->lk, LK_EXCLUSIVE);
                        }
                        lockmgr(&nsc->lk, LK_RELEASE);
                }
next:
                ;
        }
}

/*
 * Returns non-zero if no requests are available.
 *
 * WARNING! We are using the KVABIO API and must not access memory
 *          through bp->b_data without first calling bkvasync(bp).
 */
static int
nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay)
{
        nvme_softc_t *sc = nsc->sc;
        struct buf *bp = bio->bio_buf;
        uint64_t nlba;
        uint64_t secno;
        nvme_subqueue_t *subq;
        nvme_request_t *req;
        int nobytes;

        /*
         * Calculate sector/extent
         */
        secno = bio->bio_offset / nsc->blksize;
        nlba = bp->b_bcount / nsc->blksize;

        devstat_start_transaction(&nsc->stats);

        subq = NULL;
        req = NULL;
        nobytes = 0;

        /*
         * Convert bio to low-level request
         */
        switch (bp->b_cmd) {
        case BUF_CMD_READ:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_RD]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_READ,
                                       bp->b_data, nlba * nsc->blksize);
                if (req == NULL)
                        goto requeue;

                req->cmd.read.head.nsid = nsc->nsid;
                req->cmd.read.start_lba = secno;
                req->cmd.read.count_lba = nlba - 1;     /* 0's based */
                req->cmd.read.ioflags = 0;  /* NVME_IOFLG_LR, NVME_IOFLG_FUA */
                req->cmd.read.dsm = 0;      /* NVME_DSM_INCOMPRESSIBLE */
                                            /* NVME_DSM_SEQREQ */
                break;
        case BUF_CMD_WRITE:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_WRITE,
                                       bp->b_data, nlba * nsc->blksize);
                if (req == NULL)
                        goto requeue;
                req->cmd.write.head.nsid = nsc->nsid;
                req->cmd.write.start_lba = secno;
                req->cmd.write.count_lba = nlba - 1;    /* 0's based */
                break;
        case BUF_CMD_FREEBLKS:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                if (nlba > 65536) {
                        /* will cause INVAL error */
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_WRITEZ, NULL, 0);
                if (req == NULL)
                        goto requeue;
                req->cmd.writez.head.nsid = nsc->nsid;
                req->cmd.writez.start_lba = secno;
                req->cmd.writez.count_lba = nlba - 1;   /* 0's based */
                req->cmd.read.ioflags = 0;  /* NVME_IOFLG_LR, NVME_IOFLG_FUA */
                req->cmd.read.dsm = 0;      /* NVME_DSM_INCOMPRESSIBLE */
                                            /* NVME_DSM_SEQREQ */
                break;
        case BUF_CMD_FLUSH:
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
                if (req == NULL)
                        goto requeue;
                req->cmd.flush.head.nsid = nsc->nsid;
                break;
        default:
                break;
        }

        /*
         * Submit the request
         */
        if (req) {
                nvme_comqueue_t *comq;

                /* HACK OPTIMIZATIONS - TODO NEEDS WORK */

                /*
                 * Prevent callback from occurring if the synchronous
                 * delay optimization is enabled.
                 *
                 * NOTE: subq lock does not protect the I/O (completion
                 *       only needs the comq lock).
                 */
                if (delay == 0)
                        req->callback = nvme_disk_callback;
                req->nsc = nsc;
                req->bio = bio;
                BUF_KERNPROC(bp);               /* do before submit */
                lockmgr(&subq->lk, LK_EXCLUSIVE);
                nvme_submit_request(req);       /* needs subq lock */
                lockmgr(&subq->lk, LK_RELEASE);
                if (delay) {
                        comq = req->comq;
                        DELAY(delay);           /* XXX */
                        lockmgr(&comq->lk, LK_EXCLUSIVE);
                        nvme_poll_completions(comq, &comq->lk);
                        if (req->state == NVME_REQ_SUBMITTED) {
                                /*
                                 * Didn't finish, do it the slow way
                                 * (restore async completion).
                                 */
                                req->callback = nvme_disk_callback;
                                lockmgr(&comq->lk, LK_RELEASE);
                        } else {
                                /*
                                 * Jeeze, that was fast.
                                 */
                                nvme_disk_callback(req, &comq->lk);
                                lockmgr(&comq->lk, LK_RELEASE);
                        }
                } /* else async completion */
        } else if (nobytes) {
                devstat_end_transaction_buf(&nsc->stats, bp);
                biodone(bio);
        } else {
                bp->b_error = EINVAL;
                bp->b_flags |= B_ERROR;
                devstat_end_transaction_buf(&nsc->stats, bp);
                biodone(bio);
        }
        return 0;

        /*
         * No requests were available, requeue the bio.
         *
         * The nvme_get_request() call armed the requeue signal but
         * it is possible that it was picked up too quickly.  If it
         * was, signal the admin thread ourselves.  This case will occur
         * relatively rarely and only under heavy I/O conditions so we
         * don't have to be entirely efficient about dealing with it.
         */
requeue:
        BUF_KERNPROC(bp);
        lockmgr(&nsc->lk, LK_EXCLUSIVE);
        bioqdisksort(&nsc->bioq, bio);
        lockmgr(&nsc->lk, LK_RELEASE);
        if (atomic_swap_int(&subq->signal_requeue, 1) == 0) {
                atomic_swap_int(&subq->signal_requeue, 0);
                atomic_set_int(&subq->sc->admin_signal, ADMIN_SIG_REQUEUE);
                wakeup(&subq->sc->admin_signal);
        }
        return 1;
}

static
void
nvme_disk_callback(nvme_request_t *req, struct lock *lk)
{
        nvme_softns_t *nsc = req->nsc;
        struct bio *bio;
        struct buf *bp;
        int status;

        status = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
        bio = req->bio;
        bp = bio->bio_buf;

        if (lk)                                 /* comq lock */
                lockmgr(lk, LK_RELEASE);
        nvme_put_request(req);                  /* does not need subq lock */
        devstat_end_transaction_buf(&nsc->stats, bp);
        if (status) {
                bp->b_error = EIO;
                bp->b_flags |= B_ERROR;
                biodone(bio);
        } else {
                bp->b_resid = 0;
                biodone(bio);
        }
        if (lk)                                 /* comq lock */
                lockmgr(lk, LK_EXCLUSIVE);
}

int
nvme_alloc_disk_unit(void)
{
        static int unit_counter = 0;
        int unit;

        unit = atomic_fetchadd_int(&unit_counter, 1);

        return unit;
}

static int
nvme_dump(struct dev_dump_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;
        uint64_t nlba;
        uint64_t secno;
        nvme_subqueue_t *subq;
        nvme_comqueue_t *comq;
        nvme_request_t *req;
        int didlock;

        /*
         * Calculate sector/extent
         */
        secno = ap->a_offset / nsc->blksize;
        nlba = ap->a_length / nsc->blksize;

        subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];

        if (nlba) {
                /*
                 * Issue a WRITE
                 *
                 * get_request does not need the subq lock.
454 */ 455 req = nvme_get_dump_request(subq, NVME_IOCMD_WRITE, 456 ap->a_virtual, nlba * nsc->blksize); 457 req->cmd.write.head.nsid = nsc->nsid; 458 req->cmd.write.start_lba = secno; 459 req->cmd.write.count_lba = nlba - 1; /* 0's based */ 460 } else { 461 /* 462 * Issue a FLUSH 463 * 464 * get_request does not need the subq lock. 465 */ 466 req = nvme_get_dump_request(subq, NVME_IOCMD_FLUSH, NULL, 0); 467 req->cmd.flush.head.nsid = nsc->nsid; 468 } 469 470 /* 471 * Prevent callback from occurring if the synchronous 472 * delay optimization is enabled. 473 */ 474 req->callback = NULL; 475 req->nsc = nsc; 476 477 /* 478 * 500 x 1uS poll wait on lock. We might be the idle thread, so 479 * we can't safely block during a dump. 480 */ 481 didlock = 500; 482 while (lockmgr(&subq->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 483 if (--didlock == 0) 484 break; 485 tsc_delay(1000); /* 1uS */ 486 lwkt_switch(); 487 } 488 nvme_submit_request(req); /* needs subq lock */ 489 if (didlock) 490 lockmgr(&subq->lk, LK_RELEASE); 491 492 comq = req->comq; 493 nvme_poll_request(req); 494 nvme_put_dump_request(req); /* does not need subq lock */ 495 496 /* 497 * Shut the nvme controller down nicely when we finish the dump. 498 * We should to do this whether we are in a panic or not because 499 * frankly the dump is overwriting swap space, thus the system is 500 * probably not stable. 501 */ 502 if (nlba == 0) 503 nvme_issue_shutdown(sc, 1); 504 return 0; 505 } 506