/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"

static int nvme_pci_attach(device_t);
static int nvme_pci_detach(device_t);

static const nvme_device_t nvme_devices[] = {
        /* Vendor-specific table goes here (see ahci for example) */
        { 0, 0, nvme_pci_attach, nvme_pci_detach, "NVME-PCIe" }
};

static int nvme_msix_enable = 1;
TUNABLE_INT("hw.nvme.msix.enable", &nvme_msix_enable);
static int nvme_msi_enable = 0;
TUNABLE_INT("hw.nvme.msi.enable", &nvme_msi_enable);
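
/*
 * Example loader.conf(5) settings for the tunables above, e.g. to fall
 * back to plain MSI on a controller with broken MSI-X support:
 *
 *      hw.nvme.msix.enable="0"
 *      hw.nvme.msi.enable="1"
 *
 * device_getenv_int() in the attach code below also consults a per-device
 * variant of each knob (e.g. hw.nvme0.msix.enable, assuming the usual
 * DragonFly device-environment naming), which overrides the global default.
 */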

TAILQ_HEAD(, nvme_softc) nvme_sc_list = TAILQ_HEAD_INITIALIZER(nvme_sc_list);
struct lock nvme_master_lock = LOCK_INITIALIZER("nvmstr", 0, 0);

static int last_global_cpu;

/*
 * Match during probe and attach.  The device does not yet have a softc.
 */
const nvme_device_t *
nvme_lookup_device(device_t dev)
{
        const nvme_device_t *ad;
        uint16_t vendor = pci_get_vendor(dev);
        uint16_t product = pci_get_device(dev);
        uint8_t class = pci_get_class(dev);
        uint8_t subclass = pci_get_subclass(dev);
        uint8_t progif = pci_read_config(dev, PCIR_PROGIF, 1);
        int is_nvme;

        /*
         * Generally speaking, if the pci device does not identify as
         * NVMe we skip it.
         */
        if (class == PCIC_STORAGE && subclass == PCIS_STORAGE_NVM &&
            progif == PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0) {
                is_nvme = 1;
        } else {
                is_nvme = 0;
        }
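
        /*
         * (For reference: this is the PCI class triple the NVMe spec
         * assigns to NVM Express controllers -- base class 0x01 (mass
         * storage), subclass 0x08 (non-volatile memory), plus the NVMe
         * programming interface -- assuming the standard pcireg.h values
         * for the constants used above.)
         */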

        for (ad = &nvme_devices[0]; ad->vendor; ++ad) {
                if (ad->vendor == vendor && ad->product == product)
                        return (ad);
        }

        /*
         * The terminating entry is the default match if the PCI device
         * identifies as NVMe.
         */
        if (is_nvme == 0)
                ad = NULL;
        return (ad);
}

/*
 * Attach functions.  They all eventually fall through to nvme_pci_attach().
 */
static int
nvme_pci_attach(device_t dev)
{
        nvme_softc_t *sc = device_get_softc(dev);
        uint32_t reg;
        int error;
        int msi_enable;
        int msix_enable;

#if 0
        if (pci_read_config(dev, PCIR_COMMAND, 2) & 0x0400) {
                device_printf(dev, "BIOS disabled PCI interrupt, "
                                   "re-enabling\n");
                pci_write_config(dev, PCIR_COMMAND,
                    pci_read_config(dev, PCIR_COMMAND, 2) & ~0x0400, 2);
        }
#endif

        sc->dev = dev;

        /*
         * Map the register window
         */
        sc->rid_regs = PCIR_BAR(0);
        sc->regs = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
                                          &sc->rid_regs, RF_ACTIVE);
        if (sc->regs == NULL) {
                device_printf(dev, "unable to map registers\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        sc->iot = rman_get_bustag(sc->regs);
        sc->ioh = rman_get_bushandle(sc->regs);

        /*
         * NVMe allows the MSI-X table to be mapped to BAR 4/5.
         * Always try to map BAR4, but it's ok if it fails.  Must
         * be done prior to allocating our interrupts.
         */
        sc->rid_bar4 = PCIR_BAR(4);
        sc->bar4 = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
                                          &sc->rid_bar4, RF_ACTIVE);

        /*
         * Map the interrupt or initial interrupt which will be used for
         * the admin queue.  NVME chipsets can potentially support a huge
         * number of MSIX vectors but we really only need enough for
         * available cpus, plus 1.
         */
        msi_enable = device_getenv_int(dev, "msi.enable", nvme_msi_enable);
        msix_enable = device_getenv_int(dev, "msix.enable", nvme_msix_enable);

        error = 0;
        if (msix_enable) {
                int i;
                int cpu;

                sc->nirqs = pci_msix_count(dev);
                sc->irq_type = PCI_INTR_TYPE_MSIX;
                if (sc->nirqs > ncpus + 1)              /* max we need */
                        sc->nirqs = ncpus + 1;

                error = pci_setup_msix(dev);
                cpu = (last_global_cpu + 0) % ncpus;    /* GCC warn */
                for (i = 0; error == 0 && i < sc->nirqs; ++i) {
                        cpu = (last_global_cpu + i) % ncpus;
                        error = pci_alloc_msix_vector(dev, i,
                                                      &sc->rid_irq[i], cpu);
                        if (error)
                                break;
                        sc->irq[i] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
                                                            &sc->rid_irq[i],
                                                            RF_ACTIVE);
                        /*
                         * We want this to overwrite queue 0's cpu vector
                         * when the cpus rotate through later on.
                         */
                        if (sc->cputovect[cpu] == 0)
                                sc->cputovect[cpu] = i;
                }

                /*
                 * If we did not iterate enough cpus (that is, there weren't
                 * enough irqs for all available cpus) we still need to
                 * finish our sc->cputovect[] mapping.
                 */
                while (error == 0) {
                        cpu = (cpu + 1) % ncpus;
                        i = (i + 1) % sc->nirqs;
                        if (i == 0)
                                i = 1;
                        if (sc->cputovect[cpu] != 0)
                                break;
                        sc->cputovect[cpu] = i;
                }
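
                /*
                 * Worked example with hypothetical numbers: ncpus = 4 and
                 * only 3 vectors.  The allocation loop maps cpu0->vector 0,
                 * cpu1->vector 1 and cpu2->vector 2; the wrap-around pass
                 * then maps the leftover cpu3 to vector 1 and overwrites
                 * cpu0's vector-0 mapping with vector 2, so no cpu is left
                 * pointing at vector 0 (the admin vector).
                 */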

                if (error) {
                        while (--i >= 0) {
                                bus_release_resource(dev, SYS_RES_IRQ,
                                                     sc->rid_irq[i],
                                                     sc->irq[i]);
                                pci_release_msix_vector(dev, sc->rid_irq[i]);
                                sc->irq[i] = NULL;
                        }
                        /* leave error intact to fall through to normal */
                } else {
                        last_global_cpu = (last_global_cpu + sc->nirqs) % ncpus;
                        pci_enable_msix(dev);
                }
        }

        /*
         * If we have to use a normal interrupt we fake the cputovect[] in
         * order to try to map at least (ncpus) submission queues.  The admin
         * code will limit the number of completion queues to something
         * reasonable when nirqs is 1 since the single interrupt polls all
         * completion queues.
         *
         * NOTE: We do NOT want to map a single completion queue (#0), because
         *       then an I/O submission and/or completion queue will overlap
         *       the admin submission or completion queue, and that can cause
         *       havoc when admin commands are submitted that don't return
         *       for long periods of time.
         *
         * NOTE: Chipsets supporting MSI-X *MIGHT* *NOT* properly support
         *       a normal pin-based level interrupt.  For example, the BPX
         *       NVMe SSD just leaves the level interrupt stuck on.  Do not
         *       disable MSI-X unless you have no choice.
         */
        if (msix_enable == 0 || error) {
                uint32_t irq_flags;
                int i;

                error = 0;
                sc->nirqs = 1;
                sc->irq_type = pci_alloc_1intr(dev, msi_enable,
                                               &sc->rid_irq[0], &irq_flags);
                sc->irq[0] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
                                                    &sc->rid_irq[0], irq_flags);

                for (i = 0; i < ncpus; ++i)
                        sc->cputovect[i] = i + 1;
        }
        if (sc->irq[0] == NULL) {
                device_printf(dev, "unable to map interrupt\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        } else {
                const char *type;
                switch(sc->irq_type) {
                case PCI_INTR_TYPE_MSI:
                        type = "MSI";
                        break;
                case PCI_INTR_TYPE_MSIX:
                        type = "MSIX";
                        break;
                default:
                        type = "normal-int";
                        break;
                }
                device_printf(dev, "mapped %d %s IRQs\n", sc->nirqs, type);
        }

        /*
         * Make sure the chip is disabled, which will reset all controller
         * registers except for the admin queue registers.  The device should
         * already be disabled so this is usually instantaneous.  Use a
         * fixed 5-second timeout in case it is not.  We want the other
         * register reads below to occur after the device has been disabled.
         */
        sc->entimo = hz * 5;
        error = nvme_enable(sc, 0);
        if (error) {
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Get capabilities and version, and report them.
         */
        sc->vers = nvme_read(sc, NVME_REG_VERS);
        sc->cap = nvme_read8(sc, NVME_REG_CAP);
        sc->maxqe = NVME_CAP_MQES_GET(sc->cap);
        sc->dstrd4 = NVME_CAP_DSTRD_GET(sc->cap);

        device_printf(dev, "NVME Version %u.%u maxqe=%u caps=%016jx\n",
                      NVME_VERS_MAJOR_GET(sc->vers),
                      NVME_VERS_MINOR_GET(sc->vers),
                      sc->maxqe, sc->cap);

        /*
         * Enable timeout, 500ms increments.  Convert to ticks.
         */
        sc->entimo = NVME_CAP_TIMEOUT_GET(sc->cap) * hz / 2; /* in ticks */
        ++sc->entimo;           /* fudge */
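
        /*
         * For example, a controller reporting a timeout of 6 units in CAP
         * (six 500ms increments, i.e. 3 seconds worst case) yields
         * 6 * hz / 2 = 3 * hz ticks, plus the one tick of fudge.
         */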

        /*
         * Validate maxqe.  To cap the amount of memory we reserve for
         * PRPs we limit maxqe to 256.  Also make sure it is a power of
         * two.
         */
        if (sc->maxqe < 2) {
                device_printf(dev,
                              "Attach failed, max queue entries (%d) "
                              "below minimum (2)\n", sc->maxqe);
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        if (sc->maxqe > 256)
                sc->maxqe = 256;
        for (reg = 2; reg <= sc->maxqe; reg <<= 1)
                ;
        sc->maxqe = reg >> 1;
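
        /*
         * (The loop rounds maxqe down to a power of two, e.g. a reported
         * maxqe of 200 becomes 128, while 256 stays 256.)
         */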

        /*
         * DMA tags
         *
         * PRP - Worst case PRPs needed per queue is MAXPHYS / PAGE_SIZE
         *       (typically 64), multiplied by maxqe (typ 256).  Roughly
         *       ~128KB per queue.  Align for cache performance.  We actually
         *       need one more PRP per queue entry worst-case to handle
         *       buffer overlap, but we have an extra one in the command
         *       structure so we don't have to calculate that out.
         *
         *       Remember that we intend to allocate potentially many queues,
         *       so we don't want to bloat this too much.  A queue depth of
         *       256 is plenty.
         *
         * CMD - Storage for the submit queue.  maxqe * 64      (~16KB)
         *
         * RES - Storage for the completion queue.  maxqe * 16  (~4KB)
         *
         * ADM - Storage for admin command DMA data.  Maximum admin command
         *       DMA data is 4KB so reserve maxqe * 4KB (~1MB).  There is
         *       only one admin queue.
         *
         * NOTE: There are no boundary requirements for NVMe, but I specify a
         *       4MB boundary anyway because this reduces mass-bit flipping
         *       of address bits inside the controller when incrementing
         *       DMA addresses.  Why not?  Can't hurt.
         */
        sc->prp_bytes = sizeof(uint64_t) * (MAXPHYS / PAGE_SIZE) * sc->maxqe;
        sc->cmd_bytes = sizeof(nvme_subq_item_t) * sc->maxqe;
        sc->res_bytes = sizeof(nvme_comq_item_t) * sc->maxqe;
        sc->adm_bytes = NVME_MAX_ADMIN_BUFFER * sc->maxqe;
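
        /*
         * Sanity check on the arithmetic, assuming 4KB pages, a 256KB
         * MAXPHYS, maxqe = 256, and the usual 64-byte submission and
         * 16-byte completion entries:
         *
         *      prp_bytes = 8 * 64 * 256  = 128KB
         *      cmd_bytes = 64 * 256      = 16KB
         *      res_bytes = 16 * 256      = 4KB
         *      adm_bytes = 4KB * 256     = 1MB
         *
         * These match the estimates in the comment above.
         */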

        error = 0;

        error += bus_dma_tag_create(
                        NULL,                   /* parent tag */
                        PAGE_SIZE,              /* alignment */
                        4 * 1024 * 1024,        /* boundary */
                        BUS_SPACE_MAXADDR,      /* loaddr? */
                        BUS_SPACE_MAXADDR,      /* hiaddr */
                        sc->prp_bytes,          /* [max]size */
                        1,                      /* maxsegs */
                        sc->prp_bytes,          /* maxsegsz */
                        0,                      /* flags */
                        &sc->prps_tag);         /* return tag */

        error += bus_dma_tag_create(
                        NULL,                   /* parent tag */
                        PAGE_SIZE,              /* alignment */
                        4 * 1024 * 1024,        /* boundary */
                        BUS_SPACE_MAXADDR,      /* loaddr? */
                        BUS_SPACE_MAXADDR,      /* hiaddr */
                        sc->cmd_bytes,          /* [max]size */
                        1,                      /* maxsegs */
                        sc->cmd_bytes,          /* maxsegsz */
                        0,                      /* flags */
                        &sc->sque_tag);         /* return tag */

        error += bus_dma_tag_create(
                        NULL,                   /* parent tag */
                        PAGE_SIZE,              /* alignment */
                        4 * 1024 * 1024,        /* boundary */
                        BUS_SPACE_MAXADDR,      /* loaddr? */
                        BUS_SPACE_MAXADDR,      /* hiaddr */
                        sc->res_bytes,          /* [max]size */
                        1,                      /* maxsegs */
                        sc->res_bytes,          /* maxsegsz */
                        0,                      /* flags */
                        &sc->cque_tag);         /* return tag */

        error += bus_dma_tag_create(
                        NULL,                   /* parent tag */
                        PAGE_SIZE,              /* alignment */
                        4 * 1024 * 1024,        /* boundary */
                        BUS_SPACE_MAXADDR,      /* loaddr? */
                        BUS_SPACE_MAXADDR,      /* hiaddr */
                        sc->adm_bytes,          /* [max]size */
                        1,                      /* maxsegs */
                        sc->adm_bytes,          /* maxsegsz */
                        0,                      /* flags */
                        &sc->adm_tag);          /* return tag */

        if (error) {
                device_printf(dev, "unable to create dma tags\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Setup the admin queues (qid 0).
         */
        error = nvme_alloc_subqueue(sc, 0);
        if (error) {
                device_printf(dev, "unable to allocate admin subqueue\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        error = nvme_alloc_comqueue(sc, 0);
        if (error) {
                device_printf(dev, "unable to allocate admin comqueue\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Initialize the admin queue registers
         */
        reg = NVME_ATTR_COM_SET(sc->maxqe) | NVME_ATTR_SUB_SET(sc->maxqe);
        nvme_write(sc, NVME_REG_ADM_ATTR, reg);
        nvme_write8(sc, NVME_REG_ADM_SUBADR, (uint64_t)sc->subqueues[0].psubq);
        nvme_write8(sc, NVME_REG_ADM_COMADR, (uint64_t)sc->comqueues[0].pcomq);
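
        /*
         * (These writes correspond to the NVMe AQA/ASQ/ACQ registers:
         * the attribute register holds the admin submission/completion
         * queue sizes, and the two address registers take the page-aligned
         * physical base addresses of the rings -- psubq/pcomq are assumed
         * to be the physical addresses recorded when the admin queues
         * were allocated above.)
         */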

        /*
         * qemu appears to require bus mastering to be enabled here; real
         * hardware does not appear to require it.
         */
        pci_enable_busmaster(dev);

        /*
         * Other configuration registers
         */
        reg = NVME_CONFIG_IOSUB_ES_SET(6) |     /* 64 byte sub entry */
              NVME_CONFIG_IOCOM_ES_SET(4) |     /* 16 byte com entry */
              NVME_CONFIG_MEMPG_SET(PAGE_SHIFT) | /* 4K pages */
              NVME_CONFIG_CSS_NVM;              /* NVME command set */
        nvme_write(sc, NVME_REG_CONFIG, reg);
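
        /*
         * (The entry-size fields are log2 encodings per the NVMe CC
         * register definition: 2^6 = 64-byte submission entries and
         * 2^4 = 16-byte completion entries.  MEMPG selects the memory
         * page size; passing PAGE_SHIFT assumes the SET macro handles
         * CC.MPS's 4KB-relative encoding.)
         */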

        reg = nvme_read(sc, NVME_REG_MEMSIZE);

        /*
         * Enable the chip for operation
         */
        error = nvme_enable(sc, 1);
        if (error) {
                nvme_enable(sc, 0);
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Start the admin thread.  This will also setup the admin queue
         * interrupt.
         */
        error = nvme_start_admin_thread(sc);
        if (error) {
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
        sc->flags |= NVME_SC_ATTACHED;
        TAILQ_INSERT_TAIL(&nvme_sc_list, sc, entry);
        lockmgr(&nvme_master_lock, LK_RELEASE);

        return (0);
}

/*
 * Device unload / detachment
 */
static int
nvme_pci_detach(device_t dev)
{
        nvme_softc_t *sc = device_get_softc(dev);
        int i;

        /*
         * Stop the admin thread
         */
        nvme_stop_admin_thread(sc);

        /*
         * Issue a normal shutdown and wait for completion
         */
        nvme_issue_shutdown(sc, 0);

        /*
         * Disable the chip
         */
        nvme_enable(sc, 0);

        /*
         * Free admin memory
         */
        nvme_free_subqueue(sc, 0);
        nvme_free_comqueue(sc, 0);

        /*
         * Release related resources.
         */
        for (i = 0; i < sc->nirqs; ++i) {
                if (sc->irq[i]) {
                        bus_release_resource(dev, SYS_RES_IRQ,
                                             sc->rid_irq[i], sc->irq[i]);
                        sc->irq[i] = NULL;
                        if (sc->irq_type == PCI_INTR_TYPE_MSIX)
                                pci_release_msix_vector(dev, sc->rid_irq[i]);
                }
        }
        switch(sc->irq_type) {
        case PCI_INTR_TYPE_MSI:
                pci_release_msi(dev);
                break;
        case PCI_INTR_TYPE_MSIX:
                pci_teardown_msix(dev);
                break;
        default:
                break;
        }

        /*
         * Release remaining chipset resources
         */
        if (sc->regs) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                                     sc->rid_regs, sc->regs);
                sc->regs = NULL;
        }
        if (sc->bar4) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                                     sc->rid_bar4, sc->bar4);
                sc->bar4 = NULL;
        }

        /*
         * Cleanup the DMA tags
         */
        if (sc->prps_tag) {
                bus_dma_tag_destroy(sc->prps_tag);
                sc->prps_tag = NULL;
        }
        if (sc->sque_tag) {
                bus_dma_tag_destroy(sc->sque_tag);
                sc->sque_tag = NULL;
        }
        if (sc->cque_tag) {
                bus_dma_tag_destroy(sc->cque_tag);
                sc->cque_tag = NULL;
        }
        if (sc->adm_tag) {
                bus_dma_tag_destroy(sc->adm_tag);
                sc->adm_tag = NULL;
        }

        if (sc->flags & NVME_SC_ATTACHED) {
                lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
                sc->flags &= ~NVME_SC_ATTACHED;
                TAILQ_REMOVE(&nvme_sc_list, sc, entry);
                lockmgr(&nvme_master_lock, LK_RELEASE);
        }

        return (0);
}