xref: /dragonfly/sys/dev/disk/nvme/nvme_attach.c (revision 03517d4e)
/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"

static int	nvme_pci_attach(device_t);
static int	nvme_pci_detach(device_t);

static const nvme_device_t nvme_devices[] = {
	/* Vendor-specific table goes here (see ahci for example) */
	{ 0, 0, nvme_pci_attach, nvme_pci_detach, "NVME-PCIe" }
};
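
/*
 * Illustrative only: a vendor-specific entry would look like the
 * hypothetical line below.  Such entries are matched by exact
 * vendor/product id in nvme_lookup_device() before the generic
 * class-code fallback is consulted:
 *
 *	{ 0x8086, 0x0953, nvme_pci_attach, nvme_pci_detach, "Intel P3700" },
 */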

static int	nvme_msix_enable = 1;
TUNABLE_INT("hw.nvme.msix.enable", &nvme_msix_enable);
static int	nvme_msi_enable = 0;
TUNABLE_INT("hw.nvme.msi.enable", &nvme_msi_enable);
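
/*
 * A sketch of how these defaults would typically be overridden from
 * /boot/loader.conf, using the tunable names registered above
 * (per-device variants are resolved by device_getenv_int() in
 * nvme_pci_attach() below):
 *
 *	hw.nvme.msix.enable="0"
 *	hw.nvme.msi.enable="1"
 */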

TAILQ_HEAD(, nvme_softc) nvme_sc_list = TAILQ_HEAD_INITIALIZER(nvme_sc_list);
struct lock nvme_master_lock = LOCK_INITIALIZER("nvmstr", 0, 0);

static int last_global_cpu;

/*
 * Match during probe and attach.  The device does not yet have a softc.
 */
const nvme_device_t *
nvme_lookup_device(device_t dev)
{
	const nvme_device_t *ad;
	uint16_t vendor = pci_get_vendor(dev);
	uint16_t product = pci_get_device(dev);
	uint8_t class = pci_get_class(dev);
	uint8_t subclass = pci_get_subclass(dev);
	uint8_t progif = pci_read_config(dev, PCIR_PROGIF, 1);
	int is_nvme;

	/*
	 * Generally speaking, if the PCI device does not identify as
	 * NVMe we skip it.
	 */
	if (class == PCIC_STORAGE && subclass == PCIS_STORAGE_NVM &&
	    progif == PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0) {
		is_nvme = 1;
	} else {
		is_nvme = 0;
	}

	for (ad = &nvme_devices[0]; ad->vendor; ++ad) {
		if (ad->vendor == vendor && ad->product == product)
			return (ad);
	}

	/*
	 * The terminating catch-all entry is the default match if the
	 * PCI device identifies as NVMe.
	 */
	if (is_nvme == 0)
		ad = NULL;
	return (ad);
}

/*
 * Attach functions.  They all eventually fall through to nvme_pci_attach().
 */
static int
nvme_pci_attach(device_t dev)
{
	nvme_softc_t *sc = device_get_softc(dev);
	uint32_t reg;
	int error;
	int msi_enable;
	int msix_enable;

#if 0
	if (pci_read_config(dev, PCIR_COMMAND, 2) & 0x0400) {
		device_printf(dev, "BIOS disabled PCI interrupt, "
				   "re-enabling\n");
		pci_write_config(dev, PCIR_COMMAND,
			pci_read_config(dev, PCIR_COMMAND, 2) & ~0x0400, 2);
	}
#endif

	sc->dev = dev;

	/*
	 * Map the register window
	 */
	sc->rid_regs = PCIR_BAR(0);
	sc->regs = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
					  &sc->rid_regs, RF_ACTIVE);
	if (sc->regs == NULL) {
		device_printf(dev, "unable to map registers\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	sc->iot = rman_get_bustag(sc->regs);
	sc->ioh = rman_get_bushandle(sc->regs);

	/*
	 * NVMe allows the MSI-X table to be mapped to BAR 4/5.
	 * Always try to map BAR4, but it's ok if it fails.  Must
	 * be done prior to allocating our interrupts.
	 */
	sc->rid_bar4 = PCIR_BAR(4);
	sc->bar4 = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
					  &sc->rid_bar4, RF_ACTIVE);

	/*
	 * Map the interrupt, or the initial interrupt, which will be used
	 * for the admin queue.  NVMe chipsets can potentially support a
	 * huge number of MSI-X vectors but we really only need enough for
	 * the available cpus, plus 1.
	 */
	msi_enable = device_getenv_int(dev, "msi.enable", nvme_msi_enable);
	msix_enable = device_getenv_int(dev, "msix.enable", nvme_msix_enable);

	error = 0;
	if (msix_enable) {
		int i;
		int cpu;

		sc->nirqs = pci_msix_count(dev);
		sc->irq_type = PCI_INTR_TYPE_MSIX;
		if (sc->nirqs > ncpus + 1)		/* max we need */
			sc->nirqs = ncpus + 1;

		error = pci_setup_msix(dev);
		cpu = (last_global_cpu + 0) % ncpus;	/* GCC warn */
		for (i = 0; error == 0 && i < sc->nirqs; ++i) {
			cpu = (last_global_cpu + i) % ncpus;
			error = pci_alloc_msix_vector(dev, i,
						      &sc->rid_irq[i], cpu);
			if (error)
				break;
			sc->irq[i] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
							    &sc->rid_irq[i],
							    RF_ACTIVE);
			/*
			 * We want this to overwrite queue 0's cpu vector
			 * when the cpus rotate through later on.
			 */
			if (sc->cputovect[cpu] == 0)
				sc->cputovect[cpu] = i;
		}

		/*
		 * If we did not iterate enough cpus (that is, there weren't
		 * enough irqs for all available cpus) we still need to
		 * finish our sc->cputovect[] mapping.
		 */
		while (error == 0) {
			cpu = (cpu + 1) % ncpus;
			i = (i + 1) % sc->nirqs;
			if (i == 0)
				i = 1;
			if (sc->cputovect[cpu] != 0)
				break;
			sc->cputovect[cpu] = i;
		}
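
		/*
		 * Worked example (an assumed configuration, for
		 * illustration only): with ncpus=4, nirqs=3 and
		 * last_global_cpu=0, the allocation loop above yields
		 * cputovect[] = { 0, 1, 2, 0 } and the wrap-around loop
		 * then rewrites it to { 2, 1, 2, 1 }, so vector 0 stays
		 * dedicated to the admin queue while every cpu maps to a
		 * real I/O vector.
		 */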

		if (error) {
			while (--i >= 0) {
				bus_release_resource(dev, SYS_RES_IRQ,
						     sc->rid_irq[i],
						     sc->irq[i]);
				pci_release_msix_vector(dev, sc->rid_irq[i]);
				sc->irq[i] = NULL;
			}
			/* leave error intact to fall through to the
			 * normal-interrupt case below */
		} else {
			last_global_cpu = (last_global_cpu + sc->nirqs) % ncpus;
			pci_enable_msix(dev);
		}
	}

	/*
	 * If we have to use a normal interrupt we fake the cputovect[] in
	 * order to try to map at least (ncpus) submission queues.  The admin
	 * code will limit the number of completion queues to something
	 * reasonable when nirqs is 1 since the single interrupt polls all
	 * completion queues.
	 *
	 * NOTE: We do NOT want to map a single completion queue (#0), because
	 *	 then an I/O submission and/or completion queue will overlap
	 *	 the admin submission or completion queue, and that can cause
	 *	 havoc when admin commands are submitted that don't return
	 *	 for long periods of time.
	 *
	 * NOTE: Chipsets supporting MSI-X *MIGHT* *NOT* properly support
	 *	 a normal pin-based level interrupt.  For example, the BPX
	 *	 NVMe SSD just leaves the level interrupt stuck on.  Do not
	 *	 disable MSI-X unless you have no choice.
	 */
	if (msix_enable == 0 || error) {
		uint32_t irq_flags;
		int i;

		error = 0;
		sc->nirqs = 1;
		sc->irq_type = pci_alloc_1intr(dev, msi_enable,
					       &sc->rid_irq[0], &irq_flags);
		sc->irq[0] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
						 &sc->rid_irq[0], irq_flags);

		for (i = 0; i < ncpus; ++i)
			sc->cputovect[i] = i + 1;
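		/*
		 * (With ncpus=4, for example, this yields cputovect[] =
		 * { 1, 2, 3, 4 }: every cpu is steered to an I/O queue
		 * and none to admin queue 0.)
		 */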
	}
	if (sc->irq[0] == NULL) {
		device_printf(dev, "unable to map interrupt\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	} else {
		const char *type;
		switch(sc->irq_type) {
		case PCI_INTR_TYPE_MSI:
			type = "MSI";
			break;
		case PCI_INTR_TYPE_MSIX:
			type = "MSIX";
			break;
		default:
			type = "normal-int";
			break;
		}
		device_printf(dev, "mapped %d %s IRQs\n", sc->nirqs, type);
	}

	/*
	 * Make sure the chip is disabled, which will reset all controller
	 * registers except for the admin queue registers.  The device should
	 * already be disabled so this is usually instantaneous.  Use a
	 * fixed 5-second timeout in case it is not.  I'd like my other
	 * reads to occur after the device has been disabled.
	 */
	sc->entimo = hz * 5;
	error = nvme_enable(sc, 0);
	if (error) {
		nvme_pci_detach(dev);
		return (ENXIO);
	}

	/*
	 * Get capabilities and version, and report them.
	 */
	sc->vers = nvme_read(sc, NVME_REG_VERS);
	sc->cap = nvme_read8(sc, NVME_REG_CAP);
	sc->maxqe = NVME_CAP_MQES_GET(sc->cap);
	sc->dstrd4 = NVME_CAP_DSTRD_GET(sc->cap);

	device_printf(dev, "NVME Version %u.%u maxqe=%u caps=%016jx\n",
		      NVME_VERS_MAJOR_GET(sc->vers),
		      NVME_VERS_MINOR_GET(sc->vers),
		      sc->maxqe, sc->cap);

	/*
	 * Enable timeout, 500ms increments.  Convert to ticks.
	 */
	sc->entimo = NVME_CAP_TIMEOUT_GET(sc->cap) * hz / 2; /* in ticks */
	++sc->entimo;		/* fudge */
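
	/*
	 * (CAP.TO counts in 500ms units, so TO * hz / 2 is ticks.  For
	 * example, a controller reporting TO=20 allows 20 * 0.5s =
	 * 10 seconds, i.e. 10 * hz ticks, for enable/disable to complete.)
	 */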

	/*
	 * Validate maxqe.  To cap the amount of memory we reserve for
	 * PRPs we limit maxqe to 256.  Also make sure it is a power of
	 * two.
	 */
	if (sc->maxqe < 2) {
		device_printf(dev,
			      "Attach failed, max queue entries (%d) "
			      "below minimum (2)\n", sc->maxqe);
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	if (sc->maxqe > 256)
		sc->maxqe = 256;
	for (reg = 2; reg <= sc->maxqe; reg <<= 1)
		;
	sc->maxqe = reg >> 1;
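
	/*
	 * (The loop rounds maxqe down to the largest power of two that
	 * does not exceed it; e.g. a controller reporting maxqe=200
	 * ends up with maxqe=128.)
	 */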

	/*
	 * DMA tags
	 *
	 * PRP	- Worst case PRPs needed per queue is MAXPHYS / PAGE_SIZE
	 *	  (typically 64), multiplied by maxqe (typ 256).  Roughly
	 *	  ~128KB per queue.  Align for cache performance.  We actually
	 *	  need one more PRP per queue entry worst-case to handle
	 *	  buffer overlap, but we have an extra one in the command
	 *	  structure so we don't have to calculate that out.
	 *
	 *	  Remember that we intend to allocate potentially many queues,
	 *	  so we don't want to bloat this too much.  A queue depth of
	 *	  256 is plenty.
	 *
	 * CMD - Storage for the submit queue.  maxqe * 64	(~16KB)
	 *
	 * RES - Storage for the completion queue.  maxqe * 16	(~4KB)
	 *
	 * ADM - Storage for admin command DMA data.  Maximum admin command
	 *	 DMA data is 4KB so reserve maxqe * 4KB (~1MB).  There is only
	 *	 one admin queue.
	 *
	 * NOTE: There are no boundary requirements for NVMe, but I specify a
	 *	 4MB boundary anyway because this reduces mass bit-flipping
	 *	 of address bits inside the controller when incrementing
	 *	 DMA addresses.  Why not?  Can't hurt.
	 */
	sc->prp_bytes = sizeof(uint64_t) * (MAXPHYS / PAGE_SIZE) * sc->maxqe;
	sc->cmd_bytes = sizeof(nvme_subq_item_t) * sc->maxqe;
	sc->res_bytes = sizeof(nvme_comq_item_t) * sc->maxqe;
	sc->adm_bytes = NVME_MAX_ADMIN_BUFFER * sc->maxqe;
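
	/*
	 * (Worked sizes, assuming maxqe=256, MAXPHYS=256KB and 4KB pages:
	 * prp_bytes = 8 * 64 * 256 = 128KB, cmd_bytes = 64 * 256 = 16KB,
	 * res_bytes = 16 * 256 = 4KB, adm_bytes = 4KB * 256 = 1MB, matching
	 * the figures in the comment above.)
	 */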

	error = 0;

	error += bus_dma_tag_create(
			NULL,				/* parent tag */
			PAGE_SIZE,			/* alignment */
			4 * 1024 * 1024,		/* boundary */
			BUS_SPACE_MAXADDR,		/* loaddr? */
			BUS_SPACE_MAXADDR,		/* hiaddr */
			sc->prp_bytes,			/* [max]size */
			1,				/* maxsegs */
			sc->prp_bytes,			/* maxsegsz */
			0,				/* flags */
			&sc->prps_tag);			/* return tag */

	error += bus_dma_tag_create(
			NULL,				/* parent tag */
			PAGE_SIZE,			/* alignment */
			4 * 1024 * 1024,		/* boundary */
			BUS_SPACE_MAXADDR,		/* loaddr? */
			BUS_SPACE_MAXADDR,		/* hiaddr */
			sc->cmd_bytes,			/* [max]size */
			1,				/* maxsegs */
			sc->cmd_bytes,			/* maxsegsz */
			0,				/* flags */
			&sc->sque_tag);			/* return tag */

	error += bus_dma_tag_create(
			NULL,				/* parent tag */
			PAGE_SIZE,			/* alignment */
			4 * 1024 * 1024,		/* boundary */
			BUS_SPACE_MAXADDR,		/* loaddr? */
			BUS_SPACE_MAXADDR,		/* hiaddr */
			sc->res_bytes,			/* [max]size */
			1,				/* maxsegs */
			sc->res_bytes,			/* maxsegsz */
			0,				/* flags */
			&sc->cque_tag);			/* return tag */

	error += bus_dma_tag_create(
			NULL,				/* parent tag */
			PAGE_SIZE,			/* alignment */
			4 * 1024 * 1024,		/* boundary */
			BUS_SPACE_MAXADDR,		/* loaddr? */
			BUS_SPACE_MAXADDR,		/* hiaddr */
			sc->adm_bytes,			/* [max]size */
			1,				/* maxsegs */
			sc->adm_bytes,			/* maxsegsz */
			0,				/* flags */
			&sc->adm_tag);			/* return tag */

	if (error) {
		device_printf(dev, "unable to create dma tags\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	}

	/*
	 * Setup the admin queues (qid 0).
	 */
	error = nvme_alloc_subqueue(sc, 0);
	if (error) {
		device_printf(dev, "unable to allocate admin subqueue\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	error = nvme_alloc_comqueue(sc, 0);
	if (error) {
		device_printf(dev, "unable to allocate admin comqueue\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	}

	/*
	 * Initialize the admin queue registers
	 */
	reg = NVME_ATTR_COM_SET(sc->maxqe) | NVME_ATTR_SUB_SET(sc->maxqe);
	nvme_write(sc, NVME_REG_ADM_ATTR, reg);
	nvme_write8(sc, NVME_REG_ADM_SUBADR, (uint64_t)sc->subqueues[0].psubq);
	nvme_write8(sc, NVME_REG_ADM_COMADR, (uint64_t)sc->comqueues[0].pcomq);
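
	/*
	 * (These three writes correspond to the AQA, ASQ and ACQ registers
	 * in the NVMe specification: the admin submission/completion queue
	 * sizes and the physical base addresses of the two queues.)
	 */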

	/*
	 * qemu appears to require bus mastering to be enabled here;
	 * real hardware does not appear to care.
	 */
	pci_enable_busmaster(dev);

	/*
	 * Other configuration registers
	 */
	reg = NVME_CONFIG_IOSUB_ES_SET(6) |		/* 64 byte sub entry */
	      NVME_CONFIG_IOCOM_ES_SET(4) |		/* 16 byte com entry */
	      NVME_CONFIG_MEMPG_SET(PAGE_SHIFT) |	/* 4K pages */
	      NVME_CONFIG_CSS_NVM;			/* NVME command set */
	nvme_write(sc, NVME_REG_CONFIG, reg);
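
	/*
	 * (The entry-size fields are log2 encodings: 2^6 = 64-byte
	 * submission entries and 2^4 = 16-byte completion entries, the
	 * fixed entry sizes of the NVM command set.)
	 */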

	reg = nvme_read(sc, NVME_REG_MEMSIZE);

	/*
	 * Enable the chip for operation
	 */
	error = nvme_enable(sc, 1);
	if (error) {
		nvme_enable(sc, 0);
		nvme_pci_detach(dev);
		return (ENXIO);
	}

	/*
	 * Start the admin thread.  This will also setup the admin queue
	 * interrupt.
	 */
	error = nvme_start_admin_thread(sc);
	if (error) {
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
	sc->flags |= NVME_SC_ATTACHED;
	TAILQ_INSERT_TAIL(&nvme_sc_list, sc, entry);
	lockmgr(&nvme_master_lock, LK_RELEASE);

	return (0);
}

/*
 * Device unload / detachment
 */
static int
nvme_pci_detach(device_t dev)
{
	nvme_softc_t *sc = device_get_softc(dev);
	int i;

	/*
	 * Stop the admin thread
	 */
	nvme_stop_admin_thread(sc);

	/*
	 * Issue a normal shutdown and wait for completion
	 */
	nvme_issue_shutdown(sc, 0);

	/*
	 * Disable the chip
	 */
	nvme_enable(sc, 0);

	/*
	 * Free admin memory
	 */
	nvme_free_subqueue(sc, 0);
	nvme_free_comqueue(sc, 0);

	/*
	 * Release related resources.
	 */
	for (i = 0; i < sc->nirqs; ++i) {
		if (sc->irq[i]) {
			bus_release_resource(dev, SYS_RES_IRQ,
					     sc->rid_irq[i], sc->irq[i]);
			sc->irq[i] = NULL;
			if (sc->irq_type == PCI_INTR_TYPE_MSIX)
				pci_release_msix_vector(dev, sc->rid_irq[i]);
		}
	}
	switch(sc->irq_type) {
	case PCI_INTR_TYPE_MSI:
		pci_release_msi(dev);
		break;
	case PCI_INTR_TYPE_MSIX:
		pci_teardown_msix(dev);
		break;
	default:
		break;
	}

	/*
	 * Release remaining chipset resources
	 */
	if (sc->regs) {
		bus_release_resource(dev, SYS_RES_MEMORY,
				     sc->rid_regs, sc->regs);
		sc->regs = NULL;
	}
	if (sc->bar4) {
		bus_release_resource(dev, SYS_RES_MEMORY,
				     sc->rid_bar4, sc->bar4);
		sc->bar4 = NULL;
	}

	/*
	 * Clean up the DMA tags
	 */
	if (sc->prps_tag) {
		bus_dma_tag_destroy(sc->prps_tag);
		sc->prps_tag = NULL;
	}
	if (sc->sque_tag) {
		bus_dma_tag_destroy(sc->sque_tag);
		sc->sque_tag = NULL;
	}
	if (sc->cque_tag) {
		bus_dma_tag_destroy(sc->cque_tag);
		sc->cque_tag = NULL;
	}
	if (sc->adm_tag) {
		bus_dma_tag_destroy(sc->adm_tag);
		sc->adm_tag = NULL;
	}

	if (sc->flags & NVME_SC_ATTACHED) {
		lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
		sc->flags &= ~NVME_SC_ATTACHED;
		TAILQ_REMOVE(&nvme_sc_list, sc, entry);
		lockmgr(&nvme_master_lock, LK_RELEASE);
	}

	return (0);
}
562