/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/pciio.h>
#include <sys/ioctl.h>

#include <sys/pci.h>

#include <dev/io/iodev.h>
#include <dev/pci/pcireg.h>

#include <machine/iodev.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <assert.h>	/* for the assert()s used below */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/vmm.h>
#include <vmmapi.h>
#include <sys/ppt_dev.h>

#include "config.h"
#include "debug.h"
#include "pci_emul.h"
#include "mem.h"

#define	LEGACY_SUPPORT	1

#define	MSIX_TABLE_COUNT(ctrl)	(((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
#define	MSIX_CAPLEN	12

struct passthru_softc {
	struct pci_devinst *psc_pi;
	struct pcibar psc_bar[PCI_BARMAX + 1];
	struct {
		int		capoff;
		int		msgctrl;
		int		emulated;
	} psc_msi;
	struct {
		int		capoff;
	} psc_msix;
	int pptfd;
	int msi_limit;
	int msix_limit;
};

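/*
 * Length of the MSI capability as a function of its message-control
 * word: 10 bytes for the base structure, plus 4 when the device uses
 * 64-bit message addresses.
 */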
static int
msi_caplen(int msgctrl)
{
	int len;

	len = 10;		/* minimum length of msi capability */

	if (msgctrl & PCIM_MSICTRL_64BIT)
		len += 4;

#if 0
	/*
	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
	 * We'll let the guest manipulate them directly.
	 */
	if (msgctrl & PCIM_MSICTRL_VECTOR)
		len += 10;
#endif

	return (len);
}

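/*
 * Config space accesses are proxied through the host ppt driver using
 * the PPT_CFG_READ and PPT_CFG_WRITE ioctls; a failed read is simply
 * reported as all-zero data.
 */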
static uint32_t
read_config(const struct passthru_softc *sc, long reg, int width)
{
	struct ppt_cfg_io pi;

	pi.pci_off = reg;
	pi.pci_width = width;

	if (ioctl(sc->pptfd, PPT_CFG_READ, &pi) != 0) {
		return (0);
	}
	return (pi.pci_data);
}

static void
write_config(const struct passthru_softc *sc, long reg, int width,
    uint32_t data)
{
	struct ppt_cfg_io pi;

	pi.pci_off = reg;
	pi.pci_width = width;
	pi.pci_data = data;

	(void) ioctl(sc->pptfd, PPT_CFG_WRITE, &pi);
}

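/*
 * Query the host ppt driver (PPT_BAR_QUERY) for the type, physical
 * base, and size of a BAR, translating the kernel PCI_ADDR_* types
 * into the pcibar types used by the emulation.
 */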
static int
passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type,
    uint64_t *base, uint64_t *size)
{
	struct ppt_bar_query pb;

	pb.pbq_baridx = bar;

	if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) {
		return (-1);
	}

	switch (pb.pbq_type) {
	case PCI_ADDR_IO:
		*type = PCIBAR_IO;
		break;
	case PCI_ADDR_MEM32:
		*type = PCIBAR_MEM32;
		break;
	case PCI_ADDR_MEM64:
		*type = PCIBAR_MEM64;
		break;
	default:
		/* errno is meaningless here, and errx() supplies a newline */
		errx(1, "unrecognized BAR type: %u", pb.pbq_type);
		break;
	}

	*base = pb.pbq_base;
	*size = pb.pbq_size;
	return (0);
}

static int
passthru_dev_open(const char *path, int *pptfdp)
{
	int pptfd;

	if ((pptfd = open(path, O_RDWR)) < 0) {
		return (errno);
	}

	/* XXX: verify fd with ioctl? */
	*pptfdp = pptfd;
	return (0);
}

#ifdef LEGACY_SUPPORT
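/*
 * Synthesize an MSI capability for a device that lacks one, splicing
 * it into the emulated config space ahead of the existing capability
 * list (see the XXX note below about its placement).
 */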
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	int capoff, i;
	struct msicap msicap;
	u_char *capdata;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * Copy the msi capability structure in the last 16 bytes of the
	 * config space. This is wrong because it could shadow something
	 * useful to the device.
	 */
	capoff = 256 - roundup(sizeof(msicap), 4);
	capdata = (u_char *)&msicap;
	for (i = 0; i < sizeof(msicap); i++)
		pci_set_cfgdata8(pi, capoff + i, capdata[i]);

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */

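/*
 * The host may cap the number of MSI/MSI-X vectors available to a
 * device (see vm_get_pptdev_limits() in passthru_init()).  Clamp the
 * vector counts advertised by the emulated capabilities so the guest
 * never requests more vectors than the host can provide.
 */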
static void
passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap)
{
	struct pci_devinst *pi = sc->psc_pi;
	int off;

	/* Reduce the number of MSI vectors if higher than OS limit */
	if ((off = sc->psc_msi.capoff) != 0 && sc->msi_limit != -1) {
		int msi_limit, mmc;

		msi_limit =
		    sc->msi_limit > 16 ? PCIM_MSICTRL_MMC_32 :
		    sc->msi_limit > 8 ? PCIM_MSICTRL_MMC_16 :
		    sc->msi_limit > 4 ? PCIM_MSICTRL_MMC_8 :
		    sc->msi_limit > 2 ? PCIM_MSICTRL_MMC_4 :
		    sc->msi_limit > 1 ? PCIM_MSICTRL_MMC_2 :
		    PCIM_MSICTRL_MMC_1;
		mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK;

		if (mmc > msi_limit) {
			sc->psc_msi.msgctrl &= ~PCIM_MSICTRL_MMC_MASK;
			sc->psc_msi.msgctrl |= msi_limit;
			pci_set_cfgdata16(pi, off + 2, sc->psc_msi.msgctrl);
		}
	}

	/* Reduce the number of MSI-X vectors if higher than OS limit */
	if ((off = sc->psc_msix.capoff) != 0 && sc->msix_limit != -1) {
		if (MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) {
			msixcap->msgctrl &= ~PCIM_MSIXCTRL_TABLE_SIZE;
			msixcap->msgctrl |= sc->msix_limit - 1;
			pci_set_cfgdata16(pi, off + 2, msixcap->msgctrl);
		}
	}
}

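/*
 * Walk the physical device's capability list, mirroring the MSI and
 * MSI-X capabilities into the emulated config space and recording
 * their offsets for the access emulation below.
 */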
static int
cfginitmsi(struct passthru_softc *sc)
{
	int i, ptr, capptr, cap, sts, caplen, table_size;
	uint32_t u32;
	struct pci_devinst *pi = sc->psc_pi;
	struct msixcap msixcap;
	uint32_t *msixcap_ptr;

	/*
	 * Parse the capabilities and cache the location of the MSI
	 * and MSI-X capabilities.
	 */
	sts = read_config(sc, PCIR_STATUS, 2);
	if (sts & PCIM_STATUS_CAPPRESENT) {
		ptr = read_config(sc, PCIR_CAP_PTR, 1);
		while (ptr != 0 && ptr != 0xff) {
			cap = read_config(sc, ptr + PCICAP_ID, 1);
			if (cap == PCIY_MSI) {
				/*
				 * Copy the MSI capability into the config
				 * space of the emulated pci device
				 */
				sc->psc_msi.capoff = ptr;
				sc->psc_msi.msgctrl = read_config(sc,
				    ptr + 2, 2);
				sc->psc_msi.emulated = 0;
				caplen = msi_caplen(sc->psc_msi.msgctrl);
				capptr = ptr;
				while (caplen > 0) {
					u32 = read_config(sc, capptr, 4);
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
				}
			} else if (cap == PCIY_MSIX) {
				/*
				 * Copy the MSI-X capability
				 */
				sc->psc_msix.capoff = ptr;
				caplen = MSIX_CAPLEN;
				msixcap_ptr = (uint32_t *)&msixcap;
				capptr = ptr;
				while (caplen > 0) {
					u32 = read_config(sc, capptr, 4);
					*msixcap_ptr = u32;
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
					msixcap_ptr++;
				}
			}
			ptr = read_config(sc, ptr + PCICAP_NEXTPTR, 1);
		}
	}

	passthru_intr_limit(sc, &msixcap);

	if (sc->psc_msix.capoff != 0) {
		pi->pi_msix.pba_bar =
		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.pba_offset =
		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_bar =
		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_offset =
		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);

		/* Allocate the emulated MSI-X table array */
		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
		pi->pi_msix.table = calloc(1, table_size);

		/* Mask all table entries */
		for (i = 0; i < pi->pi_msix.table_count; i++) {
			pi->pi_msix.table[i].vector_control |=
			    PCIM_MSIX_VCTRL_MASK;
		}
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If the passthrough device does not support MSI then craft a
	 * MSI capability for it. We link the new MSI capability at the
	 * head of the list of capabilities.
	 */
	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
		int origptr, msiptr;
		origptr = read_config(sc, PCIR_CAP_PTR, 1);
		msiptr = passthru_add_msicap(pi, 1, origptr);
		sc->psc_msi.capoff = msiptr;
		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
		sc->psc_msi.emulated = 1;
		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
	}
#endif

	/* Make sure one of the capabilities is present */
	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) {
		return (-1);
	} else {
		return (0);
	}
}

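/*
 * Handle a read from the emulated MSI-X table BAR.  Reads landing in
 * the PBA are serviced from the host-mapped PBA page (set up when the
 * PBA shares a page with the table); reads of the table itself come
 * from the in-memory copy maintained by the emulation.
 */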
static uint64_t
passthru_msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *src8;
	uint16_t *src16;
	uint32_t *src32;
	uint64_t *src64;
	uint64_t data;
	size_t entry_offset;
	int index;

	pi = sc->psc_pi;
	if (offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch (size) {
		case 1:
			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src8;
			break;
		case 2:
			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src16;
			break;
		case 4:
			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src32;
			break;
		case 8:
			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src64;
			break;
		default:
			return (-1);
		}
		return (data);
	}

	if (offset < pi->pi_msix.table_offset)
		return (-1);

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return (-1);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	switch (size) {
	case 1:
		src8 = (uint8_t *)((void *)entry + entry_offset);
		data = *src8;
		break;
	case 2:
		src16 = (uint16_t *)((void *)entry + entry_offset);
		data = *src16;
		break;
	case 4:
		src32 = (uint32_t *)((void *)entry + entry_offset);
		data = *src32;
		break;
	case 8:
		src64 = (uint64_t *)((void *)entry + entry_offset);
		data = *src64;
		break;
	default:
		return (-1);
	}

	return (data);
}

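/*
 * Handle a write to the emulated MSI-X table BAR.  Writes to the PBA
 * page go straight to the host mapping; writes to a table entry update
 * the in-memory copy and, when MSI-X is enabled and the entry is (or
 * was just) unmasked, are pushed to the host via vm_setup_pptdev_msix().
 */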
static void
passthru_msix_table_write(struct vmctx *ctx, int vcpu,
    struct passthru_softc *sc, uint64_t offset, int size, uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t vector_control;
	int index;

	pi = sc->psc_pi;
	if (offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch (size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest64 = data;
			break;
		default:
			break;
		}
		return;
	}

	if (offset < pi->pi_msix.table_offset)
		return;

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return;

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((void *)entry + entry_offset);
	*dest32 = data;
	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/* If the entry is masked, don't set it up */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			(void) vm_setup_pptdev_msix(ctx, vcpu, sc->pptfd,
			    index, entry->addr, entry->msg_data,
			    entry->vector_control);
		}
	}
}

static int
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
{
	int idx;
	size_t remaining __unused;
	uint32_t table_size, table_offset;
	uint32_t pba_size, pba_offset;
	vm_paddr_t start __unused;
	struct pci_devinst *pi = sc->psc_pi;

	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);

	/*
	 * If the MSI-X table BAR maps memory intended for
	 * other uses, it is at least assured that the table
	 * either resides in its own page within the region,
	 * or it resides in a page shared with only the PBA.
	 */
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);

	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);

	idx = pi->pi_msix.table_bar;
	start = pi->pi_bar[idx].addr;
	remaining = pi->pi_bar[idx].size;

	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
		pba_offset = pi->pi_msix.pba_offset;
		pba_size = pi->pi_msix.pba_size;
		if (pba_offset >= table_offset + table_size ||
		    table_offset >= pba_offset + pba_size) {
			/*
			 * If the PBA does not share a page with the MSI-X
			 * table, no PBA emulation is required.
			 */
			pi->pi_msix.pba_page = NULL;
			pi->pi_msix.pba_page_offset = 0;
		} else {
			/*
			 * The PBA overlaps with either the first or last
			 * page of the MSI-X table region.  Map the
			 * appropriate page.
			 */
			if (pba_offset <= table_offset)
				pi->pi_msix.pba_page_offset = table_offset;
			else
				pi->pi_msix.pba_page_offset = table_offset +
				    table_size - 4096;
			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
			    PROT_WRITE, MAP_SHARED, sc->pptfd,
			    pi->pi_msix.pba_page_offset);
			if (pi->pi_msix.pba_page == MAP_FAILED) {
				warn("Failed to map PBA page for MSI-X on %d",
				    sc->pptfd);
				return (-1);
			}
		}
	}

	return (0);
}

static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
	struct pci_devinst *pi = sc->psc_pi;
	uint_t i;

	/*
	 * Initialize BAR registers
	 */
	for (i = 0; i <= PCI_BARMAX; i++) {
		enum pcibar_type bartype;
		uint64_t base, size;
		int error;

		if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) {
			continue;
		}

		if (bartype != PCIBAR_IO) {
			if (((base | size) & PAGE_MASK) != 0) {
				warnx("passthru device %d BAR %d: "
				    "base %#lx or size %#lx not page aligned",
				    sc->pptfd, i, base, size);
				return (-1);
			}
		}

		/* Cache information about the "real" BAR */
		sc->psc_bar[i].type = bartype;
		sc->psc_bar[i].size = size;
		sc->psc_bar[i].addr = base;

		/* Allocate the BAR in the guest I/O or MMIO space */
		error = pci_emul_alloc_bar(pi, i, bartype, size);
		if (error)
			return (-1);

		/* The MSI-X table needs special handling */
		if (i == pci_msix_table_bar(pi)) {
			error = init_msix_table(ctx, sc, base);
			if (error)
				return (-1);
		}

		/*
		 * 64-bit BAR takes up two slots so skip the next one.
		 */
		if (bartype == PCIBAR_MEM64) {
			i++;
			assert(i <= PCI_BARMAX);
			sc->psc_bar[i].type = PCIBAR_MEMHI64;
		}
	}
	return (0);
}

static int
cfginit(struct vmctx *ctx, struct passthru_softc *sc)
{
	struct pci_devinst *pi = sc->psc_pi;

	if (cfginitmsi(sc) != 0) {
		warnx("failed to initialize MSI for PCI %d", sc->pptfd);
		return (-1);
	}

	if (cfginitbar(ctx, sc) != 0) {
		warnx("failed to initialize BARs for PCI %d", sc->pptfd);
		return (-1);
	}

	pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(sc, PCIR_COMMAND, 2));

	return (0);
}

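/*
 * Legacy option handling: a bare "/dev/ppt*" device path supplied on
 * the command line becomes the "path" config node consumed by
 * passthru_init().
 */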
static int
passthru_legacy_config(nvlist_t *nvl, const char *opts)
{
	if (opts == NULL)
		return (0);

	if (strncmp(opts, "/dev/ppt", 8) == 0)
		set_config_value_node(nvl, "path", opts);

	return (0);
}

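/*
 * Instance initialization: open the ppt device, assign it to the VM
 * with vm_assign_pptdev(), and build the emulated config space.  Guest
 * memory must be wired so that addresses the guest programs into the
 * device for DMA remain backed by host memory.
 */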
static int
passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	int error, memflags, pptfd;
	struct passthru_softc *sc;
	const char *path;

	pptfd = -1;
	sc = NULL;
	error = 1;

	memflags = vm_get_memflags(ctx);
	if (!(memflags & VM_MEM_F_WIRED)) {
		warnx("passthru requires guest memory to be wired");
		goto done;
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL || passthru_dev_open(path, &pptfd) != 0) {
		warnx("invalid passthru options");
		goto done;
	}

	if (vm_assign_pptdev(ctx, pptfd) != 0) {
		warnx("PCI device at %d is not using the ppt driver", pptfd);
		goto done;
	}

	sc = calloc(1, sizeof(struct passthru_softc));

	pi->pi_arg = sc;
	sc->psc_pi = pi;
	sc->pptfd = pptfd;

	if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit,
	    &sc->msix_limit)) != 0)
		goto done;

	/* initialize config space */
	if ((error = cfginit(ctx, sc)) != 0)
		goto done;

	error = 0;		/* success */
done:
	if (error) {
		free(sc);
		if (pptfd != -1)
			vm_unassign_pptdev(ctx, pptfd);
	}
	return (error);
}

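/*
 * Range helpers: return nonzero when a config space offset falls
 * within the emulated BAR registers or the MSI/MSI-X capabilities,
 * all of which must be intercepted rather than passed to hardware.
 */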
static int
bar_access(int coff)
{
	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
		return (1);
	else
		return (0);
}

static int
msicap_access(struct passthru_softc *sc, int coff)
{
	int caplen;

	if (sc->psc_msi.capoff == 0)
		return (0);

	caplen = msi_caplen(sc->psc_msi.msgctrl);

	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
		return (1);
	else
		return (0);
}

static int
msixcap_access(struct passthru_softc *sc, int coff)
{
	if (sc->psc_msix.capoff == 0)
		return (0);

	return (coff >= sc->psc_msix.capoff &&
	    coff < sc->psc_msix.capoff + MSIX_CAPLEN);
}

static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int coff, int bytes, uint32_t *rv)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs and the MSI capability are emulated.
	 */
	if (bar_access(coff) || msicap_access(sc, coff))
		return (-1);

	/*
	 * MSI-X is also emulated since a limit on interrupts may be imposed
	 * by the OS, altering the perceived register state.
	 */
	if (msixcap_access(sc, coff))
		return (-1);

#ifdef LEGACY_SUPPORT
	/*
	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
	 * natively.
	 */
	if (sc->psc_msi.emulated) {
		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
			return (-1);
	}
#endif

	/*
	 * Emulate the command register.  If a single read reads both the
	 * command and status registers, read the status register from the
	 * device's config space.  The status register occupies the upper
	 * 16 bits of the dword at PCIR_COMMAND.
	 */
	if (coff == PCIR_COMMAND) {
		if (bytes <= 2)
			return (-1);
		*rv = read_config(sc, PCIR_STATUS, 2) << 16 |
		    pci_get_cfgdata16(pi, PCIR_COMMAND);
		return (0);
	}

	/* Everything else is read from the device's config space. */
	*rv = read_config(sc, coff, bytes);

	return (0);
}

795 
796 static int
797 passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
798     int coff, int bytes, uint32_t val)
799 {
800 	int error, msix_table_entries, i;
801 	struct passthru_softc *sc;
802 	uint16_t cmd_old;
803 
804 	sc = pi->pi_arg;
805 
806 	/*
807 	 * PCI BARs are emulated
808 	 */
809 	if (bar_access(coff))
810 		return (-1);
811 
812 	/*
813 	 * MSI capability is emulated
814 	 */
815 	if (msicap_access(sc, coff)) {
816 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
817 		    PCIY_MSI);
818 		error = vm_setup_pptdev_msi(ctx, vcpu, sc->pptfd,
819 		    pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum);
820 		if (error != 0)
821 			err(1, "vm_setup_pptdev_msi");
822 		return (0);
823 	}
824 
825 	if (msixcap_access(sc, coff)) {
826 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
827 		    PCIY_MSIX);
828 		if (pi->pi_msix.enabled) {
829 			msix_table_entries = pi->pi_msix.table_count;
830 			for (i = 0; i < msix_table_entries; i++) {
831 				error = vm_setup_pptdev_msix(ctx, vcpu,
832 				    sc->pptfd, i,
833 				    pi->pi_msix.table[i].addr,
834 				    pi->pi_msix.table[i].msg_data,
835 				    pi->pi_msix.table[i].vector_control);
836 
837 				if (error)
838 					err(1, "vm_setup_pptdev_msix");
839 			}
840 		} else {
841 			error = vm_disable_pptdev_msix(ctx, sc->pptfd);
842 			if (error)
843 				err(1, "vm_disable_pptdev_msix");
844 		}
845 		return (0);
846 	}
847 
848 #ifdef LEGACY_SUPPORT
849 	/*
850 	 * If this device does not support MSI natively then we cannot let
851 	 * the guest disable legacy interrupts from the device. It is the
852 	 * legacy interrupt that is triggering the virtual MSI to the guest.
853 	 */
854 	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
855 		if (coff == PCIR_COMMAND && bytes == 2)
856 			val &= ~PCIM_CMD_INTxDIS;
857 	}
858 #endif
859 
860 	write_config(sc, coff, bytes, val);
861 	if (coff == PCIR_COMMAND) {
862 		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
863 		if (bytes == 1)
864 			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
865 		else if (bytes == 2)
866 			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
867 		pci_emul_cmd_changed(pi, cmd_old);
868 	}
869 
870 	return (0);
871 }
872 
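/*
 * BAR access handlers.  Accesses to the BAR holding the MSI-X table
 * go through the table emulation; I/O port BARs are forwarded to the
 * physical device via the PPT_BAR_WRITE/PPT_BAR_READ ioctls.  Other
 * MMIO BARs are mapped straight through and do not trap here.
 */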
static void
passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size, uint64_t value)
{
	struct passthru_softc *sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi)) {
		passthru_msix_table_write(ctx, vcpu, sc, offset, size, value);
	} else {
		struct ppt_bar_io pbi;

		assert(pi->pi_bar[baridx].type == PCIBAR_IO);

		pbi.pbi_bar = baridx;
		pbi.pbi_width = size;
		pbi.pbi_off = offset;
		pbi.pbi_data = value;
		(void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi);
	}
}

static uint64_t
passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct passthru_softc *sc = pi->pi_arg;
	uint64_t val;

	if (baridx == pci_msix_table_bar(pi)) {
		val = passthru_msix_table_read(sc, offset, size);
	} else {
		struct ppt_bar_io pbi;

		assert(pi->pi_bar[baridx].type == PCIBAR_IO);

		pbi.pbi_bar = baridx;
		pbi.pbi_width = size;
		pbi.pbi_off = offset;
		if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) == 0) {
			val = pbi.pbi_data;
		} else {
			val = 0;
		}
	}

	return (val);
}

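/*
 * When the guest programs the BAR containing the MSI-X table, map the
 * portions of the BAR before and after the table directly into the
 * guest, leaving only the page(s) holding the table (and any PBA
 * sharing them) to be emulated.
 */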
static void
passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
    int enabled, uint64_t address)
{
	struct passthru_softc *sc;
	size_t remaining;
	uint32_t table_size, table_offset;

	sc = pi->pi_arg;
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
	if (table_offset > 0) {
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
			    table_offset) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
			    table_offset, sc->psc_bar[baridx].addr) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);
	remaining = pi->pi_bar[baridx].size - table_offset - table_size;
	if (remaining > 0) {
		address += table_offset + table_size;
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
			    remaining) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
			    remaining, sc->psc_bar[baridx].addr +
			    table_offset + table_size) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
}

static void
passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
    int enabled, uint64_t address)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;
	if (!enabled) {
		if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
		    sc->psc_bar[baridx].size) != 0)
			warnx("pci_passthru: unmap_pptdev_mmio failed");
	} else {
		if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
		    sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0)
			warnx("pci_passthru: map_pptdev_mmio failed");
	}
}

static void
passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
    int enabled, uint64_t address)
{

	if (pi->pi_bar[baridx].type == PCIBAR_IO)
		return;
	if (baridx == pci_msix_table_bar(pi))
		passthru_msix_addr(ctx, pi, baridx, enabled, address);
	else
		passthru_mmio_addr(ctx, pi, baridx, enabled, address);
}

struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_legacy_config = passthru_legacy_config,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_barwrite	= passthru_write,
	.pe_barread	= passthru_read,
	.pe_baraddr	= passthru_addr,
};
PCI_EMUL_SET(passthru);