xref: /freebsd/usr.sbin/bhyve/pci_passthru.c (revision d6b92ffa)
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/pciio.h>
#include <sys/ioctl.h>

#include <dev/io/iodev.h>
#include <dev/pci/pcireg.h>

#include <machine/iodev.h>

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/vmm.h>
#include <vmmapi.h>
#include "pci_emul.h"
#include "mem.h"

#ifndef _PATH_DEVPCI
#define	_PATH_DEVPCI	"/dev/pci"
#endif

#ifndef	_PATH_DEVIO
#define	_PATH_DEVIO	"/dev/io"
#endif

#ifndef _PATH_MEM
#define	_PATH_MEM	"/dev/mem"
#endif

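/*
 * LEGACY_SUPPORT enables synthesizing an MSI capability for passthru
 * devices that only support INTx, so the guest can still be offered an
 * MSI-style interface (see passthru_add_msicap() below).
 */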
#define	LEGACY_SUPPORT	1

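/* The MSI-X message control field encodes the table size as N - 1. */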
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
#define MSIX_CAPLEN 12

static int pcifd = -1;
static int iofd = -1;
static int memfd = -1;

struct passthru_softc {
	struct pci_devinst *psc_pi;
	struct pcibar psc_bar[PCI_BARMAX + 1];
	struct {
		int		capoff;
		int		msgctrl;
		int		emulated;
	} psc_msi;
	struct {
		int		capoff;
	} psc_msix;
	struct pcisel psc_sel;
};

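/*
 * Return the size in bytes of the device's MSI capability: 10 bytes for
 * the base 32-bit capability, plus 4 if the device advertises 64-bit
 * message addresses.
 */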
static int
msi_caplen(int msgctrl)
{
	int len;

	len = 10;		/* minimum length of MSI capability */

	if (msgctrl & PCIM_MSICTRL_64BIT)
		len += 4;

#if 0
	/*
	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
	 * We'll let the guest manipulate them directly.
	 */
	if (msgctrl & PCIM_MSICTRL_VECTOR)
		len += 10;
#endif

	return (len);
}

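/*
 * Read and write the physical device's config space on behalf of the
 * emulation, using the PCIOCREAD/PCIOCWRITE ioctls on /dev/pci.
 */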
static uint32_t
read_config(const struct pcisel *sel, long reg, int width)
{
	struct pci_io pi;

	bzero(&pi, sizeof(pi));
	pi.pi_sel = *sel;
	pi.pi_reg = reg;
	pi.pi_width = width;

	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
		return (0);				/* XXX */
	else
		return (pi.pi_data);
}

static void
write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
{
	struct pci_io pi;

	bzero(&pi, sizeof(pi));
	pi.pi_sel = *sel;
	pi.pi_reg = reg;
	pi.pi_width = width;
	pi.pi_data = data;

	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
}

#ifdef LEGACY_SUPPORT
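/*
 * Synthesize an MSI capability for a device that lacks one, linking it
 * into the emulated capability list via 'nextptr'.
 */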
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	int capoff, i;
	struct msicap msicap;
	u_char *capdata;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * Copy the MSI capability structure into the last 16 bytes of the
	 * config space. This is wrong because it could shadow something
	 * useful to the device.
	 */
	capoff = 256 - roundup(sizeof(msicap), 4);
	capdata = (u_char *)&msicap;
	for (i = 0; i < sizeof(msicap); i++)
		pci_set_cfgdata8(pi, capoff + i, capdata[i]);

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */

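/*
 * Walk the physical device's capability list, mirror its MSI and MSI-X
 * capabilities into the emulated config space, and set up the emulated
 * MSI-X table. Fails if neither capability ends up present.
 */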
static int
cfginitmsi(struct passthru_softc *sc)
{
	int i, ptr, capptr, cap, sts, caplen, table_size;
	uint32_t u32;
	struct pcisel sel;
	struct pci_devinst *pi;
	struct msixcap msixcap;
	uint32_t *msixcap_ptr;

	pi = sc->psc_pi;
	sel = sc->psc_sel;

	/*
	 * Parse the capabilities and cache the location of the MSI
	 * and MSI-X capabilities.
	 */
	sts = read_config(&sel, PCIR_STATUS, 2);
	if (sts & PCIM_STATUS_CAPPRESENT) {
		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
		while (ptr != 0 && ptr != 0xff) {
			cap = read_config(&sel, ptr + PCICAP_ID, 1);
			if (cap == PCIY_MSI) {
				/*
				 * Copy the MSI capability into the config
				 * space of the emulated PCI device.
				 */
				sc->psc_msi.capoff = ptr;
				sc->psc_msi.msgctrl = read_config(&sel,
								  ptr + 2, 2);
				sc->psc_msi.emulated = 0;
				caplen = msi_caplen(sc->psc_msi.msgctrl);
				capptr = ptr;
				while (caplen > 0) {
					u32 = read_config(&sel, capptr, 4);
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
				}
			} else if (cap == PCIY_MSIX) {
				/*
				 * Copy the MSI-X capability, keeping a local
				 * copy for parsing the table and PBA
				 * locations below.
				 */
				sc->psc_msix.capoff = ptr;
				caplen = 12;
				msixcap_ptr = (uint32_t *)&msixcap;
				capptr = ptr;
				while (caplen > 0) {
					u32 = read_config(&sel, capptr, 4);
					*msixcap_ptr = u32;
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
					msixcap_ptr++;
				}
			}
			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
		}
	}

	if (sc->psc_msix.capoff != 0) {
		pi->pi_msix.pba_bar =
		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.pba_offset =
		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_bar =
		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_offset =
		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);

		/* Allocate the emulated MSI-X table array */
		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
		pi->pi_msix.table = calloc(1, table_size);

		/* Mask all table entries */
		for (i = 0; i < pi->pi_msix.table_count; i++) {
			pi->pi_msix.table[i].vector_control |=
						PCIM_MSIX_VCTRL_MASK;
		}
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If the passthrough device does not support MSI then craft an
	 * MSI capability for it. We link the new MSI capability at the
	 * head of the list of capabilities.
	 */
	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
		int origptr, msiptr;
		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
		msiptr = passthru_add_msicap(pi, 1, origptr);
		sc->psc_msi.capoff = msiptr;
		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
		sc->psc_msi.emulated = 1;
		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
	}
#endif

	/* Make sure one of the capabilities is present */
	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
		return (-1);
	else
		return (0);
}

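/*
 * Handle a read from the BAR page(s) containing the MSI-X table. Reads
 * that fall within the PBA are serviced from the physical page mapped
 * in init_msix_table(); reads of table entries are serviced from the
 * emulated copy.
 */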
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *src8;
	uint16_t *src16;
	uint32_t *src32;
	uint64_t *src64;
	uint64_t data;
	size_t entry_offset;
	int index;

	pi = sc->psc_pi;
	if (offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch (size) {
		case 1:
			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src8;
			break;
		case 2:
			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src16;
			break;
		case 4:
			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src32;
			break;
		case 8:
			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src64;
			break;
		default:
			return (-1);
		}
		return (data);
	}

	if (offset < pi->pi_msix.table_offset)
		return (-1);

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return (-1);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	switch (size) {
	case 1:
		src8 = (uint8_t *)entry + entry_offset;
		data = *src8;
		break;
	case 2:
		src16 = (uint16_t *)((uint8_t *)entry + entry_offset);
		data = *src16;
		break;
	case 4:
		src32 = (uint32_t *)((uint8_t *)entry + entry_offset);
		data = *src32;
		break;
	case 8:
		src64 = (uint64_t *)((uint8_t *)entry + entry_offset);
		data = *src64;
		break;
	default:
		return (-1);
	}

	return (data);
}

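/*
 * Handle a write to the emulated MSI-X table, updating the emulated
 * entry and reprogramming the interrupt through vm_setup_pptdev_msix()
 * when the entry is (or was) unmasked.
 */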
static void
msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
		 uint64_t offset, int size, uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t vector_control;
	int index;

	pi = sc->psc_pi;
	if (offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch (size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest64 = data;
			break;
		default:
			break;
		}
		return;
	}

	if (offset < pi->pi_msix.table_offset)
		return;

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return;

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((uint8_t *)entry + entry_offset);
	*dest32 = data;
	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/* If the entry is masked, don't set it up */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			(void)vm_setup_pptdev_msix(ctx, vcpu,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, index, entry->addr,
			    entry->msg_data, entry->vector_control);
		}
	}
}

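/*
 * Map the BAR containing the MSI-X table into the guest, direct-mapping
 * everything except the page(s) holding the table itself so that table
 * accesses trap to the emulation. If the PBA shares a page with the
 * table, that page is mapped from /dev/mem so PBA accesses can be
 * forwarded to the hardware.
 */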
static int
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
{
	int b, s, f;
	int error, idx;
	size_t len, remaining;
	uint32_t table_size, table_offset;
	uint32_t pba_size, pba_offset;
	vm_paddr_t start;
	struct pci_devinst *pi = sc->psc_pi;

	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);

	b = sc->psc_sel.pc_bus;
	s = sc->psc_sel.pc_dev;
	f = sc->psc_sel.pc_func;

	/*
	 * If the MSI-X table BAR maps memory intended for
	 * other uses, it is at least assured that the table
	 * either resides in its own page within the region,
	 * or it resides in a page shared with only the PBA.
	 */
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);

	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);

	idx = pi->pi_msix.table_bar;
	start = pi->pi_bar[idx].addr;
	remaining = pi->pi_bar[idx].size;

	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
		pba_offset = pi->pi_msix.pba_offset;
		pba_size = pi->pi_msix.pba_size;
		if (pba_offset >= table_offset + table_size ||
		    table_offset >= pba_offset + pba_size) {
			/*
			 * If the PBA does not share a page with the MSI-X
			 * table, no PBA emulation is required.
			 */
			pi->pi_msix.pba_page = NULL;
			pi->pi_msix.pba_page_offset = 0;
		} else {
			/*
			 * The PBA overlaps with either the first or last
			 * page of the MSI-X table region.  Map the
			 * appropriate page.
			 */
			if (pba_offset <= table_offset)
				pi->pi_msix.pba_page_offset = table_offset;
			else
				pi->pi_msix.pba_page_offset = table_offset +
				    table_size - 4096;
			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
			    PROT_WRITE, MAP_SHARED, memfd, start +
			    pi->pi_msix.pba_page_offset);
			if (pi->pi_msix.pba_page == MAP_FAILED) {
				warn("Failed to map PBA page for MSI-X on "
				    "%d/%d/%d", b, s, f);
				return (-1);
			}
		}
	}

	/* Map everything before the MSI-X table */
	if (table_offset > 0) {
		len = table_offset;
		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
		if (error)
			return (error);

		base += len;
		start += len;
		remaining -= len;
	}

	/* Skip the MSI-X table */
	base += table_size;
	start += table_size;
	remaining -= table_size;

	/* Map everything beyond the end of the MSI-X table */
	if (remaining > 0) {
		len = remaining;
		error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
		if (error)
			return (error);
	}

	return (0);
}

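/*
 * Probe the physical device's BARs with PCIOCGETBAR, allocate matching
 * BARs in the guest's address space, and direct-map MMIO BARs into the
 * guest (except the one containing the MSI-X table).
 */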
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
	int i, error;
	struct pci_devinst *pi;
	struct pci_bar_io bar;
	enum pcibar_type bartype;
	uint64_t base, size;

	pi = sc->psc_pi;

	/*
	 * Initialize BAR registers
	 */
	for (i = 0; i <= PCI_BARMAX; i++) {
		bzero(&bar, sizeof(bar));
		bar.pbi_sel = sc->psc_sel;
		bar.pbi_reg = PCIR_BAR(i);

		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
			continue;

		if (PCI_BAR_IO(bar.pbi_base)) {
			bartype = PCIBAR_IO;
			base = bar.pbi_base & PCIM_BAR_IO_BASE;
		} else {
			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
			case PCIM_BAR_MEM_64:
				bartype = PCIBAR_MEM64;
				break;
			default:
				bartype = PCIBAR_MEM32;
				break;
			}
			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
		}
		size = bar.pbi_length;

		if (bartype != PCIBAR_IO) {
			if (((base | size) & PAGE_MASK) != 0) {
				warnx("passthru device %d/%d/%d BAR %d: "
				    "base %#lx or size %#lx not page aligned",
				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
				    sc->psc_sel.pc_func, i, base, size);
				return (-1);
			}
		}

		/* Cache information about the "real" BAR */
		sc->psc_bar[i].type = bartype;
		sc->psc_bar[i].size = size;
		sc->psc_bar[i].addr = base;

		/* Allocate the BAR in the guest I/O or MMIO space */
		error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
		if (error)
			return (-1);

		/* The MSI-X table needs special handling */
		if (i == pci_msix_table_bar(pi)) {
			error = init_msix_table(ctx, sc, base);
			if (error)
				return (-1);
		} else if (bartype != PCIBAR_IO) {
			/* Map the physical BAR in the guest MMIO space */
			error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
				sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
				pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
			if (error)
				return (-1);
		}

		/*
		 * 64-bit BAR takes up two slots so skip the next one.
		 */
		if (bartype == PCIBAR_MEM64) {
			i++;
			assert(i <= PCI_BARMAX);
			sc->psc_bar[i].type = PCIBAR_MEMHI64;
		}
	}
	return (0);
}

static int
cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
{
	int error;
	struct passthru_softc *sc;

	error = 1;
	sc = pi->pi_arg;

	bzero(&sc->psc_sel, sizeof(struct pcisel));
	sc->psc_sel.pc_bus = bus;
	sc->psc_sel.pc_dev = slot;
	sc->psc_sel.pc_func = func;

	if (cfginitmsi(sc) != 0) {
		warnx("failed to initialize MSI for PCI %d/%d/%d",
		    bus, slot, func);
		goto done;
	}

	if (cfginitbar(ctx, sc) != 0) {
		warnx("failed to initialize BARs for PCI %d/%d/%d",
		    bus, slot, func);
		goto done;
	}

	error = 0;				/* success */
done:
	return (error);
}

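/*
 * Device model entry point. 'opts' names the physical function to pass
 * through as "<bus>/<slot>/<func>"; the device must already be attached
 * to the ppt(4) driver and guest memory must be wired (bhyve -S), e.g.:
 *
 *	bhyve -S -s 7,passthru,4/0/0 ...
 *
 * (slot 7 and 4/0/0 above are illustrative values.)
 */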
static int
passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	int bus, slot, func, error, memflags;
	struct passthru_softc *sc;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR };
	cap_ioctl_t io_ioctls[] = { IODEV_PIO };
#endif

	sc = NULL;
	error = 1;
	bus = slot = func = -1;

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE);
#endif

	memflags = vm_get_memflags(ctx);
	if (!(memflags & VM_MEM_F_WIRED)) {
		warnx("passthru requires guest memory to be wired");
		goto done;
	}

	if (pcifd < 0) {
		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
		if (pcifd < 0) {
			warn("failed to open %s", _PATH_DEVPCI);
			goto done;
		}
	}

#ifndef WITHOUT_CAPSICUM
	if (cap_rights_limit(pcifd, &rights) == -1 && errno != ENOSYS)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
	if (cap_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1 &&
	    errno != ENOSYS)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (iofd < 0) {
		iofd = open(_PATH_DEVIO, O_RDWR, 0);
		if (iofd < 0) {
			warn("failed to open %s", _PATH_DEVIO);
			goto done;
		}
	}

#ifndef WITHOUT_CAPSICUM
	if (cap_rights_limit(iofd, &rights) == -1 && errno != ENOSYS)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
	if (cap_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1 &&
	    errno != ENOSYS)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (memfd < 0) {
		memfd = open(_PATH_MEM, O_RDWR, 0);
		if (memfd < 0) {
			warn("failed to open %s", _PATH_MEM);
			goto done;
		}
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_clear(&rights, CAP_IOCTL);
	cap_rights_set(&rights, CAP_MMAP_RW);
	if (cap_rights_limit(memfd, &rights) == -1 && errno != ENOSYS)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (opts == NULL ||
	    sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
		warnx("invalid passthru options");
		goto done;
	}

	if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
		warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
		    bus, slot, func);
		goto done;
	}

	sc = calloc(1, sizeof(struct passthru_softc));

	pi->pi_arg = sc;
	sc->psc_pi = pi;

	/* initialize config space */
	if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
		goto done;

	error = 0;		/* success */
done:
	if (error) {
		free(sc);
		/* Don't unassign with indeterminate values on early errors */
		if (bus != -1)
			vm_unassign_pptdev(ctx, bus, slot, func);
	}
	return (error);
}

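/*
 * Predicates for config-space offsets that must be emulated rather than
 * passed straight through to the hardware.
 */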
static int
bar_access(int coff)
{
	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
		return (1);
	else
		return (0);
}

static int
msicap_access(struct passthru_softc *sc, int coff)
{
	int caplen;

	if (sc->psc_msi.capoff == 0)
		return (0);

	caplen = msi_caplen(sc->psc_msi.msgctrl);

	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
		return (1);
	else
		return (0);
}

static int
msixcap_access(struct passthru_softc *sc, int coff)
{
	if (sc->psc_msix.capoff == 0)
		return (0);

	return (coff >= sc->psc_msix.capoff &&
	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
}

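/*
 * Config-space read handler. Returning -1 tells the generic PCI
 * emulation to satisfy the read from the emulated config space;
 * returning 0 means *rv holds the value read from the hardware.
 */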
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		 int coff, int bytes, uint32_t *rv)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs and the MSI capability are emulated.
	 */
	if (bar_access(coff) || msicap_access(sc, coff))
		return (-1);

#ifdef LEGACY_SUPPORT
	/*
	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
	 * natively.
	 */
	if (sc->psc_msi.emulated) {
		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
			return (-1);
	}
#endif

	/* Everything else is read straight from the device's config space */
	*rv = read_config(&sc->psc_sel, coff, bytes);

	return (0);
}

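/*
 * Config-space write handler. Writes to the MSI/MSI-X capabilities are
 * applied to the emulated capability and then pushed to the hardware
 * through the vm_setup_pptdev_msi(x) calls; everything else is written
 * straight through.
 */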
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		  int coff, int bytes, uint32_t val)
{
	int error, msix_table_entries, i;
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs are emulated
	 */
	if (bar_access(coff))
		return (-1);

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);

		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
			pi->pi_msi.addr, pi->pi_msi.msg_data,
			pi->pi_msi.maxmsgnum);
		if (error != 0)
			err(1, "vm_setup_pptdev_msi");
		return (0);
	}

	if (msixcap_access(sc, coff)) {
		msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
		if (pi->pi_msix.enabled) {
			msix_table_entries = pi->pi_msix.table_count;
			for (i = 0; i < msix_table_entries; i++) {
				error = vm_setup_pptdev_msix(ctx, vcpu,
				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
				    sc->psc_sel.pc_func, i,
				    pi->pi_msix.table[i].addr,
				    pi->pi_msix.table[i].msg_data,
				    pi->pi_msix.table[i].vector_control);

				if (error)
					err(1, "vm_setup_pptdev_msix");
			}
		}
		return (0);
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If this device does not support MSI natively then we cannot let
	 * the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that is triggering the virtual MSI to the guest.
	 */
	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
		if (coff == PCIR_COMMAND && bytes == 2)
			val &= ~PCIM_CMD_INTxDIS;
	}
#endif

	write_config(&sc->psc_sel, coff, bytes, val);

	return (0);
}

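/*
 * BAR access handlers. MSI-X table accesses are emulated; I/O port
 * BARs are forwarded to the physical device with the IODEV_PIO ioctl
 * on /dev/io. Direct-mapped MMIO BARs never trap here.
 */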
static void
passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
	       uint64_t offset, int size, uint64_t value)
{
	struct passthru_softc *sc;
	struct iodev_pio_req pio;

	sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi)) {
		msix_table_write(ctx, vcpu, sc, offset, size, value);
	} else {
		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
		bzero(&pio, sizeof(struct iodev_pio_req));
		pio.access = IODEV_PIO_WRITE;
		pio.port = sc->psc_bar[baridx].addr + offset;
		pio.width = size;
		pio.val = value;

		(void)ioctl(iofd, IODEV_PIO, &pio);
	}
}

static uint64_t
passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
	      uint64_t offset, int size)
{
	struct passthru_softc *sc;
	struct iodev_pio_req pio;
	uint64_t val;

	sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi)) {
		val = msix_table_read(sc, offset, size);
	} else {
		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
		bzero(&pio, sizeof(struct iodev_pio_req));
		pio.access = IODEV_PIO_READ;
		pio.port = sc->psc_bar[baridx].addr + offset;
		pio.width = size;
		pio.val = 0;

		(void)ioctl(iofd, IODEV_PIO, &pio);

		val = pio.val;
	}

	return (val);
}

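/*
 * Register the device model with the PCI emulation framework; the
 * PCI_EMUL_SET linker set is scanned when bhyve parses "-s" options.
 */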
struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_barwrite	= passthru_write,
	.pe_barread	= passthru_read,
};
PCI_EMUL_SET(passthru);
933