xref: /illumos-gate/usr/src/uts/intel/io/vmm/intel/vtd.c (revision 02b17e23)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 
39 #include <dev/pci/pcireg.h>
40 
41 #include <machine/vmparam.h>
42 #include <sys/vmm_vm.h>
43 
44 #include <contrib/dev/acpica/include/acpi.h>
45 
46 #include <sys/sunndi.h>
47 
48 #include "io/iommu.h"
49 
50 /*
51  * Documented in the "Intel Virtualization Technology for Directed I/O",
52  * Architecture Spec, September 2008.
53  */
54 
/*
 * A DRHD with the INCLUDE_PCI_ALL flag set covers every PCI device on
 * its segment that is not claimed by another unit's device scope.
 */
#define	VTD_DRHD_INCLUDE_PCI_ALL(Flags)  (((Flags) >> 0) & 0x1)

/* Section 10.4 "Register Descriptions" */
/*
 * Architectural (fixed-offset) remapping hardware registers.  Instances
 * of this struct overlay the unit's mapped register space, hence the
 * volatile qualifiers on every field.
 */
struct vtdmap {
	volatile uint32_t	version;	/* 0x00: version */
	volatile uint32_t	res0;		/* 0x04: reserved */
	volatile uint64_t	cap;		/* 0x08: capability */
	volatile uint64_t	ext_cap;	/* 0x10: extended capability */
	volatile uint32_t	gcr;		/* 0x18: global command */
	volatile uint32_t	gsr;		/* 0x1c: global status */
	volatile uint64_t	rta;		/* 0x20: root table address */
	volatile uint64_t	ccr;		/* 0x28: context command */
};

/* Capability register field extractors (section 10.4.2). */
#define	VTD_CAP_SAGAW(cap)	(((cap) >> 8) & 0x1F)
#define	VTD_CAP_ND(cap)		((cap) & 0x7)
#define	VTD_CAP_CM(cap)		(((cap) >> 7) & 0x1)
#define	VTD_CAP_SPS(cap)	(((cap) >> 34) & 0xF)
#define	VTD_CAP_RWBF(cap)	(((cap) >> 4) & 0x1)

/* Extended capability register field extractors (section 10.4.3). */
#define	VTD_ECAP_DI(ecap)	(((ecap) >> 2) & 0x1)
#define	VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
#define	VTD_ECAP_IRO(ecap)	(((ecap) >> 8) & 0x3FF)

/* Global command register bits. */
#define	VTD_GCR_WBF		(1 << 27)
#define	VTD_GCR_SRTP		(1 << 30)
#define	VTD_GCR_TE		(1U << 31)

/* Global status register bits, mirroring the commands above. */
#define	VTD_GSR_WBFS		(1 << 27)
#define	VTD_GSR_RTPS		(1 << 30)
#define	VTD_GSR_TES		(1U << 31)

#define	VTD_CCR_ICC		(1UL << 63)	/* invalidate context cache */
#define	VTD_CCR_CIRG_GLOBAL	(1UL << 61)	/* global invalidation */

#define	VTD_IIR_IVT		(1UL << 63)	/* invalidation IOTLB */
#define	VTD_IIR_IIRG_GLOBAL	(1ULL << 60)	/* global IOTLB invalidation */
#define	VTD_IIR_IIRG_DOMAIN	(2ULL << 60)	/* domain IOTLB invalidation */
#define	VTD_IIR_IIRG_PAGE	(3ULL << 60)	/* page IOTLB invalidation */
#define	VTD_IIR_DRAIN_READS	(1ULL << 49)	/* drain pending DMA reads */
#define	VTD_IIR_DRAIN_WRITES	(1ULL << 48)	/* drain pending DMA writes */
#define	VTD_IIR_DOMAIN_P	32

/* Root-table and context-table entry bits. */
#define	VTD_ROOT_PRESENT	0x1
#define	VTD_CTX_PRESENT		0x1
#define	VTD_CTX_TT_ALL		(1UL << 2)

/* Second-level page table entry bits. */
#define	VTD_PTE_RD		(1UL << 0)
#define	VTD_PTE_WR		(1UL << 1)
#define	VTD_PTE_SUPERPAGE	(1UL << 7)
#define	VTD_PTE_ADDR_M		(0x000FFFFFFFFFF000UL)

/* Context-table index for a requester id; each entry is two uint64_ts. */
#define	VTD_RID2IDX(rid)	(((rid) & 0xff) * 2)
108 
/* Software state for one DMA remapping domain (one VM's address space). */
struct domain {
	uint64_t	*ptp;		/* first level page table page */
	int		pt_levels;	/* number of page table levels */
	int		addrwidth;	/* 'AW' field in context entry */
	int		spsmask;	/* supported super page sizes */
	uint_t		id;		/* domain id */
	vm_paddr_t	maxaddr;	/* highest address to be mapped */
	SLIST_ENTRY(domain) next;
};

/* All currently-existing domains; consulted by domain_id() allocation. */
static SLIST_HEAD(, domain) domhead;

#define	DRHD_MAX_UNITS	8
/* Per remapping-unit state, filled by vtd_init(); valid indices are
 * [0, drhd_num). */
static ACPI_DMAR_HARDWARE_UNIT	*drhds[DRHD_MAX_UNITS];
static int			drhd_num;
static struct vtdmap		*vtdmaps[DRHD_MAX_UNITS];
static int			max_domains;	/* minimum across all units */
typedef int			(*drhd_ident_func_t)(void);
#ifndef __FreeBSD__
/* devinfo nodes used to map/unmap each unit's registers (illumos only). */
static dev_info_t	*vtddips[DRHD_MAX_UNITS];
#endif

/*
 * Hardware-walked tables: one root table whose 256 entries each point to
 * a per-PCI-bus context table.  Both must be 4KB aligned.
 */
static uint64_t root_table[PAGE_SIZE / sizeof (uint64_t)] __aligned(4096);
static uint64_t ctx_tables[256][PAGE_SIZE / sizeof (uint64_t)] __aligned(4096);

static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
135 
136 static int
137 vtd_max_domains(struct vtdmap *vtdmap)
138 {
139 	int nd;
140 
141 	nd = VTD_CAP_ND(vtdmap->cap);
142 
143 	switch (nd) {
144 	case 0:
145 		return (16);
146 	case 1:
147 		return (64);
148 	case 2:
149 		return (256);
150 	case 3:
151 		return (1024);
152 	case 4:
153 		return (4 * 1024);
154 	case 5:
155 		return (16 * 1024);
156 	case 6:
157 		return (64 * 1024);
158 	default:
159 		panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
160 	}
161 }
162 
163 static uint_t
164 domain_id(void)
165 {
166 	uint_t id;
167 	struct domain *dom;
168 
169 	/* Skip domain id 0 - it is reserved when Caching Mode field is set */
170 	for (id = 1; id < max_domains; id++) {
171 		SLIST_FOREACH(dom, &domhead, next) {
172 			if (dom->id == id)
173 				break;
174 		}
175 		if (dom == NULL)
176 			break;		/* found it */
177 	}
178 
179 	if (id >= max_domains)
180 		panic("domain ids exhausted");
181 
182 	return (id);
183 }
184 
/*
 * Find the remapping unit whose device scope covers the PCI device with
 * requester id 'rid'.  A unit flagged INCLUDE_PCI_ALL acts as the
 * catch-all for its segment (and per spec is enumerated last), so
 * reaching it during the scan claims the device.  Returns the unit's
 * register mapping, or NULL when no unit claims the device.
 */
static struct vtdmap *
vtd_device_scope(uint16_t rid)
{
	int i, remaining, pathrem;
	char *end, *pathend;
	struct vtdmap *vtdmap;
	ACPI_DMAR_HARDWARE_UNIT *drhd;
	ACPI_DMAR_DEVICE_SCOPE *device_scope;
	ACPI_DMAR_PCI_PATH *path;

	for (i = 0; i < drhd_num; i++) {
		drhd = drhds[i];

		if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) {
			/*
			 * From Intel VT-d arch spec, version 3.0:
			 * If a DRHD structure with INCLUDE_PCI_ALL flag Set is
			 * reported for a Segment, it must be enumerated by BIOS
			 * after all other DRHD structures for the same Segment.
			 */
			vtdmap = vtdmaps[i];
			return (vtdmap);
		}

		/*
		 * Walk the variable-length device scope entries that
		 * follow the fixed DRHD header.
		 * NOTE(review): 'remaining' is signed while the sizeof
		 * comparand is unsigned, so the comparison promotes to
		 * unsigned; a malformed entry whose Length exceeds
		 * 'remaining' would drive it negative without ending the
		 * loop — assumes firmware tables are well-formed; verify.
		 */
		end = (char *)drhd + drhd->Header.Length;
		remaining = drhd->Header.Length -
		    sizeof (ACPI_DMAR_HARDWARE_UNIT);
		while (remaining > sizeof (ACPI_DMAR_DEVICE_SCOPE)) {
			device_scope =
			    (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining);
			remaining -= device_scope->Length;

			switch (device_scope->EntryType) {
				/* 0x01 and 0x02 are PCI device entries */
				case 0x01:
				case 0x02:
					break;
				default:
					continue;
			}

			if (PCI_RID2BUS(rid) != device_scope->Bus)
				continue;

			/*
			 * Match the dev/func path elements nested inside
			 * this scope entry against the requester id.
			 */
			pathend = (char *)device_scope + device_scope->Length;
			pathrem = device_scope->Length -
			    sizeof (ACPI_DMAR_DEVICE_SCOPE);
			while (pathrem >= sizeof (ACPI_DMAR_PCI_PATH)) {
				path = (ACPI_DMAR_PCI_PATH *)
				    (pathend - pathrem);
				pathrem -= sizeof (ACPI_DMAR_PCI_PATH);

				if (PCI_RID2SLOT(rid) != path->Device)
					continue;
				if (PCI_RID2FUNC(rid) != path->Function)
					continue;

				vtdmap = vtdmaps[i];
				return (vtdmap);
			}
		}
	}

	/* No matching scope */
	return (NULL);
}
251 
252 static void
253 vtd_wbflush(struct vtdmap *vtdmap)
254 {
255 
256 	if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
257 		invalidate_cache_all();
258 
259 	if (VTD_CAP_RWBF(vtdmap->cap)) {
260 		vtdmap->gcr = VTD_GCR_WBF;
261 		while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
262 			;
263 	}
264 }
265 
266 static void
267 vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
268 {
269 
270 	vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
271 	while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
272 		;
273 }
274 
275 static void
276 vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
277 {
278 	int offset;
279 	volatile uint64_t *iotlb_reg, val;
280 
281 	vtd_wbflush(vtdmap);
282 
283 	offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
284 	iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
285 
286 	*iotlb_reg =  VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
287 	    VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
288 
289 	while (1) {
290 		val = *iotlb_reg;
291 		if ((val & VTD_IIR_IVT) == 0)
292 			break;
293 	}
294 }
295 
296 static void
297 vtd_translation_enable(struct vtdmap *vtdmap)
298 {
299 
300 	vtdmap->gcr = VTD_GCR_TE;
301 	while ((vtdmap->gsr & VTD_GSR_TES) == 0)
302 		;
303 }
304 
305 static void
306 vtd_translation_disable(struct vtdmap *vtdmap)
307 {
308 
309 	vtdmap->gcr = 0;
310 	while ((vtdmap->gsr & VTD_GSR_TES) != 0)
311 		;
312 }
313 
314 static void *
315 vtd_map(dev_info_t *dip)
316 {
317 	caddr_t regs;
318 	ddi_acc_handle_t hdl;
319 	int error;
320 
321 	static ddi_device_acc_attr_t regs_attr = {
322 		DDI_DEVICE_ATTR_V0,
323 		DDI_NEVERSWAP_ACC,
324 		DDI_STRICTORDER_ACC,
325 	};
326 
327 	error = ddi_regs_map_setup(dip, 0, &regs, 0, PAGE_SIZE, &regs_attr,
328 	    &hdl);
329 
330 	if (error != DDI_SUCCESS)
331 		return (NULL);
332 
333 	ddi_set_driver_private(dip, hdl);
334 
335 	return (regs);
336 }
337 
338 static void
339 vtd_unmap(dev_info_t *dip)
340 {
341 	ddi_acc_handle_t hdl = ddi_get_driver_private(dip);
342 
343 	if (hdl != NULL)
344 		ddi_regs_map_free(&hdl);
345 }
346 
347 #ifndef __FreeBSD__
348 /*
349  * This lives in vtd_sol.c for license reasons.
350  */
351 extern dev_info_t *vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *, int);
352 #endif
353 
/*
 * iommu_ops entry point: discover the DMA remapping units described by
 * the ACPI DMAR table, map their registers, and initialize the shared
 * root and context tables.  Returns 0 on success, or ENXIO when no
 * usable remapping hardware is found.
 */
static int
vtd_init(void)
{
	int i, units, remaining, tmp;
	struct vtdmap *vtdmap;
	vm_paddr_t ctx_paddr;
	char *end;
#ifdef __FreeBSD__
	char envname[32];
	unsigned long mapaddr;
#endif
	ACPI_STATUS status;
	ACPI_TABLE_DMAR *dmar;
	ACPI_DMAR_HEADER *hdr;
	ACPI_DMAR_HARDWARE_UNIT *drhd;

#ifdef __FreeBSD__
	/*
	 * Allow the user to override the ACPI DMAR table by specifying the
	 * physical address of each remapping unit.
	 *
	 * The following example specifies two remapping units at
	 * physical addresses 0xfed90000 and 0xfeda0000 respectively.
	 * set vtd.regmap.0.addr=0xfed90000
	 * set vtd.regmap.1.addr=0xfeda0000
	 */
	for (units = 0; units < DRHD_MAX_UNITS; units++) {
		snprintf(envname, sizeof (envname), "vtd.regmap.%d.addr",
		    units);
		if (getenv_ulong(envname, &mapaddr) == 0)
			break;
		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr);
	}

	if (units > 0)
		goto skip_dmar;
#else
	units = 0;
#endif
	/* Search for DMAR table. */
	status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar);
	if (ACPI_FAILURE(status))
		return (ENXIO);

	/* Walk the remapping structures following the fixed DMAR header. */
	end = (char *)dmar + dmar->Header.Length;
	remaining = dmar->Header.Length - sizeof (ACPI_TABLE_DMAR);
	while (remaining > sizeof (ACPI_DMAR_HEADER)) {
		hdr = (ACPI_DMAR_HEADER *)(end - remaining);
		if (hdr->Length > remaining)
			break;
		/*
		 * From Intel VT-d arch spec, version 1.3:
		 * BIOS implementations must report mapping structures
		 * in numerical order, i.e. All remapping structures of
		 * type 0 (DRHD) enumerated before remapping structures of
		 * type 1 (RMRR) and so forth.
		 */
		if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
			break;

		drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
		drhds[units] = drhd;
#ifdef __FreeBSD__
		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
#else
		/* Map this unit's registers through its devinfo node. */
		vtddips[units] = vtd_get_dip(drhd, units);
		vtdmaps[units] = (struct vtdmap *)vtd_map(vtddips[units]);
		if (vtdmaps[units] == NULL)
			goto fail;
#endif
		if (++units >= DRHD_MAX_UNITS)
			break;
		remaining -= hdr->Length;
	}

	if (units <= 0)
		return (ENXIO);

#ifdef __FreeBSD__
skip_dmar:
#endif
	drhd_num = units;

	max_domains = 64 * 1024; /* maximum valid value */
	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];

		/*
		 * Caching Mode units cache 'not present' entries and
		 * reserve domain id 0; that model is not supported here.
		 */
		if (VTD_CAP_CM(vtdmap->cap) != 0)
			panic("vtd_init: invalid caching mode");

		/* take most compatible (minimum) value */
		if ((tmp = vtd_max_domains(vtdmap)) < max_domains)
			max_domains = tmp;
	}

	/*
	 * Set up the root-table to point to the context-entry tables
	 */
	for (i = 0; i < 256; i++) {
		ctx_paddr = vtophys(ctx_tables[i]);
		if (ctx_paddr & PAGE_MASK)
			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);

		/* Root entries are 128 bits; only the low word is used. */
		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
	}

	return (0);

#ifndef __FreeBSD__
fail:
	/*
	 * The inclusive bound also visits index 'units', whose vtd_map()
	 * just failed.  NOTE(review): this assumes ddi_get_driver_private()
	 * returns NULL for a node whose register mapping failed, making
	 * vtd_unmap() a no-op there — verify.
	 */
	for (i = 0; i <= units; i++)
		vtd_unmap(vtddips[i]);
	return (ENXIO);
#endif
}
469 
470 static void
471 vtd_cleanup(void)
472 {
473 #ifndef __FreeBSD__
474 	int i;
475 
476 	KASSERT(SLIST_EMPTY(&domhead), ("domain list not empty"));
477 
478 	bzero(root_table, sizeof (root_table));
479 
480 	for (i = 0; i <= drhd_num; i++) {
481 		vtdmaps[i] = NULL;
482 		/*
483 		 * Unmap the vtd registers. Note that the devinfo nodes
484 		 * themselves aren't removed, they are considered system state
485 		 * and can be reused when the module is reloaded.
486 		 */
487 		if (vtddips[i] != NULL)
488 			vtd_unmap(vtddips[i]);
489 	}
490 #endif
491 }
492 
493 static void
494 vtd_enable(void)
495 {
496 	int i;
497 	struct vtdmap *vtdmap;
498 
499 	for (i = 0; i < drhd_num; i++) {
500 		vtdmap = vtdmaps[i];
501 		vtd_wbflush(vtdmap);
502 
503 		/* Update the root table address */
504 		vtdmap->rta = vtophys(root_table);
505 		vtdmap->gcr = VTD_GCR_SRTP;
506 		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
507 			;
508 
509 		vtd_ctx_global_invalidate(vtdmap);
510 		vtd_iotlb_global_invalidate(vtdmap);
511 
512 		vtd_translation_enable(vtdmap);
513 	}
514 }
515 
516 static void
517 vtd_disable(void)
518 {
519 	int i;
520 	struct vtdmap *vtdmap;
521 
522 	for (i = 0; i < drhd_num; i++) {
523 		vtdmap = vtdmaps[i];
524 		vtd_translation_disable(vtdmap);
525 	}
526 }
527 
/*
 * iommu_ops entry point: assign the PCI device with requester id 'rid'
 * to the domain 'arg' by filling in its context-table entry.  Panics if
 * the device is already owned by a domain or is not covered by any
 * remapping unit's device scope.
 */
static void
vtd_add_device(void *arg, uint16_t rid)
{
	int idx;
	uint64_t *ctxp;
	struct domain *dom = arg;
	vm_paddr_t pt_paddr;
	struct vtdmap *vtdmap;
	uint8_t bus;

	/* Each bus has its own context table; entries are 128 bits wide. */
	bus = PCI_RID2BUS(rid);
	ctxp = ctx_tables[bus];
	pt_paddr = vtophys(dom->ptp);
	idx = VTD_RID2IDX(rid);

	if (ctxp[idx] & VTD_CTX_PRESENT) {
		panic("vtd_add_device: device %x is already owned by "
		    "domain %d", rid, (uint16_t)(ctxp[idx + 1] >> 8));
	}

	if ((vtdmap = vtd_device_scope(rid)) == NULL)
		panic("vtd_add_device: device %x is not in scope for "
		    "any DMA remapping unit", rid);

	/*
	 * Order is important. The 'present' bit is set only after all fields
	 * of the context pointer are initialized.
	 */
	/* High word: address width and the owning domain id. */
	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);

	/*
	 * Choose the translation-type field based on whether the unit
	 * supports Device-IOTLBs (extended capability DI bit).
	 */
	if (VTD_ECAP_DI(vtdmap->ext_cap))
		ctxp[idx] = VTD_CTX_TT_ALL;
	else
		ctxp[idx] = 0;

	/* Low word: page table root plus, last of all, the present bit. */
	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;

	/*
	 * 'Not Present' entries are not cached in either the Context Cache
	 * or in the IOTLB, so there is no need to invalidate either of them.
	 */
}
570 
/*
 * iommu_ops entry point: revoke the domain assignment of the PCI device
 * with requester id 'rid' by clearing its context-table entry, then
 * invalidate cached translations on every remapping unit.
 */
static void
vtd_remove_device(void *arg, uint16_t rid)
{
	int i, idx;
	uint64_t *ctxp;
	struct vtdmap *vtdmap;
	uint8_t bus;

	bus = PCI_RID2BUS(rid);
	ctxp = ctx_tables[bus];
	idx = VTD_RID2IDX(rid);

	/*
	 * Order is important. The 'present' bit must be cleared first.
	 */
	ctxp[idx] = 0;
	ctxp[idx + 1] = 0;

	/*
	 * Invalidate the Context Cache and the IOTLB.
	 *
	 * XXX use device-selective invalidation for Context Cache
	 * XXX use domain-selective invalidation for IOTLB
	 */
	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];
		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);
	}
}
601 
602 #define	CREATE_MAPPING	0
603 #define	REMOVE_MAPPING	1
604 
605 static uint64_t
606 vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
607     int remove)
608 {
609 	struct domain *dom;
610 	int i, spshift, ptpshift, ptpindex, nlevels;
611 	uint64_t spsize, *ptp;
612 
613 	dom = arg;
614 	ptpindex = 0;
615 	ptpshift = 0;
616 
617 	KASSERT(gpa + len > gpa, ("%s: invalid gpa range %lx/%lx", __func__,
618 	    gpa, len));
619 	KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %lx/%lx beyond "
620 	    "domain maxaddr %lx", __func__, gpa, len, dom->maxaddr));
621 
622 	if (gpa & PAGE_MASK)
623 		panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
624 
625 	if (hpa & PAGE_MASK)
626 		panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
627 
628 	if (len & PAGE_MASK)
629 		panic("vtd_create_mapping: unaligned len 0x%0lx", len);
630 
631 	/*
632 	 * Compute the size of the mapping that we can accommodate.
633 	 *
634 	 * This is based on three factors:
635 	 * - supported super page size
636 	 * - alignment of the region starting at 'gpa' and 'hpa'
637 	 * - length of the region 'len'
638 	 */
639 	spshift = 48;
640 	for (i = 3; i >= 0; i--) {
641 		spsize = 1UL << spshift;
642 		if ((dom->spsmask & (1 << i)) != 0 &&
643 		    (gpa & (spsize - 1)) == 0 &&
644 		    (hpa & (spsize - 1)) == 0 &&
645 		    (len >= spsize)) {
646 			break;
647 		}
648 		spshift -= 9;
649 	}
650 
651 	ptp = dom->ptp;
652 	nlevels = dom->pt_levels;
653 	while (--nlevels >= 0) {
654 		ptpshift = 12 + nlevels * 9;
655 		ptpindex = (gpa >> ptpshift) & 0x1FF;
656 
657 		/* We have reached the leaf mapping */
658 		if (spshift >= ptpshift) {
659 			break;
660 		}
661 
662 		/*
663 		 * We are working on a non-leaf page table page.
664 		 *
665 		 * Create a downstream page table page if necessary and point
666 		 * to it from the current page table.
667 		 */
668 		if (ptp[ptpindex] == 0) {
669 			void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
670 			ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
671 		}
672 
673 		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
674 	}
675 
676 	if ((gpa & ((1UL << ptpshift) - 1)) != 0)
677 		panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
678 
679 	/*
680 	 * Update the 'gpa' -> 'hpa' mapping
681 	 */
682 	if (remove) {
683 		ptp[ptpindex] = 0;
684 	} else {
685 		ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
686 
687 		if (nlevels > 0)
688 			ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
689 	}
690 
691 	return (1UL << ptpshift);
692 }
693 
694 static uint64_t
695 vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
696 {
697 
698 	return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
699 }
700 
701 static uint64_t
702 vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
703 {
704 
705 	return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
706 }
707 
708 static void
709 vtd_invalidate_tlb(void *dom)
710 {
711 	int i;
712 	struct vtdmap *vtdmap;
713 
714 	/*
715 	 * Invalidate the IOTLB.
716 	 * XXX use domain-selective invalidation for IOTLB
717 	 */
718 	for (i = 0; i < drhd_num; i++) {
719 		vtdmap = vtdmaps[i];
720 		vtd_iotlb_global_invalidate(vtdmap);
721 	}
722 }
723 
724 static void *
725 vtd_create_domain(vm_paddr_t maxaddr)
726 {
727 	struct domain *dom;
728 	vm_paddr_t addr;
729 	int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
730 	struct vtdmap *vtdmap;
731 
732 	if (drhd_num <= 0)
733 		panic("vtd_create_domain: no dma remapping hardware available");
734 
735 	/*
736 	 * Calculate AGAW.
737 	 * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
738 	 */
739 	addr = 0;
740 	for (gaw = 0; addr < maxaddr; gaw++)
741 		addr = 1ULL << gaw;
742 
743 	res = (gaw - 12) % 9;
744 	if (res == 0)
745 		agaw = gaw;
746 	else
747 		agaw = gaw + 9 - res;
748 
749 	if (agaw > 64)
750 		agaw = 64;
751 
752 	/*
753 	 * Select the smallest Supported AGAW and the corresponding number
754 	 * of page table levels.
755 	 */
756 	pt_levels = 2;
757 	sagaw = 30;
758 	addrwidth = 0;
759 
760 	tmp = ~0;
761 	for (i = 0; i < drhd_num; i++) {
762 		vtdmap = vtdmaps[i];
763 		/* take most compatible value */
764 		tmp &= VTD_CAP_SAGAW(vtdmap->cap);
765 	}
766 
767 	for (i = 0; i < 5; i++) {
768 		if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
769 			break;
770 		pt_levels++;
771 		addrwidth++;
772 		sagaw += 9;
773 		if (sagaw > 64)
774 			sagaw = 64;
775 	}
776 
777 	if (i >= 5) {
778 		panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d",
779 		    tmp, agaw);
780 	}
781 
782 	dom = malloc(sizeof (struct domain), M_VTD, M_ZERO | M_WAITOK);
783 	dom->pt_levels = pt_levels;
784 	dom->addrwidth = addrwidth;
785 	dom->id = domain_id();
786 	dom->maxaddr = maxaddr;
787 	dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
788 	if ((uintptr_t)dom->ptp & PAGE_MASK)
789 		panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
790 
791 #ifdef __FreeBSD__
792 #ifdef notyet
793 	/*
794 	 * XXX superpage mappings for the iommu do not work correctly.
795 	 *
796 	 * By default all physical memory is mapped into the host_domain.
797 	 * When a VM is allocated wired memory the pages belonging to it
798 	 * are removed from the host_domain and added to the vm's domain.
799 	 *
800 	 * If the page being removed was mapped using a superpage mapping
801 	 * in the host_domain then we need to demote the mapping before
802 	 * removing the page.
803 	 *
804 	 * There is not any code to deal with the demotion at the moment
805 	 * so we disable superpage mappings altogether.
806 	 */
807 	dom->spsmask = ~0;
808 	for (i = 0; i < drhd_num; i++) {
809 		vtdmap = vtdmaps[i];
810 		/* take most compatible value */
811 		dom->spsmask &= VTD_CAP_SPS(vtdmap->cap);
812 	}
813 #endif
814 #else
815 	/*
816 	 * On illumos we decidedly do not remove memory mapped to a VM's domain
817 	 * from the host_domain, so we don't have to deal with page demotion and
818 	 * can just use large pages.
819 	 *
820 	 * Since VM memory is currently allocated as 4k pages and mapped into
821 	 * the VM domain page by page, the use of large pages is essentially
822 	 * limited to the host_domain.
823 	 */
824 	dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
825 #endif
826 
827 	SLIST_INSERT_HEAD(&domhead, dom, next);
828 
829 	return (dom);
830 }
831 
832 static void
833 vtd_free_ptp(uint64_t *ptp, int level)
834 {
835 	int i;
836 	uint64_t *nlp;
837 
838 	if (level > 1) {
839 		for (i = 0; i < 512; i++) {
840 			if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
841 				continue;
842 			if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
843 				continue;
844 			nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
845 			vtd_free_ptp(nlp, level - 1);
846 		}
847 	}
848 
849 	bzero(ptp, PAGE_SIZE);
850 	free(ptp, M_VTD);
851 }
852 
853 static void
854 vtd_destroy_domain(void *arg)
855 {
856 	struct domain *dom;
857 
858 	dom = arg;
859 
860 	SLIST_REMOVE(&domhead, dom, domain, next);
861 	vtd_free_ptp(dom->ptp, dom->pt_levels);
862 	free(dom, M_VTD);
863 }
864 
/*
 * Intel VT-d implementation of the iommu_ops interface consumed by the
 * generic vmm iommu layer (io/iommu.h).
 */
const struct iommu_ops iommu_ops_intel = {
	.init = vtd_init,
	.cleanup = vtd_cleanup,
	.enable = vtd_enable,
	.disable = vtd_disable,
	.create_domain = vtd_create_domain,
	.destroy_domain = vtd_destroy_domain,
	.create_mapping = vtd_create_mapping,
	.remove_mapping = vtd_remove_mapping,
	.add_device = vtd_add_device,
	.remove_device = vtd_remove_device,
	.invalidate_tlb = vtd_invalidate_tlb,
};
878