xref: /illumos-gate/usr/src/uts/i86pc/io/immu_dvma.c (revision f169c0ea)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Portions Copyright (c) 2010, Oracle and/or its affiliates.
23  * All rights reserved.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 /*
31  * DVMA code
32  * This file contains Intel IOMMU code that deals with DVMA
33  * i.e. DMA remapping.
34  */
35 
36 #include <sys/sysmacros.h>
37 #include <sys/pcie.h>
38 #include <sys/pci_cfgspace.h>
39 #include <vm/hat_i86.h>
40 #include <sys/memlist.h>
41 #include <sys/acpi/acpi.h>
42 #include <sys/acpica.h>
43 #include <sys/modhash.h>
44 #include <sys/immu.h>
45 
46 #undef	TEST
47 
/*
 * Macros based on PCI spec
 *
 * The dword read at PCI_CONF_REVID carries the revision id in its low byte
 * and the 24-bit class code (base class, sub-class, programming interface)
 * in the upper three bytes.
 *
 * Each macro expands to a plain parenthesized expression (no trailing
 * semicolon) so that it can be used inside larger expressions.
 */
#define	IMMU_PCI_REV2CLASS(r)   ((r) >> 8)  /* classcode from revid */
#define	IMMU_PCI_CLASS2BASE(c)  ((c) >> 16) /* baseclass from classcode */
#define	IMMU_PCI_CLASS2SUB(c)   (((c) >> 8) & 0xff) /* subclass from class */
54 
/*
 * Evaluates to true when cookie (d) has a non-zero dck_paddr and physical
 * address (p) is exactly IMMU_PAGESIZE past that dck_paddr.
 * Note that (d) is evaluated more than once.
 */
#define	IMMU_CONTIG_PADDR(d, p) \
	((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p))
57 
/*
 * Argument block passed through the "void *arg" parameter of the
 * ancestor-walk callbacks in this file (match_lpc(), get_branch_domain()).
 */
typedef struct dvma_arg {
	/* NOTE(review): no use of dva_immu is visible in this part of file */
	immu_t *dva_immu;
	/* dip the ancestor walk is being done for */
	dev_info_t *dva_rdip;
	/* result: dip selected by the callback (LPC dip or domain-dip) */
	dev_info_t *dva_ddip;
	/* result: domain selected by get_branch_domain() */
	domain_t *dva_domain;
	/* NOTE(review): no use of dva_level is visible in this part of file */
	int dva_level;
	/* IMMU_FLAGS_* forwarded by the callbacks to the routines they call */
	immu_flags_t dva_flags;
	/* list of immu_devi_t that match_lpc() compares each ancestor to */
	list_t *dva_list;
	/* DDI_SUCCESS or DDI_FAILURE, set by the callbacks */
	int dva_error;
} dvma_arg_t;
68 
/*
 * Forward declarations of file-local functions that are referenced
 * before their definitions.
 */
static domain_t *domain_create(immu_t *immu, dev_info_t *ddip,
    dev_info_t *rdip, immu_flags_t immu_flags);
static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus,
    int dev, int func, immu_flags_t immu_flags);
static void destroy_immu_devi(immu_devi_t *immu_devi);
static boolean_t dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma,
    uint64_t nvpages, dcookie_t *dcookies, int dcount, dev_info_t *rdip,
    immu_flags_t immu_flags);
77 
78 /* Extern globals */
79 extern struct memlist  *phys_install;
80 
81 
82 /* static Globals */
83 
/*
 * Used to setup DMA objects (memory regions)
 * for DMA reads by IOMMU units
 *
 * The initializer is positional; the field names in the comments below
 * follow the member order documented in ddi_dma_attr(9S).
 */
static ddi_dma_attr_t immu_dma_attr = {
	DMA_ATTR_V0,		/* dma_attr_version */
	0U,			/* dma_attr_addr_lo */
	0xffffffffffffffffULL,	/* dma_attr_addr_hi */
	0xffffffffU,		/* dma_attr_count_max */
	MMU_PAGESIZE, /* MMU page aligned */	/* dma_attr_align */
	0x1,			/* dma_attr_burstsizes */
	0x1,			/* dma_attr_minxfer */
	0xffffffffU,		/* dma_attr_maxxfer */
	0xffffffffffffffffULL,	/* dma_attr_seg */
	1,			/* dma_attr_sgllen */
	4,			/* dma_attr_granular */
	0			/* dma_attr_flags */
};
102 
/*
 * Device access attributes passed to ddi_dma_mem_alloc() in pgtable_ctor().
 * The initializer is positional; the field names in the comments below
 * follow the member order documented in ddi_device_acc_attr(9S).
 */
static ddi_device_acc_attr_t immu_acc_attr = {
	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};
108 
109 
/* globals private to this file */
/* NOTE(review): users of the three globals below are not visible here */
static kmutex_t immu_domain_lock;
static list_t immu_unity_domain_list;
static list_t immu_xlate_domain_list;

/* structure used to store idx into each level of the page tables */
typedef struct xlate {
	int xlt_level;		/* page table level this entry describes */
	uint_t xlt_idx;		/* index into the pgtable at that level */
	pgtable_t *xlt_pgtable;	/* pgtable at that level */
} xlate_t;

/* 0 is reserved by Vt-d spec. Solaris reserves 1 */
#define	IMMU_UNITY_DID   1

/*
 * Hash keyed by (seg << 16 | bus << 8 | devfunc) whose values are
 * domain_t pointers; see bdf_domain_lookup() and bdf_domain_insert().
 */
static mod_hash_t *bdf_domain_hash;
126 
127 static domain_t *
128 bdf_domain_lookup(immu_devi_t *immu_devi)
129 {
130 	domain_t *domain;
131 	int16_t seg = immu_devi->imd_seg;
132 	int16_t bus = immu_devi->imd_bus;
133 	int16_t devfunc = immu_devi->imd_devfunc;
134 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
135 
136 	if (seg < 0 || bus < 0 || devfunc < 0) {
137 		return (NULL);
138 	}
139 
140 	domain = NULL;
141 	if (mod_hash_find(bdf_domain_hash,
142 	    (void *)bdf, (void *)&domain) == 0) {
143 		ASSERT(domain);
144 		ASSERT(domain->dom_did > 0);
145 		return (domain);
146 	} else {
147 		return (NULL);
148 	}
149 }
150 
151 static void
152 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain)
153 {
154 	int16_t seg = immu_devi->imd_seg;
155 	int16_t bus = immu_devi->imd_bus;
156 	int16_t devfunc = immu_devi->imd_devfunc;
157 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
158 	int r;
159 
160 	if (seg < 0 || bus < 0 || devfunc < 0) {
161 		return;
162 	}
163 
164 	r = mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain);
165 	ASSERT(r != MH_ERR_DUPLICATE);
166 	ASSERT(r == 0);
167 }
168 
169 static int
170 match_lpc(dev_info_t *pdip, void *arg)
171 {
172 	immu_devi_t *immu_devi;
173 	dvma_arg_t *dvap = (dvma_arg_t *)arg;
174 
175 	ASSERT(dvap->dva_error == DDI_FAILURE);
176 	ASSERT(dvap->dva_ddip == NULL);
177 	ASSERT(dvap->dva_list);
178 
179 	if (list_is_empty(dvap->dva_list)) {
180 		return (DDI_WALK_TERMINATE);
181 	}
182 
183 	immu_devi = list_head(dvap->dva_list);
184 	for (; immu_devi; immu_devi = list_next(dvap->dva_list,
185 	    immu_devi)) {
186 		ASSERT(immu_devi->imd_dip);
187 		if (immu_devi->imd_dip == pdip) {
188 			dvap->dva_ddip = pdip;
189 			dvap->dva_error = DDI_SUCCESS;
190 			return (DDI_WALK_TERMINATE);
191 		}
192 	}
193 
194 	return (DDI_WALK_CONTINUE);
195 }
196 
197 static void
198 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu)
199 {
200 	list_t *spclist = NULL;
201 	immu_devi_t *immu_devi;
202 
203 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_lock)));
204 
205 	immu_devi = IMMU_DEVI(dip);
206 	if (immu_devi->imd_display == B_TRUE) {
207 		spclist = &(immu->immu_dvma_gfx_list);
208 	} else if (immu_devi->imd_lpc == B_TRUE) {
209 		spclist = &(immu->immu_dvma_lpc_list);
210 	}
211 
212 	if (spclist) {
213 		mutex_enter(&(immu->immu_lock));
214 		list_insert_head(spclist, immu_devi);
215 		mutex_exit(&(immu->immu_lock));
216 	}
217 }
218 
219 /*
220  * Set the immu_devi struct in the immu_devi field of a devinfo node
221  */
222 int
223 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags)
224 {
225 	int bus, dev, func;
226 	immu_devi_t *new_imd;
227 	immu_devi_t *immu_devi;
228 
229 	ASSERT(root_devinfo);
230 	ASSERT(dip);
231 	ASSERT(dip != root_devinfo);
232 
233 	immu_devi = immu_devi_get(dip);
234 	if (immu_devi != NULL) {
235 		return (DDI_SUCCESS);
236 	}
237 
238 	bus = dev = func = -1;
239 
240 	/*
241 	 * Assume a new immu_devi struct is needed
242 	 */
243 	if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) {
244 		/*
245 		 * No BDF. Set bus = -1 to indicate this.
246 		 * We still need to create a immu_devi struct
247 		 * though
248 		 */
249 		bus = -1;
250 		dev = 0;
251 		func = 0;
252 	}
253 
254 	new_imd = create_immu_devi(dip, bus, dev, func, immu_flags);
255 	if (new_imd  == NULL) {
256 		ddi_err(DER_WARN, dip, "Failed to create immu_devi "
257 		    "structure");
258 		return (DDI_FAILURE);
259 	}
260 
261 	/*
262 	 * Check if some other thread allocated a immu_devi while we
263 	 * didn't own the lock.
264 	 */
265 	mutex_enter(&(DEVI(dip)->devi_lock));
266 	if (IMMU_DEVI(dip) == NULL) {
267 		IMMU_DEVI_SET(dip, new_imd);
268 	} else {
269 		destroy_immu_devi(new_imd);
270 	}
271 	mutex_exit(&(DEVI(dip)->devi_lock));
272 
273 	return (DDI_SUCCESS);
274 }
275 
276 static dev_info_t *
277 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags)
278 {
279 	dvma_arg_t dvarg = {0};
280 	dvarg.dva_list = &(immu->immu_dvma_lpc_list);
281 	dvarg.dva_rdip = rdip;
282 	dvarg.dva_error = DDI_FAILURE;
283 
284 	if (immu_walk_ancestor(rdip, NULL, match_lpc,
285 	    &dvarg, NULL, immu_flags) != DDI_SUCCESS) {
286 		ddi_err(DER_MODE, rdip, "Could not walk ancestors to "
287 		    "find lpc_devinfo for ISA device");
288 		return (NULL);
289 	}
290 
291 	if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) {
292 		ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for "
293 		    "ISA device");
294 		return (NULL);
295 	}
296 
297 	return (dvarg.dva_ddip);
298 }
299 
300 static dev_info_t *
301 get_gfx_devinfo(dev_info_t *rdip)
302 {
303 	immu_t *immu;
304 	immu_devi_t *immu_devi;
305 	list_t *list_gfx;
306 
307 	/*
308 	 * The GFX device may not be on the same IMMU unit as "agpgart"
309 	 * so search globally
310 	 */
311 	immu_devi = NULL;
312 	immu = list_head(&immu_list);
313 	for (; immu; immu = list_next(&immu_list, immu)) {
314 		list_gfx = &(immu->immu_dvma_gfx_list);
315 		if (!list_is_empty(list_gfx)) {
316 			immu_devi = list_head(list_gfx);
317 			break;
318 		}
319 	}
320 
321 	if (immu_devi == NULL) {
322 		ddi_err(DER_WARN, rdip, "IMMU: No GFX device. "
323 		    "Cannot redirect agpgart");
324 		return (NULL);
325 	}
326 
327 	/* list is not empty we checked above */
328 	ASSERT(immu_devi);
329 	ASSERT(immu_devi->imd_dip);
330 
331 	ddi_err(DER_LOG, rdip, "IMMU: GFX redirect to %s",
332 	    ddi_node_name(immu_devi->imd_dip));
333 
334 	return (immu_devi->imd_dip);
335 }
336 
337 static immu_flags_t
338 dma_to_immu_flags(struct ddi_dma_req *dmareq)
339 {
340 	immu_flags_t flags = 0;
341 
342 	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
343 		flags |= IMMU_FLAGS_SLEEP;
344 	} else {
345 		flags |= IMMU_FLAGS_NOSLEEP;
346 	}
347 
348 #ifdef BUGGY_DRIVERS
349 
350 	flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
351 
352 #else
353 	/*
354 	 * Read and write flags need to be reversed.
355 	 * DMA_READ means read from device and write
356 	 * to memory. So DMA read means DVMA write.
357 	 */
358 	if (dmareq->dmar_flags & DDI_DMA_READ)
359 		flags |= IMMU_FLAGS_WRITE;
360 
361 	if (dmareq->dmar_flags & DDI_DMA_WRITE)
362 		flags |= IMMU_FLAGS_READ;
363 
364 	/*
365 	 * Some buggy drivers specify neither READ or WRITE
366 	 * For such drivers set both read and write permissions
367 	 */
368 	if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) {
369 		flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
370 	}
371 #endif
372 
373 	return (flags);
374 }
375 
376 int
377 pgtable_ctor(void *buf, void *arg, int kmflag)
378 {
379 	size_t actual_size = 0;
380 	pgtable_t *pgtable;
381 	int (*dmafp)(caddr_t);
382 	caddr_t vaddr;
383 	void *next;
384 
385 	ASSERT(buf);
386 	ASSERT(arg == NULL);
387 
388 	pgtable = (pgtable_t *)buf;
389 
390 	dmafp = (kmflag & KM_NOSLEEP) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
391 
392 	next = kmem_zalloc(IMMU_PAGESIZE, kmflag);
393 	if (next == NULL) {
394 		return (-1);
395 	}
396 
397 	ASSERT(root_devinfo);
398 	if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
399 	    dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
400 		kmem_free(next, IMMU_PAGESIZE);
401 		return (-1);
402 	}
403 
404 	if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
405 	    &immu_acc_attr, DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
406 	    dmafp, NULL, &vaddr, &actual_size,
407 	    &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
408 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
409 		kmem_free(next, IMMU_PAGESIZE);
410 		return (-1);
411 	}
412 
413 	/*
414 	 * Memory allocation failure. Maybe a temporary condition
415 	 * so return error rather than panic, so we can try again
416 	 */
417 	if (actual_size < IMMU_PAGESIZE) {
418 		ddi_dma_mem_free(&pgtable->hwpg_memhdl);
419 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
420 		kmem_free(next, IMMU_PAGESIZE);
421 		return (-1);
422 	}
423 
424 	pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
425 	pgtable->hwpg_vaddr = vaddr;
426 	pgtable->swpg_next_array = next;
427 
428 	rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);
429 
430 	return (0);
431 }
432 
433 void
434 pgtable_dtor(void *buf, void *arg)
435 {
436 	pgtable_t *pgtable;
437 
438 	ASSERT(buf);
439 	ASSERT(arg == NULL);
440 
441 	pgtable = (pgtable_t *)buf;
442 	ASSERT(pgtable->swpg_next_array);
443 
444 	/* destroy will panic if lock is held. */
445 	rw_destroy(&(pgtable->swpg_rwlock));
446 
447 	ddi_dma_mem_free(&pgtable->hwpg_memhdl);
448 	ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
449 	kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);
450 
451 	/* don't zero out hwpg_vaddr and swpg_next_array for debugging */
452 }
453 
454 /*
455  * pgtable_alloc()
456  *	alloc a IOMMU pgtable structure.
457  *	This same struct is used for root and context tables as well.
458  *	This routine allocs the f/ollowing:
459  *	- a pgtable_t struct
460  *	- a HW page which holds PTEs/entries which is accesssed by HW
461  *        so we set up DMA for this page
462  *	- a SW page which is only for our bookeeping
463  *        (for example to  hold pointers to the next level pgtable).
464  *        So a simple kmem_alloc suffices
465  */
466 static pgtable_t *
467 pgtable_alloc(immu_t *immu, immu_flags_t immu_flags)
468 {
469 	pgtable_t *pgtable;
470 	int kmflags;
471 
472 	ASSERT(immu);
473 
474 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
475 
476 	pgtable = kmem_cache_alloc(immu_pgtable_cache, kmflags);
477 	if (pgtable == NULL) {
478 		return (NULL);
479 	}
480 	return (pgtable);
481 }
482 
483 static void
484 pgtable_zero(immu_t *immu, pgtable_t *pgtable)
485 {
486 	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
487 	bzero(pgtable->swpg_next_array, IMMU_PAGESIZE);
488 
489 	/* Dont need to flush the write we will flush when we use the entry */
490 	immu_regs_cpu_flush(immu, pgtable->hwpg_vaddr, IMMU_PAGESIZE);
491 }
492 
493 static void
494 pgtable_free(immu_t *immu, pgtable_t *pgtable)
495 {
496 	ASSERT(immu);
497 	ASSERT(pgtable);
498 
499 	kmem_cache_free(immu_pgtable_cache, pgtable);
500 }
501 
502 /*
503  * Function to identify a display device from the PCI class code
504  */
505 static boolean_t
506 device_is_display(uint_t classcode)
507 {
508 	static uint_t disp_classes[] = {
509 		0x000100,
510 		0x030000,
511 		0x030001
512 	};
513 	int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);
514 
515 	for (i = 0; i < nclasses; i++) {
516 		if (classcode == disp_classes[i])
517 			return (B_TRUE);
518 	}
519 	return (B_FALSE);
520 }
521 
522 /*
523  * Function that determines if device is PCIEX and/or PCIEX bridge
524  */
525 static boolean_t
526 device_is_pciex(
527 	uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib)
528 {
529 	ushort_t cap;
530 	ushort_t capsp;
531 	ushort_t cap_count = PCI_CAP_MAX_PTR;
532 	ushort_t status;
533 	boolean_t is_pciex = B_FALSE;
534 
535 	*is_pcib = B_FALSE;
536 
537 	status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
538 	if (!(status & PCI_STAT_CAP))
539 		return (B_FALSE);
540 
541 	capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
542 	while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
543 		capsp &= PCI_CAP_PTR_MASK;
544 		cap = pci_getb_func(bus, dev, func, capsp);
545 
546 		if (cap == PCI_CAP_ID_PCI_E) {
547 			status = pci_getw_func(bus, dev, func, capsp + 2);
548 			/*
549 			 * See section 7.8.2 of PCI-Express Base Spec v1.0a
550 			 * for Device/Port Type.
551 			 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
552 			 * device is a PCIE2PCI bridge
553 			 */
554 			*is_pcib =
555 			    ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
556 			    PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE;
557 			is_pciex = B_TRUE;
558 		}
559 
560 		capsp = (*pci_getb_func)(bus, dev, func,
561 		    capsp + PCI_CAP_NEXT_PTR);
562 	}
563 
564 	return (is_pciex);
565 }
566 
567 
/*
 * immu_dvma_get_immu()
 *   get the immu unit structure for a dev_info node
 *
 *   The unit is cached in the immu_devi of dip (imd_immu). If it is not
 *   cached yet it is obtained from immu_dmar_get_immu(), stored in the
 *   immu_devi and the device is added to the unit's special-device lists
 *   via immu_devi_set_spclist().
 *
 *   Every failure in this routine is reported with DER_PANIC.
 */
immu_t *
immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags)
{
	immu_devi_t *immu_devi;
	immu_t *immu;

	/*
	 * check if immu unit was already found earlier.
	 * If yes, then it will be stashed in immu_devi struct.
	 */
	immu_devi = immu_devi_get(dip);
	if (immu_devi == NULL) {
		if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) {
			/*
			 * May fail because of low memory.
			 * NOTE(review): this comment used to say that an
			 * error is returned so that the driver can retry
			 * later, but the code below reports DER_PANIC.
			 */
			ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
			    "No immu_devi structure");
			/*NOTREACHED*/
		}
		immu_devi = immu_devi_get(dip);
		ASSERT(immu_devi);
	}

	/* fast path: unit already cached in the immu_devi */
	mutex_enter(&(DEVI(dip)->devi_lock));
	if (immu_devi->imd_immu) {
		immu = immu_devi->imd_immu;
		mutex_exit(&(DEVI(dip)->devi_lock));
		return (immu);
	}
	mutex_exit(&(DEVI(dip)->devi_lock));

	/* the lookup is done without devi_lock held */
	immu = immu_dmar_get_immu(dip);
	if (immu == NULL) {
		ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
		    "Cannot find immu_t for device");
		/*NOTREACHED*/
	}

	/*
	 * Check if some other thread found immu
	 * while lock was not held
	 */
	immu_devi = immu_devi_get(dip);
	/* immu_devi should be present as we found it earlier */
	if (immu_devi == NULL) {
		ddi_err(DER_PANIC, dip,
		    "immu_dvma_get_immu: No immu_devi structure");
		/*NOTREACHED*/
	}

	mutex_enter(&(DEVI(dip)->devi_lock));
	if (immu_devi->imd_immu == NULL) {
		/* nobody else set it, so we should do it */
		immu_devi->imd_immu = immu;
		immu_devi_set_spclist(dip, immu);
	} else {
		/*
		 * if some other thread got immu before
		 * us, it should get the same results
		 */
		if (immu_devi->imd_immu != immu) {
			ddi_err(DER_PANIC, dip, "Multiple "
			    "immu units found for device. Expected (%p), "
			    "actual (%p)", (void *)immu,
			    (void *)immu_devi->imd_immu);
			mutex_exit(&(DEVI(dip)->devi_lock));
			/*NOTREACHED*/
		}
	}
	mutex_exit(&(DEVI(dip)->devi_lock));

	return (immu);
}
647 
648 
649 /* ############################# IMMU_DEVI code ############################ */
650 
651 /*
652  * Allocate a immu_devi structure and initialize it
653  */
654 static immu_devi_t *
655 create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
656     immu_flags_t immu_flags)
657 {
658 	uchar_t baseclass, subclass;
659 	uint_t classcode, revclass;
660 	immu_devi_t *immu_devi;
661 	boolean_t pciex = B_FALSE;
662 	int kmflags;
663 	boolean_t is_pcib = B_FALSE;
664 
665 	/* bus ==  -1 indicate non-PCI device (no BDF) */
666 	ASSERT(bus == -1 || bus >= 0);
667 	ASSERT(dev >= 0);
668 	ASSERT(func >= 0);
669 
670 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
671 	immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags);
672 	if (immu_devi == NULL) {
673 		ddi_err(DER_WARN, rdip, "Failed to allocate memory for "
674 		    "Intel IOMMU immu_devi structure");
675 		return (NULL);
676 	}
677 	immu_devi->imd_dip = rdip;
678 	immu_devi->imd_seg = 0; /* Currently seg can only be 0 */
679 	immu_devi->imd_bus = bus;
680 	immu_devi->imd_pcib_type = IMMU_PCIB_BAD;
681 
682 	if (bus == -1) {
683 		immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF;
684 		return (immu_devi);
685 	}
686 
687 	immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func);
688 	immu_devi->imd_sec = 0;
689 	immu_devi->imd_sub = 0;
690 
691 	revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);
692 
693 	classcode = IMMU_PCI_REV2CLASS(revclass);
694 	baseclass = IMMU_PCI_CLASS2BASE(classcode);
695 	subclass = IMMU_PCI_CLASS2SUB(classcode);
696 
697 	if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) {
698 
699 		immu_devi->imd_sec = pci_getb_func(bus, dev, func,
700 		    PCI_BCNF_SECBUS);
701 		immu_devi->imd_sub = pci_getb_func(bus, dev, func,
702 		    PCI_BCNF_SUBBUS);
703 
704 		pciex = device_is_pciex(bus, dev, func, &is_pcib);
705 		if (pciex  == B_TRUE && is_pcib == B_TRUE) {
706 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI;
707 		} else if (pciex == B_TRUE) {
708 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE;
709 		} else {
710 			immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI;
711 		}
712 	} else {
713 		immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT;
714 	}
715 
716 	/* check for certain special devices */
717 	immu_devi->imd_display = device_is_display(classcode);
718 
719 	immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) &&
720 	    (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE;
721 
722 	immu_devi->imd_domain = NULL;
723 
724 	immu_devi->imd_dvma_flags = immu_global_dvma_flags;
725 
726 	return (immu_devi);
727 }
728 
729 static void
730 destroy_immu_devi(immu_devi_t *immu_devi)
731 {
732 	kmem_free(immu_devi, sizeof (immu_devi_t));
733 }
734 
735 static domain_t *
736 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp)
737 {
738 	immu_devi_t *immu_devi;
739 	domain_t *domain;
740 	dev_info_t *ddip;
741 
742 	ASSERT(rdip);
743 	ASSERT(ddipp);
744 
745 	*ddipp = NULL;
746 
747 	immu_devi = immu_devi_get(rdip);
748 	if (immu_devi == NULL) {
749 		return (NULL);
750 	}
751 
752 	mutex_enter(&(DEVI(rdip)->devi_lock));
753 	domain = immu_devi->imd_domain;
754 	ddip = immu_devi->imd_ddip;
755 	mutex_exit(&(DEVI(rdip)->devi_lock));
756 
757 	if (domain) {
758 		ASSERT(domain->dom_did > 0);
759 		ASSERT(ddip);
760 		*ddipp = ddip;
761 	}
762 
763 	return (domain);
764 
765 }
766 
767 /* ############################# END IMMU_DEVI code ######################## */
768 /* ############################# DOMAIN code ############################### */
769 
770 /*
771  * This routine always succeeds
772  */
773 static int
774 did_alloc(immu_t *immu, dev_info_t *rdip,
775     dev_info_t *ddip, immu_flags_t immu_flags)
776 {
777 	int did;
778 
779 	ASSERT(immu);
780 	ASSERT(rdip);
781 	ASSERT(rdip != root_devinfo);
782 
783 	did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1,
784 	    (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP);
785 
786 	if (did == 0) {
787 		ASSERT(immu->immu_unity_domain);
788 		ASSERT(immu->immu_unity_domain->dom_did > 0);
789 		ddi_err(DER_WARN, rdip, "device domain-id alloc error"
790 		    " domain-device: %s%d. immu unit is %s. Using "
791 		    "unity domain with domain-id (%d)",
792 		    ddi_driver_name(ddip), ddi_get_instance(ddip),
793 		    immu->immu_name, immu->immu_unity_domain->dom_did);
794 		did = immu->immu_unity_domain->dom_did;
795 	}
796 
797 	return (did);
798 }
799 
/*
 * get_branch_domain()
 *	Ancestor-walk callback. "arg" is a dvma_arg_t; pdip is the node
 *	currently being visited. The callback accumulates, in dva_domain and
 *	dva_ddip, the domain and the domain-dip for the branch being walked.
 *
 *	Returns DDI_WALK_TERMINATE when the walk should stop (error, a
 *	PCIE-PCIE bridge ancestor, or a node that already has both domain
 *	and ddip recorded) and DDI_WALK_CONTINUE otherwise.
 */
static int
get_branch_domain(dev_info_t *pdip, void *arg)
{
	immu_devi_t *immu_devi;
	domain_t *domain;
	dev_info_t *ddip;
	immu_t *immu;
	dvma_arg_t *dvp = (dvma_arg_t *)arg;

	ASSERT(pdip);
	ASSERT(dvp);
	ASSERT(dvp->dva_rdip);

	/*
	 * The field dvp->dva_ddip is a work-in-progress
	 * and gets updated as we walk up the ancestor
	 * tree. The final ddip is set only when we reach
	 * the top of the tree. So the dvp->dva_ddip field cannot
	 * be relied on until we reach the top of the tree.
	 */

	/* immu_devi may not be set. */
	immu_devi = immu_devi_get(pdip);
	if (immu_devi == NULL) {
		if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
			dvp->dva_error = DDI_FAILURE;
			return (DDI_WALK_TERMINATE);
		}
	}

	immu_devi = immu_devi_get(pdip);
	ASSERT(immu_devi);
	immu = immu_devi->imd_immu;
	if (immu == NULL) {
		immu = immu_dvma_get_immu(pdip, dvp->dva_flags);
		ASSERT(immu);
	}

	/*
	 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
	 * terminate the walk (since the device under the PCIE bridge
	 * is a PCIE device and has an independent entry in the
	 * root/context table)
	 */
	if (dvp->dva_rdip != pdip &&
	    immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
		return (DDI_WALK_TERMINATE);
	}

	/*
	 * In order to be a domain-dip, it must be a PCI device i.e.
	 * must have valid BDF. This also eliminates the root complex.
	 */
	if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
	    immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
		ASSERT(immu_devi->imd_bus >= 0);
		ASSERT(immu_devi->imd_devfunc >= 0);
		dvp->dva_ddip = pdip;
	}

	/* display devices and UNITY requests always use the unity domain */
	if (immu_devi->imd_display == B_TRUE ||
	    (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
		dvp->dva_domain = immu->immu_unity_domain;
		/* continue walking to find ddip */
		return (DDI_WALK_CONTINUE);
	}

	/* snapshot what is already recorded for this node */
	mutex_enter(&(DEVI(pdip)->devi_lock));
	domain = immu_devi->imd_domain;
	ddip = immu_devi->imd_ddip;
	mutex_exit(&(DEVI(pdip)->devi_lock));

	if (domain && ddip) {
		/* if domain is set, it must be the same */
		if (dvp->dva_domain) {
			ASSERT(domain == dvp->dva_domain);
		}
		dvp->dva_domain = domain;
		dvp->dva_ddip = ddip;
		return (DDI_WALK_TERMINATE);
	}

	/* immu_devi either has both set or both clear */
	ASSERT(domain == NULL);
	ASSERT(ddip == NULL);

	/* Domain may already be set, continue walking so that ddip gets set */
	if (dvp->dva_domain) {
		return (DDI_WALK_CONTINUE);
	}

	/* domain is not set in either immu_devi or dvp */
	domain = bdf_domain_lookup(immu_devi);
	if (domain == NULL) {
		return (DDI_WALK_CONTINUE);
	}

	/* ok, the BDF hash had a domain for this BDF. */

	/* Grab lock again to check if something else set immu_devi fields */
	mutex_enter(&(DEVI(pdip)->devi_lock));
	if (immu_devi->imd_domain != NULL) {
		ASSERT(immu_devi->imd_domain == domain);
		dvp->dva_domain = domain;
	} else {
		dvp->dva_domain = domain;
	}
	mutex_exit(&(DEVI(pdip)->devi_lock));

	/*
	 * walk upwards until the topmost PCI bridge is found
	 */
	return (DDI_WALK_CONTINUE);

}
915 
916 static void
917 map_unity_domain(domain_t *domain)
918 {
919 	struct memlist *mp;
920 	uint64_t start;
921 	uint64_t npages;
922 	dcookie_t dcookies[1] = {0};
923 	int dcount = 0;
924 
925 	ASSERT(domain);
926 	ASSERT(domain->dom_did == IMMU_UNITY_DID);
927 
928 	/*
929 	 * We call into routines that grab the lock so we should
930 	 * not be called with the lock held. This does not matter
931 	 * much since, no else has a reference to this domain
932 	 */
933 	ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock)));
934 
935 	/*
936 	 * UNITY arenas are a mirror of the physical memory
937 	 * installed on the system.
938 	 */
939 
940 #ifdef BUGGY_DRIVERS
941 	/*
942 	 * Dont skip page0. Some broken HW/FW access it.
943 	 */
944 	dcookies[0].dck_paddr = 0;
945 	dcookies[0].dck_npages = 1;
946 	dcount = 1;
947 	(void) dvma_map(domain->dom_immu, domain, 0, 1, dcookies, dcount, NULL,
948 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
949 #endif
950 
951 	memlist_read_lock();
952 
953 	mp = phys_install;
954 
955 	if (mp->ml_address == 0) {
956 		/* since we already mapped page1 above */
957 		start = IMMU_PAGESIZE;
958 	} else {
959 		start = mp->ml_address;
960 	}
961 	npages = mp->ml_size/IMMU_PAGESIZE + 1;
962 
963 	dcookies[0].dck_paddr = start;
964 	dcookies[0].dck_npages = npages;
965 	dcount = 1;
966 	(void) dvma_map(domain->dom_immu, domain, start, npages, dcookies,
967 	    dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
968 
969 	ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
970 	    " - 0x%" PRIx64 "]", start, start + mp->ml_size);
971 
972 	mp = mp->ml_next;
973 	while (mp) {
974 		ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
975 		    " - 0x%" PRIx64 "]", mp->ml_address,
976 		    mp->ml_address + mp->ml_size);
977 
978 		start = mp->ml_address;
979 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
980 
981 		dcookies[0].dck_paddr = start;
982 		dcookies[0].dck_npages = npages;
983 		dcount = 1;
984 		(void) dvma_map(domain->dom_immu, domain, start, npages,
985 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
986 		mp = mp->ml_next;
987 	}
988 
989 	mp = bios_rsvd;
990 	while (mp) {
991 		ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
992 		    " - 0x%" PRIx64 "]", mp->ml_address,
993 		    mp->ml_address + mp->ml_size);
994 
995 		start = mp->ml_address;
996 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
997 
998 		dcookies[0].dck_paddr = start;
999 		dcookies[0].dck_npages = npages;
1000 		dcount = 1;
1001 		(void) dvma_map(domain->dom_immu, domain, start, npages,
1002 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
1003 
1004 		mp = mp->ml_next;
1005 	}
1006 
1007 	memlist_read_unlock();
1008 }
1009 
1010 /*
1011  * create_xlate_arena()
1012  * 	Create the dvma arena for a domain with translation
1013  *	mapping
1014  */
1015 static void
1016 create_xlate_arena(immu_t *immu, domain_t *domain,
1017     dev_info_t *rdip, immu_flags_t immu_flags)
1018 {
1019 	char *arena_name;
1020 	struct memlist *mp;
1021 	int vmem_flags;
1022 	uint64_t start;
1023 	uint_t mgaw;
1024 	uint64_t size;
1025 	uint64_t maxaddr;
1026 	void *vmem_ret;
1027 
1028 	arena_name = domain->dom_dvma_arena_name;
1029 
1030 	/* Note, don't do sizeof (arena_name) - it is just a pointer */
1031 	(void) snprintf(arena_name,
1032 	    sizeof (domain->dom_dvma_arena_name),
1033 	    "%s-domain-%d-xlate-DVMA-arena", immu->immu_name,
1034 	    domain->dom_did);
1035 
1036 	vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP;
1037 
1038 	/*
1039 	 * No one else has access to this domain.
1040 	 * So no domain locks needed
1041 	 */
1042 	ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock)));
1043 
1044 	/* Restrict mgaddr (max guest addr) to MGAW */
1045 	mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);
1046 
1047 	/*
1048 	 * To ensure we avoid ioapic and PCI MMIO ranges we just
1049 	 * use the physical memory address range of the system as the
1050 	 * range
1051 	 */
1052 	maxaddr = ((uint64_t)1 << mgaw);
1053 
1054 	memlist_read_lock();
1055 
1056 	mp = phys_install;
1057 
1058 	if (mp->ml_address == 0)
1059 		start = MMU_PAGESIZE;
1060 	else
1061 		start = mp->ml_address;
1062 
1063 	if (start + mp->ml_size > maxaddr)
1064 		size = maxaddr - start;
1065 	else
1066 		size = mp->ml_size;
1067 
1068 	ddi_err(DER_VERB, rdip,
1069 	    "%s: Creating dvma vmem arena [0x%" PRIx64
1070 	    " - 0x%" PRIx64 "]", arena_name, start, start + size);
1071 
1072 	ASSERT(domain->dom_dvma_arena == NULL);
1073 
1074 	/*
1075 	 * We always allocate in quanta of IMMU_PAGESIZE
1076 	 */
1077 	domain->dom_dvma_arena = vmem_create(arena_name,
1078 	    (void *)(uintptr_t)start,	/* start addr */
1079 	    size,			/* size */
1080 	    IMMU_PAGESIZE,		/* quantum */
1081 	    NULL,			/* afunc */
1082 	    NULL,			/* ffunc */
1083 	    NULL,			/* source */
1084 	    0,				/* qcache_max */
1085 	    vmem_flags);
1086 
1087 	if (domain->dom_dvma_arena == NULL) {
1088 		ddi_err(DER_PANIC, rdip,
1089 		    "Failed to allocate DVMA arena(%s) "
1090 		    "for domain ID (%d)", arena_name, domain->dom_did);
1091 		/*NOTREACHED*/
1092 	}
1093 
1094 	mp = mp->ml_next;
1095 	while (mp) {
1096 
1097 		if (mp->ml_address == 0)
1098 			start = MMU_PAGESIZE;
1099 		else
1100 			start = mp->ml_address;
1101 
1102 		if (start + mp->ml_size > maxaddr)
1103 			size = maxaddr - start;
1104 		else
1105 			size = mp->ml_size;
1106 
1107 		ddi_err(DER_VERB, rdip,
1108 		    "%s: Adding dvma vmem span [0x%" PRIx64
1109 		    " - 0x%" PRIx64 "]", arena_name, start,
1110 		    start + size);
1111 
1112 		vmem_ret = vmem_add(domain->dom_dvma_arena,
1113 		    (void *)(uintptr_t)start, size,  vmem_flags);
1114 
1115 		if (vmem_ret == NULL) {
1116 			ddi_err(DER_PANIC, rdip,
1117 			    "Failed to allocate DVMA arena(%s) "
1118 			    "for domain ID (%d)",
1119 			    arena_name, domain->dom_did);
1120 			/*NOTREACHED*/
1121 		}
1122 		mp = mp->ml_next;
1123 	}
1124 	memlist_read_unlock();
1125 }
1126 
1127 /* ################################### DOMAIN CODE ######################### */
1128 
1129 /*
1130  * Set the domain and domain-dip for a dip
1131  */
1132 static void
1133 set_domain(
1134 	dev_info_t *dip,
1135 	dev_info_t *ddip,
1136 	domain_t *domain)
1137 {
1138 	immu_devi_t *immu_devi;
1139 	domain_t *fdomain;
1140 	dev_info_t *fddip;
1141 
1142 	ASSERT(dip);
1143 	ASSERT(ddip);
1144 	ASSERT(domain);
1145 	ASSERT(domain->dom_did > 0); /* must be an initialized domain */
1146 
1147 	immu_devi = immu_devi_get(dip);
1148 	ASSERT(immu_devi);
1149 
1150 	mutex_enter(&(DEVI(dip)->devi_lock));
1151 	fddip = immu_devi->imd_ddip;
1152 	fdomain = immu_devi->imd_domain;
1153 
1154 	if (fddip) {
1155 		ASSERT(fddip == ddip);
1156 	} else {
1157 		immu_devi->imd_ddip = ddip;
1158 	}
1159 
1160 	if (fdomain) {
1161 		ASSERT(fdomain == domain);
1162 	} else {
1163 		immu_devi->imd_domain = domain;
1164 	}
1165 	mutex_exit(&(DEVI(dip)->devi_lock));
1166 }
1167 
1168 /*
1169  * device_domain()
1170  * 	Get domain for a device. The domain may be global in which case it
1171  *	is shared between all IOMMU units. Due to potential AGAW differences
1172  *      between IOMMU units, such global domains *have to be* UNITY mapping
1173  *      domains. Alternatively, the domain may be local to a IOMMU unit.
1174  *	Local domains may be shared or immu_devi, although the
1175  *      scope of sharing
1176  *	is restricted to devices controlled by the IOMMU unit to
1177  *      which the domain
1178  *	belongs. If shared, they (currently) have to be UNITY domains. If
1179  *      immu_devi a domain may be either UNITY or translation (XLATE) domain.
1180  */
1181 static domain_t *
1182 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags)
1183 {
1184 	dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */
1185 	immu_t *immu;
1186 	domain_t *domain;
1187 	dvma_arg_t dvarg = {0};
1188 	int level;
1189 
1190 	ASSERT(rdip);
1191 
1192 	*ddipp = NULL;
1193 
1194 	/*
1195 	 * Check if the domain is already set. This is usually true
1196 	 * if this is not the first DVMA transaction.
1197 	 */
1198 	ddip = NULL;
1199 	domain = immu_devi_domain(rdip, &ddip);
1200 	if (domain) {
1201 		ASSERT(domain->dom_did > 0);
1202 		ASSERT(ddip);
1203 		*ddipp = ddip;
1204 		return (domain);
1205 	}
1206 
1207 	immu = immu_dvma_get_immu(rdip, immu_flags);
1208 	if (immu == NULL) {
1209 		/*
1210 		 * possible that there is no IOMMU unit for this device
1211 		 * - BIOS bugs are one example.
1212 		 */
1213 		ddi_err(DER_WARN, rdip, "No IMMU unit found for device");
1214 		return (NULL);
1215 	}
1216 
1217 	immu_flags |= immu_devi_get(rdip)->imd_dvma_flags;
1218 
1219 	dvarg.dva_rdip = rdip;
1220 	dvarg.dva_ddip = NULL;
1221 	dvarg.dva_domain = NULL;
1222 	dvarg.dva_flags = immu_flags;
1223 	level = 0;
1224 	if (immu_walk_ancestor(rdip, NULL, get_branch_domain,
1225 	    &dvarg, &level, immu_flags) != DDI_SUCCESS) {
1226 		/*
1227 		 * maybe low memory. return error,
1228 		 * so driver tries again later
1229 		 */
1230 		return (NULL);
1231 	}
1232 
1233 	/* should have walked at least 1 dip (i.e. edip) */
1234 	ASSERT(level > 0);
1235 
1236 	ddip = dvarg.dva_ddip;	/* must be present */
1237 	domain = dvarg.dva_domain;	/* may be NULL */
1238 
1239 	/*
1240 	 * We may find the domain during our ancestor walk on any one of our
1241 	 * ancestor dips, If the domain is found then the domain-dip
1242 	 * (i.e. ddip) will also be found in the same immu_devi struct.
1243 	 * The domain-dip is the highest ancestor dip which shares the
1244 	 * same domain with edip.
1245 	 * The domain may or may not be found, but the domain dip must
1246 	 * be found.
1247 	 */
1248 	if (ddip == NULL) {
1249 		ddi_err(DER_MODE, rdip, "Cannot find domain dip for device.");
1250 		return (NULL);
1251 	}
1252 
1253 	/*
1254 	 * Did we find a domain ?
1255 	 */
1256 	if (domain) {
1257 		goto found;
1258 	}
1259 
1260 	/* nope, so allocate */
1261 	domain = domain_create(immu, ddip, rdip, immu_flags);
1262 	if (domain == NULL) {
1263 		return (NULL);
1264 	}
1265 	ASSERT(domain->dom_did > 0);
1266 
1267 	/*FALLTHROUGH*/
1268 found:
1269 	/*
1270 	 * We know *domain *is* the right domain, so panic if
1271 	 * another domain is set for either the request-dip or
1272 	 * effective dip.
1273 	 */
1274 	set_domain(ddip, ddip, domain);
1275 	set_domain(rdip, ddip, domain);
1276 
1277 	*ddipp = ddip;
1278 	return (domain);
1279 }
1280 
1281 static void
1282 create_unity_domain(immu_t *immu)
1283 {
1284 	domain_t *domain;
1285 
1286 	/* 0 is reserved by Vt-d */
1287 	/*LINTED*/
1288 	ASSERT(IMMU_UNITY_DID > 0);
1289 
1290 	/* domain created during boot and always use sleep flag */
1291 	domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP);
1292 
1293 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1294 
1295 	domain->dom_did = IMMU_UNITY_DID;
1296 	domain->dom_maptype = IMMU_MAPTYPE_UNITY;
1297 
1298 	domain->dom_immu = immu;
1299 	immu->immu_unity_domain = domain;
1300 
1301 	/*
1302 	 * Setup the domain's initial page table
1303 	 * should never fail.
1304 	 */
1305 	domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1306 	ASSERT(domain->dom_pgtable_root);
1307 	pgtable_zero(immu, domain->dom_pgtable_root);
1308 
1309 	/*
1310 	 * Only map all physical memory in to the unity domain
1311 	 * if passthrough is not supported. If it is supported,
1312 	 * passthrough is set in the context entry instead.
1313 	 */
1314 	if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1315 		map_unity_domain(domain);
1316 
1317 
1318 	/*
1319 	 * put it on the system-wide UNITY domain list
1320 	 */
1321 	mutex_enter(&(immu_domain_lock));
1322 	list_insert_tail(&immu_unity_domain_list, domain);
1323 	mutex_exit(&(immu_domain_lock));
1324 }
1325 
1326 /*
1327  * ddip is the domain-dip - the topmost dip in a domain
1328  * rdip is the requesting-dip - the device which is
1329  * requesting DVMA setup
1330  * if domain is a non-shared domain rdip == ddip
1331  */
1332 static domain_t *
1333 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip,
1334     immu_flags_t immu_flags)
1335 {
1336 	int kmflags;
1337 	domain_t *domain;
1338 	char mod_hash_name[128];
1339 	immu_devi_t *immu_devi;
1340 	int did;
1341 	dcookie_t dcookies[1] = {0};
1342 	int dcount = 0;
1343 
1344 	ASSERT(immu);
1345 	ASSERT(ddip);
1346 
1347 	immu_devi = immu_devi_get(rdip);
1348 
1349 	ASSERT(immu_devi);
1350 
1351 	/*
1352 	 * First allocate a domainid.
1353 	 * This routine will never fail, since if we run out
1354 	 * of domains the unity domain will be allocated.
1355 	 */
1356 	did = did_alloc(immu, rdip, ddip, immu_flags);
1357 	ASSERT(did > 0);
1358 	if (did == IMMU_UNITY_DID) {
1359 		/* domain overflow */
1360 		ASSERT(immu->immu_unity_domain);
1361 		return (immu->immu_unity_domain);
1362 	}
1363 
1364 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1365 	domain = kmem_zalloc(sizeof (domain_t), kmflags);
1366 	if (domain == NULL) {
1367 		ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain "
1368 		    "structure for device. IOMMU unit: %s", immu->immu_name);
1369 		/*NOTREACHED*/
1370 	}
1371 
1372 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1373 
1374 	(void) snprintf(mod_hash_name, sizeof (mod_hash_name),
1375 	    "immu%s-domain%d-pava-hash", immu->immu_name, did);
1376 
1377 	domain->dom_did = did;
1378 	domain->dom_immu = immu;
1379 	domain->dom_maptype = IMMU_MAPTYPE_XLATE;
1380 
1381 	/*
1382 	 * Create xlate DVMA arena for this domain.
1383 	 */
1384 	create_xlate_arena(immu, domain, rdip, immu_flags);
1385 
1386 	/*
1387 	 * Setup the domain's initial page table
1388 	 */
1389 	domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags);
1390 	if (domain->dom_pgtable_root == NULL) {
1391 		ddi_err(DER_PANIC, rdip, "Failed to alloc root "
1392 		    "pgtable for domain (%d). IOMMU unit: %s",
1393 		    domain->dom_did, immu->immu_name);
1394 		/*NOTREACHED*/
1395 	}
1396 	pgtable_zero(immu, domain->dom_pgtable_root);
1397 
1398 	/*
1399 	 * Since this is a immu unit-specific domain, put it on
1400 	 * the per-immu domain list.
1401 	 */
1402 	mutex_enter(&(immu->immu_lock));
1403 	list_insert_head(&immu->immu_domain_list, domain);
1404 	mutex_exit(&(immu->immu_lock));
1405 
1406 	/*
1407 	 * Also put it on the system-wide xlate domain list
1408 	 */
1409 	mutex_enter(&(immu_domain_lock));
1410 	list_insert_head(&immu_xlate_domain_list, domain);
1411 	mutex_exit(&(immu_domain_lock));
1412 
1413 	bdf_domain_insert(immu_devi, domain);
1414 
1415 #ifdef BUGGY_DRIVERS
1416 	/*
1417 	 * Map page0. Some broken HW/FW access it.
1418 	 */
1419 	dcookies[0].dck_paddr = 0;
1420 	dcookies[0].dck_npages = 1;
1421 	dcount = 1;
1422 	(void) dvma_map(domain->dom_immu, domain, 0, 1, dcookies, dcount, NULL,
1423 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
1424 #endif
1425 	return (domain);
1426 }
1427 
1428 /*
1429  * Create domainid arena.
1430  * Domainid 0 is reserved by Vt-d spec and cannot be used by
1431  * system software.
1432  * Domainid 1 is reserved by solaris and used for *all* of the following:
1433  *	as the "uninitialized" domain - For devices not yet controlled
1434  *	by Solaris
1435  *	as the "unity" domain - For devices that will always belong
1436  *	to the unity domain
1437  *	as the "overflow" domain - Used for any new device after we
1438  *	run out of domains
1439  * All of the above domains map into a single domain with
1440  * domainid 1 and UNITY DVMA mapping
1441  * Each IMMU unity has its own unity/uninit/overflow domain
1442  */
1443 static void
1444 did_init(immu_t *immu)
1445 {
1446 	(void) snprintf(immu->immu_did_arena_name,
1447 	    sizeof (immu->immu_did_arena_name),
1448 	    "%s_domainid_arena", immu->immu_name);
1449 
1450 	ddi_err(DER_VERB, NULL, "%s: Creating domainid arena %s",
1451 	    immu->immu_name, immu->immu_did_arena_name);
1452 
1453 	immu->immu_did_arena = vmem_create(
1454 	    immu->immu_did_arena_name,
1455 	    (void *)(uintptr_t)(IMMU_UNITY_DID + 1),   /* start addr */
1456 	    immu->immu_max_domains - IMMU_UNITY_DID,
1457 	    1,				/* quantum */
1458 	    NULL,			/* afunc */
1459 	    NULL,			/* ffunc */
1460 	    NULL,			/* source */
1461 	    0,				/* qcache_max */
1462 	    VM_SLEEP);
1463 
1464 	/* Even with SLEEP flag, vmem_create() can fail */
1465 	if (immu->immu_did_arena == NULL) {
1466 		ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
1467 		    "IOMMU domainid allocator: %s", immu->immu_name,
1468 		    immu->immu_did_arena_name);
1469 	}
1470 }
1471 
1472 /* #########################  CONTEXT CODE ################################# */
1473 
1474 static void
1475 context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
1476     int bus, int devfunc)
1477 {
1478 	pgtable_t *context;
1479 	pgtable_t *pgtable_root;
1480 	pgtable_t *unity_pgtable_root;
1481 	hw_rce_t *hw_rent;
1482 	hw_rce_t *hw_cent;
1483 	hw_rce_t *ctxp;
1484 	int sid;
1485 	krw_t rwtype;
1486 	boolean_t fill_root;
1487 	boolean_t fill_ctx;
1488 
1489 	ASSERT(immu);
1490 	ASSERT(domain);
1491 	ASSERT(root_table);
1492 	ASSERT(bus >= 0);
1493 	ASSERT(devfunc >= 0);
1494 	ASSERT(domain->dom_pgtable_root);
1495 
1496 	pgtable_root = domain->dom_pgtable_root;
1497 
1498 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1499 	context = *(pgtable_t **)(ctxp + bus);
1500 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;
1501 
1502 	fill_root = B_FALSE;
1503 	fill_ctx = B_FALSE;
1504 
1505 	/* Check the most common case first with reader lock */
1506 	rw_enter(&(immu->immu_ctx_rwlock), RW_READER);
1507 	rwtype = RW_READER;
1508 again:
1509 	if (ROOT_GET_P(hw_rent)) {
1510 		ASSERT(ROOT_GET_CONT(hw_rent) == context->hwpg_paddr);
1511 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1512 		if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) {
1513 			ASSERT(CONT_GET_P(hw_cent));
1514 			ASSERT(CONT_GET_DID(hw_cent) == domain->dom_did);
1515 			ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw);
1516 			ASSERT(CONT_GET_ASR(hw_cent) ==
1517 			    pgtable_root->hwpg_paddr);
1518 			rw_exit(&(immu->immu_ctx_rwlock));
1519 			return;
1520 		} else {
1521 			fill_ctx = B_TRUE;
1522 		}
1523 	} else {
1524 		fill_root = B_TRUE;
1525 		fill_ctx = B_TRUE;
1526 	}
1527 
1528 	if (rwtype == RW_READER &&
1529 	    rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) {
1530 		rw_exit(&(immu->immu_ctx_rwlock));
1531 		rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1532 		rwtype = RW_WRITER;
1533 		goto again;
1534 	}
1535 	rwtype = RW_WRITER;
1536 
1537 	if (fill_root == B_TRUE) {
1538 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1539 		ROOT_SET_P(hw_rent);
1540 		immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
1541 	}
1542 
1543 	if (fill_ctx == B_TRUE) {
1544 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1545 		unity_pgtable_root = immu->immu_unity_domain->dom_pgtable_root;
1546 		ASSERT(CONT_GET_AVAIL(hw_cent) == IMMU_CONT_UNINITED);
1547 		ASSERT(CONT_GET_P(hw_cent));
1548 		ASSERT(CONT_GET_DID(hw_cent) ==
1549 		    immu->immu_unity_domain->dom_did);
1550 		ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw);
1551 		ASSERT(CONT_GET_ASR(hw_cent) ==
1552 		    unity_pgtable_root->hwpg_paddr);
1553 
1554 		/* need to disable context entry before reprogramming it */
1555 		bzero(hw_cent, sizeof (hw_rce_t));
1556 
1557 		/* flush caches */
1558 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1559 		ASSERT(rw_write_held(&(immu->immu_ctx_rwlock)));
1560 
1561 		sid = ((bus << 8) | devfunc);
1562 		immu_flush_context_fsi(immu, 0, sid, domain->dom_did);
1563 
1564 		immu_regs_wbf_flush(immu);
1565 
1566 		CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
1567 		CONT_SET_DID(hw_cent, domain->dom_did);
1568 		CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1569 		CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1570 		if (domain->dom_did == IMMU_UNITY_DID &&
1571 		    IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1572 			CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1573 		else
1574 			/*LINTED*/
1575 			CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1576 		CONT_SET_P(hw_cent);
1577 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1578 	}
1579 	rw_exit(&(immu->immu_ctx_rwlock));
1580 }
1581 
1582 static pgtable_t *
1583 context_create(immu_t *immu)
1584 {
1585 	int	bus;
1586 	int	devfunc;
1587 	pgtable_t *root_table;
1588 	pgtable_t *context;
1589 	pgtable_t *pgtable_root;
1590 	hw_rce_t *ctxp;
1591 	hw_rce_t *hw_rent;
1592 	hw_rce_t *hw_cent;
1593 
1594 	/* Allocate a zeroed root table (4K 256b entries) */
1595 	root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1596 	pgtable_zero(immu, root_table);
1597 
1598 	/*
1599 	 * Setup context tables for all possible root table entries.
1600 	 * Start out with unity domains for all entries.
1601 	 */
1602 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1603 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr);
1604 	for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) {
1605 		context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1606 		pgtable_zero(immu, context);
1607 		ASSERT(ROOT_GET_P(hw_rent) == 0);
1608 		ROOT_SET_P(hw_rent);
1609 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1610 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr);
1611 		for (devfunc = 0; devfunc < IMMU_CONT_NUM;
1612 		    devfunc++, hw_cent++) {
1613 			ASSERT(CONT_GET_P(hw_cent) == 0);
1614 			pgtable_root =
1615 			    immu->immu_unity_domain->dom_pgtable_root;
1616 			CONT_SET_DID(hw_cent,
1617 			    immu->immu_unity_domain->dom_did);
1618 			CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1619 			CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1620 			if (IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1621 				CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1622 			else
1623 				/*LINTED*/
1624 				CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1625 			CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED);
1626 			CONT_SET_P(hw_cent);
1627 		}
1628 		immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE);
1629 		*((pgtable_t **)ctxp) = context;
1630 	}
1631 	immu_regs_cpu_flush(immu, root_table->hwpg_vaddr, IMMU_PAGESIZE);
1632 
1633 	return (root_table);
1634 }
1635 
1636 /*
1637  * Called during rootnex attach, so no locks needed
1638  */
1639 static void
1640 context_init(immu_t *immu)
1641 {
1642 	ASSERT(immu);
1643 	ASSERT(immu->immu_ctx_root == NULL);
1644 
1645 	rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);
1646 
1647 	immu_regs_wbf_flush(immu);
1648 
1649 	immu->immu_ctx_root = context_create(immu);
1650 
1651 	immu_regs_set_root_table(immu);
1652 
1653 	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1654 	immu_flush_context_gbl(immu);
1655 	rw_exit(&(immu->immu_ctx_rwlock));
1656 	immu_flush_iotlb_gbl(immu);
1657 	immu_regs_wbf_flush(immu);
1658 }
1659 
1660 
1661 /*
1662  * Find top pcib
1663  */
1664 static int
1665 find_top_pcib(dev_info_t *dip, void *arg)
1666 {
1667 	immu_devi_t *immu_devi;
1668 	dev_info_t **pcibdipp = (dev_info_t **)arg;
1669 
1670 	ASSERT(dip);
1671 
1672 	immu_devi = immu_devi_get(dip);
1673 	ASSERT(immu_devi);
1674 
1675 	if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) {
1676 		*pcibdipp = dip;
1677 	}
1678 
1679 	return (DDI_WALK_CONTINUE);
1680 }
1681 
1682 static int
1683 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip,
1684     dev_info_t *rdip, immu_flags_t immu_flags)
1685 {
1686 	immu_devi_t *r_immu_devi;
1687 	immu_devi_t *d_immu_devi;
1688 	int r_bus;
1689 	int d_bus;
1690 	int r_devfunc;
1691 	int d_devfunc;
1692 	immu_pcib_t d_pcib_type;
1693 	immu_pcib_t r_pcib_type;
1694 	dev_info_t *pcibdip;
1695 
1696 	if (ddip == NULL || rdip == NULL ||
1697 	    ddip == root_devinfo || rdip == root_devinfo) {
1698 		ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or "
1699 		    "request-dip are NULL or are root devinfo");
1700 		return (DDI_FAILURE);
1701 	}
1702 
1703 	/*
1704 	 * We need to set the context fields
1705 	 * based on what type of device rdip and ddip are.
1706 	 * To do that we need the immu_devi field.
1707 	 * Set the immu_devi field (if not already set)
1708 	 */
1709 	if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) {
1710 		ddi_err(DER_MODE, rdip,
1711 		    "immu_context_update: failed to set immu_devi for ddip");
1712 		return (DDI_FAILURE);
1713 	}
1714 
1715 	if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) {
1716 		ddi_err(DER_MODE, rdip,
1717 		    "immu_context_update: failed to set immu_devi for rdip");
1718 		return (DDI_FAILURE);
1719 	}
1720 
1721 	d_immu_devi = immu_devi_get(ddip);
1722 	r_immu_devi = immu_devi_get(rdip);
1723 	ASSERT(r_immu_devi);
1724 	ASSERT(d_immu_devi);
1725 
1726 	d_bus = d_immu_devi->imd_bus;
1727 	d_devfunc = d_immu_devi->imd_devfunc;
1728 	d_pcib_type = d_immu_devi->imd_pcib_type;
1729 	r_bus = r_immu_devi->imd_bus;
1730 	r_devfunc = r_immu_devi->imd_devfunc;
1731 	r_pcib_type = r_immu_devi->imd_pcib_type;
1732 
1733 	ASSERT(d_bus >= 0);
1734 
1735 	if (rdip == ddip) {
1736 		ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT ||
1737 		    d_pcib_type == IMMU_PCIB_PCIE_PCIE);
1738 		ASSERT(r_bus >= 0);
1739 		ASSERT(r_devfunc >= 0);
1740 		/* rdip is a PCIE device. set context for it only */
1741 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1742 		    r_devfunc);
1743 #ifdef BUGGY_DRIVERS
1744 	} else if (r_immu_devi == d_immu_devi) {
1745 #ifdef TEST
1746 		ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and "
1747 		    "0x%lx are identical", rdip, ddip);
1748 #endif
1749 		ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT);
1750 		ASSERT(r_bus >= 0);
1751 		ASSERT(r_devfunc >= 0);
1752 		/* rdip is a PCIE device. set context for it only */
1753 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1754 		    r_devfunc);
1755 #endif
1756 	} else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) {
1757 		/*
1758 		 * ddip is a PCIE_PCI bridge. Set context for ddip's
1759 		 * secondary bus. If rdip is on ddip's secondary
1760 		 * bus, set context for rdip. Else, set context
1761 		 * for rdip's PCI bridge on ddip's secondary bus.
1762 		 */
1763 		context_set(immu, domain, immu->immu_ctx_root,
1764 		    d_immu_devi->imd_sec, 0);
1765 		if (d_immu_devi->imd_sec == r_bus) {
1766 			context_set(immu, domain, immu->immu_ctx_root,
1767 			    r_bus, r_devfunc);
1768 		} else {
1769 			pcibdip = NULL;
1770 			if (immu_walk_ancestor(rdip, ddip, find_top_pcib,
1771 			    &pcibdip, NULL, immu_flags) == DDI_SUCCESS &&
1772 			    pcibdip != NULL) {
1773 				ASSERT(pcibdip);
1774 				r_immu_devi = immu_devi_get(pcibdip);
1775 				ASSERT(d_immu_devi);
1776 				ASSERT(d_immu_devi->imd_pcib_type ==
1777 				    IMMU_PCIB_PCI_PCI);
1778 				r_bus = r_immu_devi->imd_bus;
1779 				r_devfunc = r_immu_devi->imd_devfunc;
1780 				context_set(immu, domain, immu->immu_ctx_root,
1781 				    r_bus, r_devfunc);
1782 			} else {
1783 				ddi_err(DER_PANIC, rdip, "Failed to find PCI "
1784 				    " bridge for PCI device");
1785 				/*NOTREACHED*/
1786 			}
1787 		}
1788 	} else if (d_pcib_type == IMMU_PCIB_PCI_PCI) {
1789 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1790 		    d_devfunc);
1791 	} else if (d_pcib_type == IMMU_PCIB_ENDPOINT) {
1792 		ASSERT(r_pcib_type == IMMU_PCIB_NOBDF);
1793 		/*
1794 		 * ddip is a PCIE device which has a non-PCI device under it
1795 		 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata
1796 		 */
1797 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1798 		    d_devfunc);
1799 	} else {
1800 		ddi_err(DER_PANIC, rdip, "unknown device type. Cannot "
1801 		    "set IMMU context.");
1802 		/*NOTREACHED*/
1803 	}
1804 
1805 	/* XXX do we need a membar_producer() here */
1806 	return (DDI_SUCCESS);
1807 }
1808 
1809 /* ##################### END CONTEXT CODE ################################## */
1810 /* ##################### MAPPING CODE ################################## */
1811 
1812 
1813 static boolean_t
1814 PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr,
1815     dev_info_t *rdip, immu_flags_t immu_flags)
1816 {
1817 	if (immu_flags & IMMU_FLAGS_PAGE1) {
1818 		ASSERT(paddr == 0);
1819 	} else {
1820 		ASSERT((next == NULL) ^ (paddr == 0));
1821 	}
1822 
1823 	/* The PDTE must be set i.e. present bit is set */
1824 	if (!PDTE_P(pdte)) {
1825 		ddi_err(DER_MODE, rdip, "No present flag");
1826 		return (B_FALSE);
1827 	}
1828 
1829 	/*
1830 	 * Just assert to check most significant system software field
1831 	 * (PDTE_SW4) as it is same as present bit and we
1832 	 * checked that above
1833 	 */
1834 	ASSERT(PDTE_SW4(pdte));
1835 
1836 	/*
1837 	 * TM field should be clear if not reserved.
1838 	 * non-leaf is always reserved
1839 	 */
1840 	if (next == NULL && immu->immu_TM_reserved == B_FALSE) {
1841 		if (PDTE_TM(pdte)) {
1842 			ddi_err(DER_MODE, rdip, "TM flag set");
1843 			return (B_FALSE);
1844 		}
1845 	}
1846 
1847 	/*
1848 	 * The SW3 field is not used and must be clear
1849 	 */
1850 	if (PDTE_SW3(pdte)) {
1851 		ddi_err(DER_MODE, rdip, "SW3 set");
1852 		return (B_FALSE);
1853 	}
1854 
1855 	/*
1856 	 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set
1857 	 */
1858 	if (next == NULL) {
1859 		ASSERT(paddr % IMMU_PAGESIZE == 0);
1860 		if (PDTE_PADDR(pdte) != paddr) {
1861 			ddi_err(DER_MODE, rdip,
1862 			    "PTE paddr mismatch: %lx != %lx",
1863 			    PDTE_PADDR(pdte), paddr);
1864 			return (B_FALSE);
1865 		}
1866 	} else {
1867 		if (PDTE_PADDR(pdte) != next->hwpg_paddr) {
1868 			ddi_err(DER_MODE, rdip,
1869 			    "PDE paddr mismatch: %lx != %lx",
1870 			    PDTE_PADDR(pdte), next->hwpg_paddr);
1871 			return (B_FALSE);
1872 		}
1873 	}
1874 
1875 	/*
1876 	 * SNP field should be clear if not reserved.
1877 	 * non-leaf is always reserved
1878 	 */
1879 	if (next == NULL && immu->immu_SNP_reserved == B_FALSE) {
1880 		if (PDTE_SNP(pdte)) {
1881 			ddi_err(DER_MODE, rdip, "SNP set");
1882 			return (B_FALSE);
1883 		}
1884 	}
1885 
1886 	/* second field available for system software should be clear */
1887 	if (PDTE_SW2(pdte)) {
1888 		ddi_err(DER_MODE, rdip, "SW2 set");
1889 		return (B_FALSE);
1890 	}
1891 
1892 	/* Super pages field should be clear */
1893 	if (PDTE_SP(pdte)) {
1894 		ddi_err(DER_MODE, rdip, "SP set");
1895 		return (B_FALSE);
1896 	}
1897 
1898 	/*
1899 	 * least significant field available for
1900 	 * system software should be clear
1901 	 */
1902 	if (PDTE_SW1(pdte)) {
1903 		ddi_err(DER_MODE, rdip, "SW1 set");
1904 		return (B_FALSE);
1905 	}
1906 
1907 	if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) {
1908 		ddi_err(DER_MODE, rdip, "READ not set");
1909 		return (B_FALSE);
1910 	}
1911 
1912 	if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) {
1913 		ddi_err(DER_MODE, rdip, "WRITE not set");
1914 		return (B_FALSE);
1915 	}
1916 
1917 	return (B_TRUE);
1918 }
1919 /*ARGSUSED*/
1920 static void
1921 PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
1922     uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip)
1923 {
1924 	uint64_t npages;
1925 	uint64_t dvma;
1926 	pgtable_t *pgtable;
1927 	hw_pdte_t *hwp;
1928 	hw_pdte_t *shwp;
1929 	int idx;
1930 	hw_pdte_t pte;
1931 
1932 	ASSERT(xlate->xlt_level == 1);
1933 
1934 	pgtable = xlate->xlt_pgtable;
1935 	idx = xlate->xlt_idx;
1936 
1937 	ASSERT(pgtable);
1938 	ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
1939 
1940 	dvma = *dvma_ptr;
1941 	npages = *npages_ptr;
1942 
1943 	ASSERT(dvma);
1944 	ASSERT(dvma % IMMU_PAGESIZE == 0);
1945 	ASSERT(npages);
1946 
1947 	/*
1948 	 * since a caller gets a unique dvma for a physical address,
1949 	 * no other concurrent thread will be writing to the same
1950 	 * PTE even if it has the same paddr. So no locks needed.
1951 	 */
1952 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
1953 
1954 	hwp = shwp;
1955 	for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
1956 
1957 		pte = *hwp;
1958 
1959 		/* Cannot clear a HW PTE that is aleady clear */
1960 		ASSERT(PDTE_P(pte));
1961 		PDTE_CLEAR_P(pte);
1962 		*hwp = pte;
1963 
1964 		dvma += IMMU_PAGESIZE;
1965 		npages--;
1966 	}
1967 
1968 
1969 #ifdef TEST
1970 	/* dont need to flush write during unmap */
1971 	immu_regs_cpu_flush(immu, (caddr_t)shwp,
1972 	    (hwp - shwp) * sizeof (hw_pdte_t));
1973 #endif
1974 
1975 	*dvma_ptr = dvma;
1976 	*npages_ptr = npages;
1977 
1978 	xlate->xlt_idx = idx;
1979 }
1980 
1981 /*ARGSUSED*/
1982 static void
1983 xlate_setup(immu_t *immu, uint64_t dvma, xlate_t *xlate,
1984     int nlevels, dev_info_t *rdip)
1985 {
1986 	int level;
1987 	uint64_t offbits;
1988 
1989 	/* level 0 is never used. Sanity check */
1990 	ASSERT(xlate->xlt_level == 0);
1991 	ASSERT(xlate->xlt_idx == 0);
1992 	ASSERT(xlate->xlt_pgtable == NULL);
1993 	ASSERT(dvma % IMMU_PAGESIZE == 0);
1994 
1995 	/*
1996 	 * Skip the first 12 bits which is the offset into
1997 	 * 4K PFN (phys page frame based on IMMU_PAGESIZE)
1998 	 */
1999 	offbits = dvma >> IMMU_PAGESHIFT;
2000 
2001 	/* skip to level 1 i.e. leaf PTE */
2002 	for (level = 1, xlate++; level <= nlevels; level++, xlate++) {
2003 		xlate->xlt_level = level;
2004 		xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK);
2005 		ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX);
2006 		xlate->xlt_pgtable = NULL;
2007 		offbits >>= IMMU_PGTABLE_LEVEL_STRIDE;
2008 	}
2009 }
2010 
2011 /*
2012  * Read the pgtables
2013  */
2014 static void
2015 PDE_lookup(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
2016     dev_info_t *rdip)
2017 {
2018 	pgtable_t *pgtable;
2019 	pgtable_t *next;
2020 	hw_pdte_t pde;
2021 	uint_t idx;
2022 
2023 	/* xlate should be at level 0 */
2024 	ASSERT(xlate->xlt_level == 0);
2025 	ASSERT(xlate->xlt_idx == 0);
2026 
2027 	/* start with highest level pgtable i.e. root */
2028 	xlate += nlevels;
2029 	ASSERT(xlate->xlt_level == nlevels);
2030 
2031 	if (xlate->xlt_pgtable == NULL) {
2032 		xlate->xlt_pgtable = domain->dom_pgtable_root;
2033 	}
2034 
2035 	for (; xlate->xlt_level > 1; xlate--) {
2036 
2037 		idx = xlate->xlt_idx;
2038 		pgtable = xlate->xlt_pgtable;
2039 
2040 		ASSERT(pgtable);
2041 		ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
2042 
2043 		if ((xlate - 1)->xlt_pgtable) {
2044 			continue;
2045 		}
2046 
2047 		/* xlate's leafier level is not set, set it now */
2048 
2049 		/* Lock the pgtable in read mode */
2050 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
2051 
2052 		/*
2053 		 * since we are unmapping, the pgtable should
2054 		 * already point to a leafier pgtable.
2055 		 */
2056 		next = *(pgtable->swpg_next_array + idx);
2057 		ASSERT(next);
2058 
2059 		pde = *((hw_pdte_t *)(pgtable->hwpg_vaddr) + idx);
2060 
2061 		ASSERT(PDTE_check(immu, pde, next, 0, rdip, 0) == B_TRUE);
2062 
2063 		(xlate - 1)->xlt_pgtable = next;
2064 
2065 		rw_exit(&(pgtable->swpg_rwlock));
2066 	}
2067 }
2068 
2069 /*ARGSUSED*/
2070 static void
2071 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr,
2072     dev_info_t *rdip, immu_flags_t immu_flags)
2073 {
2074 	hw_pdte_t pte;
2075 
2076 	pte = *hwp;
2077 
2078 #ifndef DEBUG
2079 	/* Set paddr */
2080 	ASSERT(paddr % IMMU_PAGESIZE == 0);
2081 	pte = 0;
2082 	PDTE_SET_PADDR(pte, paddr);
2083 	PDTE_SET_READ(pte);
2084 	PDTE_SET_WRITE(pte);
2085 	*hwp = pte;
2086 #else
2087 
2088 	if (PDTE_P(pte)) {
2089 		if (PDTE_PADDR(pte) != paddr) {
2090 			ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx",
2091 			    PDTE_PADDR(pte), paddr);
2092 		}
2093 #ifdef BUGGY_DRIVERS
2094 		return;
2095 #else
2096 		goto out;
2097 #endif
2098 	}
2099 
2100 	/* Don't touch SW4. It is the present field */
2101 
2102 	/* clear TM field if not reserved */
2103 	if (immu->immu_TM_reserved == B_FALSE) {
2104 		PDTE_CLEAR_TM(pte);
2105 	}
2106 
2107 #ifdef DEBUG
2108 	/* Clear 3rd field for system software  - not used */
2109 	PDTE_CLEAR_SW3(pte);
2110 #endif
2111 
2112 	/* Set paddr */
2113 	ASSERT(paddr % IMMU_PAGESIZE == 0);
2114 	PDTE_CLEAR_PADDR(pte);
2115 	PDTE_SET_PADDR(pte, paddr);
2116 
2117 	/*  clear SNP field if not reserved. */
2118 	if (immu->immu_SNP_reserved == B_FALSE) {
2119 		PDTE_CLEAR_SNP(pte);
2120 	}
2121 
2122 #ifdef DEBUG
2123 	/* Clear SW2 field available for software */
2124 	PDTE_CLEAR_SW2(pte);
2125 #endif
2126 
2127 
2128 #ifdef DEBUG
2129 	/* SP is don't care for PTEs. Clear it for cleanliness */
2130 	PDTE_CLEAR_SP(pte);
2131 #endif
2132 
2133 #ifdef DEBUG
2134 	/* Clear SW1 field available for software */
2135 	PDTE_CLEAR_SW1(pte);
2136 #endif
2137 
2138 	/*
2139 	 * Now that we are done writing the PTE
2140 	 * set the "present" flag. Note this present
2141 	 * flag is a bit in the PDE/PTE that the
2142 	 * spec says is available for system software.
2143 	 * This is an implementation detail of Solaris
2144 	 * bare-metal Intel IOMMU.
2145 	 * The present field in a PDE/PTE is not defined
2146 	 * by the Vt-d spec
2147 	 */
2148 
2149 	PDTE_SET_P(pte);
2150 
2151 out:
2152 #ifdef BUGGY_DRIVERS
2153 	PDTE_SET_READ(pte);
2154 	PDTE_SET_WRITE(pte);
2155 #else
2156 	if (immu_flags & IMMU_FLAGS_READ)
2157 		PDTE_SET_READ(pte);
2158 	if (immu_flags & IMMU_FLAGS_WRITE)
2159 		PDTE_SET_WRITE(pte);
2160 #endif
2161 
2162 	*hwp = pte;
2163 #endif
2164 }
2165 
2166 /*ARGSUSED*/
2167 static void
2168 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
2169     uint64_t *dvma_ptr, uint64_t *nvpages_ptr, dcookie_t *dcookies,
2170     int dcount, dev_info_t *rdip, immu_flags_t immu_flags)
2171 {
2172 	paddr_t paddr;
2173 	uint64_t nvpages;
2174 	uint64_t nppages;
2175 	uint64_t dvma;
2176 	pgtable_t *pgtable;
2177 	hw_pdte_t *hwp;
2178 	hw_pdte_t *shwp;
2179 	int idx;
2180 	int j;
2181 
2182 	ASSERT(xlate->xlt_level == 1);
2183 
2184 	pgtable = xlate->xlt_pgtable;
2185 	idx = xlate->xlt_idx;
2186 
2187 	ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
2188 	ASSERT(pgtable);
2189 
2190 	dvma = *dvma_ptr;
2191 	nvpages = *nvpages_ptr;
2192 
2193 	ASSERT(dvma || (immu_flags & IMMU_FLAGS_PAGE1));
2194 	ASSERT(nvpages);
2195 
2196 	/*
2197 	 * since a caller gets a unique dvma for a physical address,
2198 	 * no other concurrent thread will be writing to the same
2199 	 * PTE even if it has the same paddr. So no locks needed.
2200 	 */
2201 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2202 
2203 	hwp = shwp;
2204 	for (j = dcount - 1; j >= 0; j--) {
2205 		if (nvpages <= dcookies[j].dck_npages)
2206 			break;
2207 		nvpages -= dcookies[j].dck_npages;
2208 	}
2209 
2210 	ASSERT(j >= 0);
2211 	ASSERT(nvpages);
2212 	ASSERT(nvpages <= dcookies[j].dck_npages);
2213 	nppages = nvpages;
2214 	paddr = dcookies[j].dck_paddr +
2215 	    (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE;
2216 
2217 	nvpages = *nvpages_ptr;
2218 	for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
2219 
2220 		ASSERT(paddr || (immu_flags & IMMU_FLAGS_PAGE1));
2221 
2222 		PTE_set_one(immu, hwp, paddr, rdip, immu_flags);
2223 
2224 		ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags)
2225 		    == B_TRUE);
2226 		nppages--;
2227 		nvpages--;
2228 		paddr += IMMU_PAGESIZE;
2229 		dvma += IMMU_PAGESIZE;
2230 
2231 		if (nppages == 0) {
2232 			j++;
2233 		}
2234 
2235 		if (j == dcount) {
2236 			ASSERT(nvpages == 0);
2237 			break;
2238 		}
2239 
2240 		ASSERT(nvpages);
2241 		if (nppages == 0) {
2242 			nppages = dcookies[j].dck_npages;
2243 			paddr = dcookies[j].dck_paddr;
2244 		}
2245 	}
2246 
2247 	/* flush writes to HW PTE table */
2248 	immu_regs_cpu_flush(immu, (caddr_t)shwp, (hwp - shwp) *
2249 	    sizeof (hw_pdte_t));
2250 
2251 	if (nvpages) {
2252 		*dvma_ptr = dvma;
2253 		*nvpages_ptr = nvpages;
2254 	} else {
2255 		*dvma_ptr = 0;
2256 		*nvpages_ptr = 0;
2257 	}
2258 
2259 	xlate->xlt_idx = idx;
2260 }
2261 
2262 /*ARGSUSED*/
2263 static void
2264 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next,
2265     dev_info_t *rdip, immu_flags_t immu_flags)
2266 {
2267 	hw_pdte_t pde;
2268 
2269 	pde = *hwp;
2270 
2271 	/* if PDE is already set, make sure it is correct */
2272 	if (PDTE_P(pde)) {
2273 		ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr);
2274 #ifdef BUGGY_DRIVERS
2275 		return;
2276 #else
2277 		goto out;
2278 #endif
2279 	}
2280 
2281 	/* Dont touch SW4, it is the present bit */
2282 
2283 	/* don't touch TM field it is reserved for PDEs */
2284 
2285 	/* 3rd field available for system software is not used */
2286 	PDTE_CLEAR_SW3(pde);
2287 
2288 	/* Set next level pgtable-paddr for PDE */
2289 	ASSERT(next->hwpg_paddr % IMMU_PAGESIZE == 0);
2290 	PDTE_CLEAR_PADDR(pde);
2291 	PDTE_SET_PADDR(pde, next->hwpg_paddr);
2292 
2293 	/* don't touch SNP field it is reserved for PDEs */
2294 
2295 	/* Clear second field available for system software */
2296 	PDTE_CLEAR_SW2(pde);
2297 
2298 	/* No super pages for PDEs */
2299 	PDTE_CLEAR_SP(pde);
2300 
2301 	/* Clear SW1 for software */
2302 	PDTE_CLEAR_SW1(pde);
2303 
2304 	/*
2305 	 * Now that we are done writing the PDE
2306 	 * set the "present" flag. Note this present
2307 	 * flag is a bit in the PDE/PTE that the
2308 	 * spec says is available for system software.
2309 	 * This is an implementation detail of Solaris
2310 	 * base-metal Intel IOMMU.
2311 	 * The present field in a PDE/PTE is not defined
2312 	 * by the Vt-d spec
2313 	 */
2314 
2315 out:
2316 #ifdef  BUGGY_DRIVERS
2317 	PDTE_SET_READ(pde);
2318 	PDTE_SET_WRITE(pde);
2319 #else
2320 	if (immu_flags & IMMU_FLAGS_READ)
2321 		PDTE_SET_READ(pde);
2322 	if (immu_flags & IMMU_FLAGS_WRITE)
2323 		PDTE_SET_WRITE(pde);
2324 #endif
2325 
2326 	PDTE_SET_P(pde);
2327 
2328 	*hwp = pde;
2329 
2330 	immu_regs_cpu_flush(immu, (caddr_t)hwp, sizeof (hw_pdte_t));
2331 }
2332 
2333 /*
2334  * Used to set PDEs
2335  */
2336 static boolean_t
2337 PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
2338     dev_info_t *rdip, immu_flags_t immu_flags)
2339 {
2340 	pgtable_t *pgtable;
2341 	pgtable_t *new;
2342 	pgtable_t *next;
2343 	hw_pdte_t *hwp;
2344 	int level;
2345 	uint_t idx;
2346 	krw_t rwtype;
2347 	boolean_t set = B_FALSE;
2348 
2349 	/* xlate should be at level 0 */
2350 	ASSERT(xlate->xlt_level == 0);
2351 	ASSERT(xlate->xlt_idx == 0);
2352 
2353 	/* start with highest level pgtable i.e. root */
2354 	xlate += nlevels;
2355 	ASSERT(xlate->xlt_level == nlevels);
2356 
2357 	new = NULL;
2358 	xlate->xlt_pgtable = domain->dom_pgtable_root;
2359 	for (level = nlevels; level > 1; level--, xlate--) {
2360 
2361 		ASSERT(xlate->xlt_level == level);
2362 
2363 		idx = xlate->xlt_idx;
2364 		pgtable = xlate->xlt_pgtable;
2365 
2366 		ASSERT(pgtable);
2367 		ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
2368 
2369 		/* speculative alloc */
2370 		if (new == NULL) {
2371 			new = pgtable_alloc(immu, immu_flags);
2372 			if (new == NULL) {
2373 				ddi_err(DER_PANIC, rdip, "pgtable alloc err");
2374 			}
2375 		}
2376 
2377 		/* Lock the pgtable in READ mode first */
2378 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
2379 		rwtype = RW_READER;
2380 again:
2381 		hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2382 
2383 		ASSERT(pgtable->swpg_next_array);
2384 
2385 		next = (pgtable->swpg_next_array)[idx];
2386 
2387 		/*
2388 		 * check if leafier level already has a pgtable
2389 		 * if yes, verify
2390 		 */
2391 		if (next == NULL) {
2392 			/* Change to a write lock */
2393 			if (rwtype == RW_READER &&
2394 			    rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) {
2395 				rw_exit(&(pgtable->swpg_rwlock));
2396 				rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
2397 				rwtype = RW_WRITER;
2398 				goto again;
2399 			}
2400 			rwtype = RW_WRITER;
2401 			pgtable_zero(immu, new);
2402 			next = new;
2403 			new = NULL;
2404 			(pgtable->swpg_next_array)[idx] = next;
2405 			PDE_set_one(immu, hwp, next, rdip, immu_flags);
2406 			set = B_TRUE;
2407 			rw_downgrade(&(pgtable->swpg_rwlock));
2408 			rwtype = RW_READER;
2409 		} else {
2410 			hw_pdte_t pde = *hwp;
2411 
2412 #ifndef  BUGGY_DRIVERS
2413 			/*
2414 			 * If buggy driver we already set permission
2415 			 * READ+WRITE so nothing to do for that case
2416 			 * XXX Check that read writer perms change before
2417 			 * actually setting perms. Also need to hold lock
2418 			 */
2419 			if (immu_flags & IMMU_FLAGS_READ)
2420 				PDTE_SET_READ(pde);
2421 			if (immu_flags & IMMU_FLAGS_WRITE)
2422 				PDTE_SET_WRITE(pde);
2423 
2424 #endif
2425 
2426 			*hwp = pde;
2427 		}
2428 
2429 		ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags)
2430 		    == B_TRUE);
2431 
2432 		(xlate - 1)->xlt_pgtable = next;
2433 		ASSERT(rwtype == RW_READER);
2434 		rw_exit(&(pgtable->swpg_rwlock));
2435 	}
2436 
2437 	if (new) {
2438 		pgtable_free(immu, new);
2439 	}
2440 
2441 	return (set);
2442 }
2443 
2444 /*
2445  * dvma_map()
2446  *     map a contiguous range of DVMA pages
2447  *
2448  *     immu: IOMMU unit for which we are generating DVMA cookies
2449  *   domain: domain
2450  *    sdvma: Starting dvma
2451  *   spaddr: Starting paddr
2452  *   npages: Number of pages
2453  *     rdip: requesting device
2454  *     immu_flags: flags
2455  */
2456 static boolean_t
2457 dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t snvpages,
2458     dcookie_t *dcookies, int dcount, dev_info_t *rdip, immu_flags_t immu_flags)
2459 {
2460 	uint64_t dvma;
2461 	uint64_t n;
2462 	int nlevels = immu->immu_dvma_nlevels;
2463 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2464 	boolean_t pde_set = B_FALSE;
2465 
2466 	ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS);
2467 	ASSERT(sdvma % IMMU_PAGESIZE == 0);
2468 	ASSERT(snvpages);
2469 
2470 	n = snvpages;
2471 	dvma = sdvma;
2472 
2473 	while (n > 0) {
2474 		xlate_setup(immu, dvma, xlate, nlevels, rdip);
2475 
2476 		/* Lookup or allocate PGDIRs and PGTABLEs if necessary */
2477 		if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags)
2478 		    == B_TRUE) {
2479 			pde_set = B_TRUE;
2480 		}
2481 
2482 		/* set all matching ptes that fit into this leaf pgtable */
2483 		PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies,
2484 		    dcount, rdip, immu_flags);
2485 	}
2486 
2487 	return (pde_set);
2488 }
2489 
2490 /*
2491  * dvma_unmap()
2492  *   unmap a range of DVMAs
2493  *
2494  * immu: IOMMU unit state
2495  * domain: domain for requesting device
2496  * ddip: domain-dip
2497  * dvma: starting DVMA
2498  * npages: Number of IMMU pages to be unmapped
2499  * rdip: requesting device
2500  */
2501 static void
2502 dvma_unmap(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t snpages,
2503     dev_info_t *rdip)
2504 {
2505 	int nlevels = immu->immu_dvma_nlevels;
2506 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2507 	uint64_t n;
2508 	uint64_t dvma;
2509 
2510 	ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS);
2511 	ASSERT(sdvma != 0);
2512 	ASSERT(sdvma % IMMU_PAGESIZE == 0);
2513 	ASSERT(snpages);
2514 
2515 	dvma = sdvma;
2516 	n = snpages;
2517 
2518 	while (n > 0) {
2519 		/* setup the xlate array */
2520 		xlate_setup(immu, dvma, xlate, nlevels, rdip);
2521 
2522 		/* just lookup existing pgtables. Should never fail */
2523 		PDE_lookup(immu, domain, xlate, nlevels, rdip);
2524 
2525 		/* clear all matching ptes that fit into this leaf pgtable */
2526 		PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip);
2527 	}
2528 
2529 	/* No need to flush IOTLB after unmap */
2530 }
2531 
2532 static uint64_t
2533 dvma_alloc(ddi_dma_impl_t *hp, domain_t *domain, uint_t npages)
2534 {
2535 	ddi_dma_attr_t *dma_attr;
2536 	uint64_t dvma;
2537 	size_t xsize, align;
2538 	uint64_t minaddr, maxaddr;
2539 
2540 	ASSERT(domain->dom_maptype != IMMU_MAPTYPE_UNITY);
2541 
2542 	/* shotcuts */
2543 	dma_attr = &(hp->dmai_attr);
2544 
2545 	/* parameters */
2546 	xsize = npages * IMMU_PAGESIZE;
2547 	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2548 	minaddr = dma_attr->dma_attr_addr_lo;
2549 	maxaddr = dma_attr->dma_attr_addr_hi + 1;
2550 	/* nocross is checked in cookie_update() */
2551 
2552 	/* handle the rollover cases */
2553 	if (maxaddr < dma_attr->dma_attr_addr_hi) {
2554 		maxaddr = dma_attr->dma_attr_addr_hi;
2555 	}
2556 
2557 	/*
2558 	 * allocate from vmem arena.
2559 	 */
2560 	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2561 	    xsize, align, 0, 0, (void *)(uintptr_t)minaddr,
2562 	    (void *)(uintptr_t)maxaddr, VM_NOSLEEP);
2563 
2564 	ASSERT(dvma);
2565 	ASSERT(dvma >= minaddr);
2566 	ASSERT(dvma + xsize - 1 < maxaddr);
2567 
2568 	return (dvma);
2569 }
2570 
2571 static void
2572 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages)
2573 {
2574 	uint64_t size = npages * IMMU_PAGESIZE;
2575 
2576 	ASSERT(domain);
2577 	ASSERT(domain->dom_did > 0);
2578 	ASSERT(dvma);
2579 	ASSERT(npages);
2580 
2581 	if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) {
2582 		ASSERT(domain->dom_maptype == IMMU_MAPTYPE_UNITY);
2583 		return;
2584 	}
2585 
2586 	vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size);
2587 }
2588 /*ARGSUSED*/
2589 static void
2590 cookie_free(rootnex_dma_t *dma, immu_t *immu, domain_t *domain,
2591     dev_info_t *rdip)
2592 {
2593 	int i;
2594 	uint64_t dvma;
2595 	uint64_t npages;
2596 	dvcookie_t  *dvcookies = dma->dp_dvcookies;
2597 
2598 	ASSERT(dma->dp_max_cookies);
2599 	ASSERT(dma->dp_max_dcookies);
2600 	ASSERT(dma->dp_dvmax < dma->dp_max_cookies);
2601 	ASSERT(dma->dp_dmax < dma->dp_max_dcookies);
2602 
2603 	/*
2604 	 * we allocated DVMA in a single chunk. Calculate total number
2605 	 * of pages
2606 	 */
2607 	for (i = 0, npages = 0; i <= dma->dp_dvmax; i++) {
2608 		npages += dvcookies[i].dvck_npages;
2609 	}
2610 	dvma = dvcookies[0].dvck_dvma;
2611 #ifdef DEBUG
2612 	/* Unmap only in DEBUG mode */
2613 	dvma_unmap(immu, domain, dvma, npages, rdip);
2614 #endif
2615 	dvma_free(domain, dvma, npages);
2616 
2617 	kmem_free(dma->dp_dvcookies, sizeof (dvcookie_t) * dma->dp_max_cookies);
2618 	dma->dp_dvcookies = NULL;
2619 	kmem_free(dma->dp_dcookies, sizeof (dcookie_t) * dma->dp_max_dcookies);
2620 	dma->dp_dcookies = NULL;
2621 	if (dma->dp_need_to_free_cookie == B_TRUE) {
2622 		kmem_free(dma->dp_cookies, sizeof (ddi_dma_cookie_t) *
2623 		    dma->dp_max_cookies);
2624 		dma->dp_dcookies = NULL;
2625 		dma->dp_need_to_free_cookie = B_FALSE;
2626 	}
2627 
2628 	dma->dp_max_cookies = 0;
2629 	dma->dp_max_dcookies = 0;
2630 	dma->dp_cookie_size = 0;
2631 	dma->dp_dvmax = 0;
2632 	dma->dp_dmax = 0;
2633 }
2634 
2635 /*
2636  * cookie_alloc()
2637  */
2638 static int
2639 cookie_alloc(rootnex_dma_t *dma, struct ddi_dma_req *dmareq,
2640     ddi_dma_attr_t *attr, uint_t prealloc)
2641 {
2642 	int kmflag;
2643 	rootnex_sglinfo_t *sinfo = &(dma->dp_sglinfo);
2644 	dvcookie_t *dvcookies = dma->dp_dvcookies;
2645 	dcookie_t *dcookies = dma->dp_dcookies;
2646 	ddi_dma_cookie_t *cookies = dma->dp_cookies;
2647 	uint64_t max_cookies;
2648 	uint64_t max_dcookies;
2649 	uint64_t cookie_size;
2650 
2651 	/* we need to allocate new array */
2652 	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
2653 		kmflag =  KM_SLEEP;
2654 	} else {
2655 		kmflag =  KM_NOSLEEP;
2656 	}
2657 
2658 	/*
2659 	 * XXX make sure cookies size doen't exceed sinfo->si_max_cookie_size;
2660 	 */
2661 
2662 	/*
2663 	 * figure out the rough estimate of array size
2664 	 * At a minimum, each cookie must hold 1 page.
2665 	 * At a maximum, it cannot exceed dma_attr_sgllen
2666 	 */
2667 	max_dcookies = dmareq->dmar_object.dmao_size + IMMU_PAGEOFFSET;
2668 	max_dcookies /= IMMU_PAGESIZE;
2669 	max_dcookies++;
2670 	max_cookies = MIN(max_dcookies, attr->dma_attr_sgllen);
2671 
2672 	/* allocate the dvma cookie array */
2673 	dvcookies = kmem_zalloc(sizeof (dvcookie_t) * max_cookies, kmflag);
2674 	if (dvcookies == NULL) {
2675 		return (DDI_FAILURE);
2676 	}
2677 
2678 	/* allocate the "phys" cookie array */
2679 	dcookies = kmem_zalloc(sizeof (dcookie_t) * max_dcookies, kmflag);
2680 	if (dcookies == NULL) {
2681 		kmem_free(dvcookies, sizeof (dvcookie_t) * max_cookies);
2682 		dvcookies = NULL;
2683 		return (DDI_FAILURE);
2684 	}
2685 
2686 	/* allocate the "real" cookie array  - the one given to users */
2687 	cookie_size = sizeof (ddi_dma_cookie_t) * max_cookies;
2688 	if (max_cookies > prealloc) {
2689 		cookies = kmem_zalloc(cookie_size, kmflag);
2690 		if (cookies == NULL) {
2691 			kmem_free(dvcookies, sizeof (dvcookie_t) * max_cookies);
2692 			kmem_free(dcookies, sizeof (dcookie_t) * max_dcookies);
2693 			goto fail;
2694 		}
2695 		dma->dp_need_to_free_cookie = B_TRUE;
2696 	} else {
2697 		/* the preallocated buffer fits this size */
2698 		cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer;
2699 		bzero(cookies, sizeof (ddi_dma_cookie_t)* max_cookies);
2700 		dma->dp_need_to_free_cookie = B_FALSE;
2701 	}
2702 
2703 	dma->dp_dvcookies = dvcookies;
2704 	dma->dp_dcookies = dcookies;
2705 	dma->dp_cookies = cookies;
2706 	dma->dp_cookie_size = cookie_size;
2707 	dma->dp_max_cookies = max_cookies;
2708 	dma->dp_max_dcookies = max_dcookies;
2709 	dma->dp_dvmax = 0;
2710 	dma->dp_dmax = 0;
2711 	sinfo->si_max_pages = dma->dp_max_cookies;
2712 
2713 	return (DDI_SUCCESS);
2714 
2715 fail:
2716 	dma->dp_dvcookies = NULL;
2717 	dma->dp_dcookies = NULL;
2718 	dma->dp_cookies = NULL;
2719 	dma->dp_cookie_size = 0;
2720 	dma->dp_max_cookies = 0;
2721 	dma->dp_max_dcookies = 0;
2722 	dma->dp_dvmax = 0;
2723 	dma->dp_dmax = 0;
2724 	dma->dp_need_to_free_cookie = B_FALSE;
2725 	sinfo->si_max_pages = 0;
2726 
2727 	return (DDI_FAILURE);
2728 }
2729 
2730 /*ARGSUSED*/
2731 static void
2732 cookie_update(domain_t *domain, rootnex_dma_t *dma, paddr_t paddr,
2733     int64_t psize, uint64_t maxseg, size_t nocross)
2734 {
2735 	dvcookie_t *dvcookies = dma->dp_dvcookies;
2736 	dcookie_t *dcookies = dma->dp_dcookies;
2737 	ddi_dma_cookie_t *cookies = dma->dp_cookies;
2738 	uint64_t dvmax = dma->dp_dvmax;
2739 	uint64_t dmax = dma->dp_dmax;
2740 
2741 	ASSERT(dvmax < dma->dp_max_cookies);
2742 	ASSERT(dmax < dma->dp_max_dcookies);
2743 
2744 	paddr &= IMMU_PAGEMASK;
2745 
2746 	ASSERT(paddr);
2747 	ASSERT(psize);
2748 	ASSERT(maxseg);
2749 
2750 	/*
2751 	 * check to see if this page would put us
2752 	 * over the max cookie size.
2753 	 */
2754 	if (cookies[dvmax].dmac_size + psize > maxseg) {
2755 		dvmax++;    /* use the next dvcookie */
2756 		dmax++;    /* also means we use the next dcookie */
2757 		ASSERT(dvmax < dma->dp_max_cookies);
2758 		ASSERT(dmax < dma->dp_max_dcookies);
2759 	}
2760 
2761 	/*
2762 	 * check to see if this page would make us larger than
2763 	 * the nocross boundary. If yes, create a new cookie
2764 	 * otherwise we will fail later with vmem_xalloc()
2765 	 * due to overconstrained alloc requests
2766 	 * nocross == 0 implies no nocross constraint.
2767 	 */
2768 	if (nocross > 0) {
2769 		ASSERT((dvcookies[dvmax].dvck_npages) * IMMU_PAGESIZE
2770 		    <= nocross);
2771 		if ((dvcookies[dvmax].dvck_npages + 1) * IMMU_PAGESIZE
2772 		    > nocross) {
2773 			dvmax++;    /* use the next dvcookie */
2774 			dmax++;    /* also means we use the next dcookie */
2775 			ASSERT(dvmax < dma->dp_max_cookies);
2776 			ASSERT(dmax < dma->dp_max_dcookies);
2777 		}
2778 		ASSERT((dvcookies[dvmax].dvck_npages) * IMMU_PAGESIZE
2779 		    <= nocross);
2780 	}
2781 
2782 	/*
2783 	 * If the cookie is empty
2784 	 */
2785 	if (dvcookies[dvmax].dvck_npages == 0) {
2786 		ASSERT(cookies[dvmax].dmac_size == 0);
2787 		ASSERT(dvcookies[dvmax].dvck_dvma == 0);
2788 		ASSERT(dvcookies[dvmax].dvck_npages
2789 		    == 0);
2790 		ASSERT(dcookies[dmax].dck_paddr == 0);
2791 		ASSERT(dcookies[dmax].dck_npages == 0);
2792 
2793 		dvcookies[dvmax].dvck_dvma = 0;
2794 		dvcookies[dvmax].dvck_npages = 1;
2795 		dcookies[dmax].dck_paddr = paddr;
2796 		dcookies[dmax].dck_npages = 1;
2797 		cookies[dvmax].dmac_size = psize;
2798 	} else {
2799 		/* Cookie not empty. Add to it */
2800 		cookies[dma->dp_dvmax].dmac_size += psize;
2801 		ASSERT(dvcookies[dma->dp_dvmax].dvck_dvma == 0);
2802 		dvcookies[dma->dp_dvmax].dvck_npages++;
2803 		ASSERT(dcookies[dmax].dck_paddr != 0);
2804 		ASSERT(dcookies[dmax].dck_npages != 0);
2805 
2806 		/* Check if this paddr is contiguous */
2807 		if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
2808 			dcookies[dmax].dck_npages++;
2809 		} else {
2810 			/* No, we need a new dcookie */
2811 			dmax++;
2812 			ASSERT(dcookies[dmax].dck_paddr == 0);
2813 			ASSERT(dcookies[dmax].dck_npages == 0);
2814 			dcookies[dmax].dck_paddr = paddr;
2815 			dcookies[dmax].dck_npages = 1;
2816 		}
2817 	}
2818 
2819 	dma->dp_dvmax = dvmax;
2820 	dma->dp_dmax = dmax;
2821 }
2822 
2823 static void
2824 cookie_finalize(ddi_dma_impl_t *hp, immu_t *immu, domain_t *domain,
2825     dev_info_t *rdip, immu_flags_t immu_flags)
2826 {
2827 	int i;
2828 	rootnex_dma_t *dma = (rootnex_dma_t *)hp->dmai_private;
2829 	dvcookie_t *dvcookies = dma->dp_dvcookies;
2830 	dcookie_t *dcookies = dma->dp_dcookies;
2831 	ddi_dma_cookie_t *cookies = dma->dp_cookies;
2832 	uint64_t npages;
2833 	uint64_t dvma;
2834 	boolean_t pde_set;
2835 
2836 	/* First calculate the total number of pages required */
2837 	for (i = 0, npages = 0; i <= dma->dp_dvmax; i++) {
2838 		npages += dvcookies[i].dvck_npages;
2839 	}
2840 
2841 	/* Now allocate dvma */
2842 	dvma = dvma_alloc(hp, domain, npages);
2843 
2844 	/* Now map the dvma */
2845 	pde_set = dvma_map(immu, domain, dvma, npages, dcookies,
2846 	    dma->dp_dmax + 1, rdip, immu_flags);
2847 
2848 	/* Invalidate the IOTLB */
2849 	immu_flush_iotlb_psi(immu, domain->dom_did, dvma, npages,
2850 	    pde_set == B_TRUE ? TLB_IVA_WHOLE : TLB_IVA_LEAF);
2851 
2852 	/* Now setup dvcookies and real cookie addresses */
2853 	for (i = 0; i <= dma->dp_dvmax; i++) {
2854 		dvcookies[i].dvck_dvma = dvma;
2855 		cookies[i].dmac_laddress = dvma;
2856 		ASSERT(cookies[i].dmac_size != 0);
2857 		cookies[i].dmac_type = 0;
2858 		dvma += (dvcookies[i].dvck_npages * IMMU_PAGESIZE);
2859 	}
2860 
2861 #ifdef TEST
2862 	immu_flush_iotlb_dsi(immu, domain->dom_did);
2863 #endif
2864 }
2865 
2866 /*
2867  * cookie_create()
2868  */
2869 static int
2870 cookie_create(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq,
2871     ddi_dma_attr_t *a, immu_t *immu, domain_t *domain, dev_info_t *rdip,
2872     uint_t prealloc_count, immu_flags_t immu_flags)
2873 {
2874 	ddi_dma_atyp_t buftype;
2875 	uint64_t offset;
2876 	page_t **pparray;
2877 	uint64_t paddr;
2878 	uint_t psize;
2879 	uint_t size;
2880 	uint64_t maxseg;
2881 	caddr_t vaddr;
2882 	uint_t pcnt;
2883 	page_t *page;
2884 	rootnex_sglinfo_t *sglinfo;
2885 	ddi_dma_obj_t *dmar_object;
2886 	rootnex_dma_t *dma;
2887 	size_t nocross;
2888 
2889 	dma = (rootnex_dma_t *)hp->dmai_private;
2890 	sglinfo = &(dma->dp_sglinfo);
2891 	dmar_object = &(dmareq->dmar_object);
2892 	maxseg = sglinfo->si_max_cookie_size;
2893 	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
2894 	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
2895 	buftype = dmar_object->dmao_type;
2896 	size = dmar_object->dmao_size;
2897 	nocross = (size_t)(a->dma_attr_seg + 1);
2898 
2899 	/*
2900 	 * Allocate cookie, dvcookie and dcookie
2901 	 */
2902 	if (cookie_alloc(dma, dmareq, a, prealloc_count) != DDI_SUCCESS) {
2903 		return (DDI_FAILURE);
2904 	}
2905 	hp->dmai_cookie = dma->dp_cookies;
2906 
2907 	pcnt = 0;
2908 
2909 	/* retrieve paddr, psize, offset from dmareq */
2910 	if (buftype == DMA_OTYP_PAGES) {
2911 		page = dmar_object->dmao_obj.pp_obj.pp_pp;
2912 		ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
2913 		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
2914 		    MMU_PAGEOFFSET;
2915 		paddr = pfn_to_pa(page->p_pagenum) + offset;
2916 		psize = MIN((MMU_PAGESIZE - offset), size);
2917 		sglinfo->si_asp = NULL;
2918 		page = page->p_next;
2919 	} else {
2920 		ASSERT((buftype == DMA_OTYP_VADDR) ||
2921 		    (buftype == DMA_OTYP_BUFVADDR));
2922 		sglinfo->si_asp = dmar_object->dmao_obj.virt_obj.v_as;
2923 		if (sglinfo->si_asp == NULL) {
2924 			sglinfo->si_asp = &kas;
2925 		}
2926 		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
2927 		if (pparray != NULL) {
2928 			ASSERT(!PP_ISFREE(pparray[pcnt]));
2929 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
2930 			psize = MIN((MMU_PAGESIZE - offset), size);
2931 			pcnt++;
2932 		} else {
2933 			paddr = pfn_to_pa(hat_getpfnum(sglinfo->si_asp->a_hat,
2934 			    vaddr)) + offset;
2935 			psize = MIN(size, (MMU_PAGESIZE - offset));
2936 			vaddr += psize;
2937 		}
2938 	}
2939 
2940 	/* save the iommu page offset */
2941 	sglinfo->si_buf_offset = offset & IMMU_PAGEOFFSET;
2942 
2943 	/*
2944 	 * setup dvcookie and dcookie for [paddr, paddr+psize)
2945 	 */
2946 	cookie_update(domain, dma, paddr, psize, maxseg, nocross);
2947 
2948 	size -= psize;
2949 	while (size > 0) {
2950 		/* get the size for this page (i.e. partial or full page) */
2951 		psize = MIN(size, MMU_PAGESIZE);
2952 		if (buftype == DMA_OTYP_PAGES) {
2953 			/* get the paddr from the page_t */
2954 			ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
2955 			paddr = pfn_to_pa(page->p_pagenum);
2956 			page = page->p_next;
2957 		} else if (pparray != NULL) {
2958 			/* index into the array of page_t's to get the paddr */
2959 			ASSERT(!PP_ISFREE(pparray[pcnt]));
2960 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
2961 			pcnt++;
2962 		} else {
2963 			/* call into the VM to get the paddr */
2964 			paddr = pfn_to_pa(hat_getpfnum
2965 			    (sglinfo->si_asp->a_hat, vaddr));
2966 			vaddr += psize;
2967 		}
2968 		/*
2969 		 * set dvcookie and dcookie for [paddr, paddr+psize)
2970 		 */
2971 		cookie_update(domain, dma, paddr, psize, maxseg, nocross);
2972 		size -= psize;
2973 	}
2974 
2975 	cookie_finalize(hp, immu, domain, rdip, immu_flags);
2976 
2977 	/* take account in the offset into the first page */
2978 	dma->dp_cookies[0].dmac_laddress += sglinfo->si_buf_offset;
2979 
2980 	/* save away how many cookies we have */
2981 	sglinfo->si_sgl_size = dma->dp_dvmax + 1;
2982 
2983 	return (DDI_SUCCESS);
2984 }
2985 
2986 /* ############################# Functions exported ######################## */
2987 
2988 /*
2989  * setup the DVMA subsystem
2990  * this code runs only for the first IOMMU unit
2991  */
2992 void
2993 immu_dvma_setup(list_t *listp)
2994 {
2995 	immu_t *immu;
2996 	uint_t kval;
2997 	size_t nchains;
2998 
2999 	/* locks */
3000 	mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);
3001 
3002 	/* Create lists */
3003 	list_create(&immu_unity_domain_list, sizeof (domain_t),
3004 	    offsetof(domain_t, dom_maptype_node));
3005 	list_create(&immu_xlate_domain_list, sizeof (domain_t),
3006 	    offsetof(domain_t, dom_maptype_node));
3007 
3008 	/* Setup BDF domain hash */
3009 	nchains = 0xff;
3010 	kval = mod_hash_iddata_gen(nchains);
3011 
3012 	bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
3013 	    nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
3014 	    mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
3015 	    KM_NOSLEEP);
3016 	ASSERT(bdf_domain_hash);
3017 
3018 	immu = list_head(listp);
3019 	for (; immu; immu = list_next(listp, immu)) {
3020 		create_unity_domain(immu);
3021 		did_init(immu);
3022 		context_init(immu);
3023 		immu->immu_dvma_setup = B_TRUE;
3024 	}
3025 }
3026 
3027 /*
3028  * Startup up one DVMA unit
3029  */
3030 void
3031 immu_dvma_startup(immu_t *immu)
3032 {
3033 	ASSERT(immu);
3034 	ASSERT(immu->immu_dvma_running == B_FALSE);
3035 
3036 	if (immu_gfxdvma_enable == B_FALSE &&
3037 	    immu->immu_dvma_gfx_only == B_TRUE) {
3038 		return;
3039 	}
3040 
3041 	/*
3042 	 * DVMA will start once IOMMU is "running"
3043 	 */
3044 	ASSERT(immu->immu_dvma_running == B_FALSE);
3045 	immu->immu_dvma_running = B_TRUE;
3046 }
3047 
3048 /*
3049  * immu_dvma_physmem_update()
3050  *       called when the installed memory on a
3051  *       system increases, to expand domain DVMA
3052  *       for domains with UNITY mapping
3053  */
3054 void
3055 immu_dvma_physmem_update(uint64_t addr, uint64_t size)
3056 {
3057 	uint64_t start;
3058 	uint64_t npages;
3059 	int dcount;
3060 	dcookie_t dcookies[1] = {0};
3061 	domain_t *domain;
3062 
3063 	/*
3064 	 * Just walk the system-wide list of domains with
3065 	 * UNITY mapping. Both the list of *all* domains
3066 	 * and *UNITY* domains is protected by the same
3067 	 * single lock
3068 	 */
3069 	mutex_enter(&immu_domain_lock);
3070 	domain = list_head(&immu_unity_domain_list);
3071 	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
3072 		/*
3073 		 * Nothing to do if the IOMMU supports passthrough.
3074 		 */
3075 		if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap))
3076 			continue;
3077 
3078 		/* There is no vmem_arena for unity domains. Just map it */
3079 		ddi_err(DER_LOG, NULL, "IMMU: unity-domain: Adding map "
3080 		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);
3081 
3082 		start = IMMU_ROUNDOWN(addr);
3083 		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;
3084 
3085 		dcookies[0].dck_paddr = start;
3086 		dcookies[0].dck_npages = npages;
3087 		dcount = 1;
3088 		(void) dvma_map(domain->dom_immu, domain, start, npages,
3089 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
3090 
3091 	}
3092 	mutex_exit(&immu_domain_lock);
3093 }
3094 
3095 
3096 int
3097 immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng,
3098     uint_t prealloc_count, dev_info_t *rdip, immu_flags_t immu_flags)
3099 {
3100 	ddi_dma_attr_t *attr;
3101 	dev_info_t *ddip;
3102 	domain_t *domain;
3103 	immu_t *immu;
3104 	dcookie_t dcookies[1] = {0};
3105 	int dcount = 0;
3106 	boolean_t pde_set = B_TRUE;
3107 	int r = DDI_FAILURE;
3108 
3109 	ASSERT(immu_enable == B_TRUE);
3110 	ASSERT(immu_running == B_TRUE || !(immu_flags & IMMU_FLAGS_DMAHDL));
3111 	ASSERT(hp || !(immu_flags & IMMU_FLAGS_DMAHDL));
3112 
3113 	/*
3114 	 * Intel IOMMU will only be turned on if IOMMU
3115 	 * page size is a multiple of IOMMU page size
3116 	 */
3117 
3118 	/*LINTED*/
3119 	ASSERT(MMU_PAGESIZE % IMMU_PAGESIZE == 0);
3120 
3121 	/* Can only do DVMA if dip is attached */
3122 	if (rdip == NULL) {
3123 		ddi_err(DER_PANIC, rdip, "DVMA map: No device specified");
3124 		/*NOTREACHED*/
3125 	}
3126 
3127 	immu_flags |= dma_to_immu_flags(dmareq);
3128 
3129 	immu = immu_dvma_get_immu(rdip, immu_flags);
3130 	if (immu == NULL) {
3131 		/*
3132 		 * possible that there is no IOMMU unit for this device
3133 		 * - BIOS bugs are one example.
3134 		 */
3135 		ddi_err(DER_WARN, rdip, "No IMMU unit found for device");
3136 		return (DDI_DMA_NORESOURCES);
3137 	}
3138 
3139 	/*
3140 	 * redirect isa devices attached under lpc to lpc dip
3141 	 */
3142 	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
3143 		rdip = get_lpc_devinfo(immu, rdip, immu_flags);
3144 		if (rdip == NULL) {
3145 			ddi_err(DER_PANIC, rdip, "IMMU redirect failed");
3146 			/*NOTREACHED*/
3147 		}
3148 	}
3149 
3150 	/* Reset immu, as redirection can change IMMU */
3151 	immu = NULL;
3152 
3153 	/*
3154 	 * for gart, redirect to the real graphic devinfo
3155 	 */
3156 	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
3157 		rdip = get_gfx_devinfo(rdip);
3158 		if (rdip == NULL) {
3159 			ddi_err(DER_PANIC, rdip, "IMMU redirect failed");
3160 			/*NOTREACHED*/
3161 		}
3162 	}
3163 
3164 	/*
3165 	 * Setup DVMA domain for the device. This does
3166 	 * work only the first time we do DVMA for a
3167 	 * device.
3168 	 */
3169 	ddip = NULL;
3170 	domain = device_domain(rdip, &ddip, immu_flags);
3171 	if (domain == NULL) {
3172 		ASSERT(ddip == NULL);
3173 		ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
3174 		return (DDI_DMA_NORESOURCES);
3175 	}
3176 
3177 	/*
3178 	 * If a domain is found, we must also have a domain dip
3179 	 * which is the topmost ancestor dip of rdip that shares
3180 	 * the same domain with rdip.
3181 	 */
3182 	if (domain->dom_did == 0 || ddip == NULL) {
3183 		ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)",
3184 		    domain->dom_did, ddip);
3185 		return (DDI_DMA_NORESOURCES);
3186 	}
3187 
3188 	immu = domain->dom_immu;
3189 	ASSERT(immu);
3190 	if (domain->dom_did == IMMU_UNITY_DID) {
3191 		ASSERT(domain == immu->immu_unity_domain);
3192 		/* mapping already done. Let rootnex create cookies */
3193 		r = DDI_DMA_USE_PHYSICAL;
3194 	} else  if (immu_flags & IMMU_FLAGS_DMAHDL) {
3195 
3196 		/* if we have a DMA handle, the IOMMUs must be running */
3197 		ASSERT(immu->immu_regs_running == B_TRUE);
3198 		ASSERT(immu->immu_dvma_running == B_TRUE);
3199 
3200 		attr = &hp->dmai_attr;
3201 		if (attr == NULL) {
3202 			ddi_err(DER_PANIC, rdip,
3203 			    "DMA handle (%p): NULL attr", hp);
3204 			/*NOTREACHED*/
3205 		}
3206 
3207 		if (cookie_create(hp, dmareq, attr, immu, domain, rdip,
3208 		    prealloc_count, immu_flags) != DDI_SUCCESS) {
3209 			ddi_err(DER_MODE, rdip, "dvcookie_alloc: failed");
3210 			return (DDI_DMA_NORESOURCES);
3211 		}
3212 		r = DDI_DMA_MAPPED;
3213 	} else if (immu_flags & IMMU_FLAGS_MEMRNG) {
3214 		dcookies[0].dck_paddr = mrng->mrng_start;
3215 		dcookies[0].dck_npages = mrng->mrng_npages;
3216 		dcount = 1;
3217 		pde_set = dvma_map(immu, domain, mrng->mrng_start,
3218 		    mrng->mrng_npages, dcookies, dcount, rdip, immu_flags);
3219 		immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start,
3220 		    mrng->mrng_npages, pde_set == B_TRUE ?
3221 		    TLB_IVA_WHOLE : TLB_IVA_LEAF);
3222 		r = DDI_DMA_MAPPED;
3223 	} else {
3224 		ddi_err(DER_PANIC, rdip, "invalid flags for immu_dvma_map()");
3225 		/*NOTREACHED*/
3226 	}
3227 
3228 	/*
3229 	 * Update the root and context entries
3230 	 */
3231 	if (immu_context_update(immu, domain, ddip, rdip, immu_flags)
3232 	    != DDI_SUCCESS) {
3233 		ddi_err(DER_MODE, rdip, "DVMA map: context update failed");
3234 		return (DDI_DMA_NORESOURCES);
3235 	}
3236 
3237 	immu_regs_wbf_flush(immu);
3238 
3239 	return (r);
3240 }
3241 
3242 int
3243 immu_dvma_unmap(ddi_dma_impl_t *hp, dev_info_t *rdip)
3244 {
3245 	ddi_dma_attr_t *attr;
3246 	rootnex_dma_t *dma;
3247 	domain_t *domain;
3248 	immu_t *immu;
3249 	dev_info_t *ddip;
3250 	immu_flags_t immu_flags;
3251 
3252 	ASSERT(immu_enable == B_TRUE);
3253 	ASSERT(immu_running == B_TRUE);
3254 	ASSERT(hp);
3255 
3256 	/*
3257 	 * Intel IOMMU will only be turned on if IOMMU
3258 	 * page size is same as MMU page size
3259 	 */
3260 	/*LINTED*/
3261 	ASSERT(MMU_PAGESIZE == IMMU_PAGESIZE);
3262 
3263 	/* rdip need not be attached */
3264 	if (rdip == NULL) {
3265 		ddi_err(DER_PANIC, rdip, "DVMA unmap: No device specified");
3266 		return (DDI_DMA_NORESOURCES);
3267 	}
3268 
3269 	/*
3270 	 * Get the device domain, this should always
3271 	 * succeed since there had to be a domain to
3272 	 * setup DVMA.
3273 	 */
3274 	dma = (rootnex_dma_t *)hp->dmai_private;
3275 	attr = &hp->dmai_attr;
3276 	if (attr == NULL) {
3277 		ddi_err(DER_PANIC, rdip, "DMA handle (%p) has NULL attr", hp);
3278 		/*NOTREACHED*/
3279 	}
3280 	immu_flags = dma->dp_sleep_flags;
3281 
3282 	immu = immu_dvma_get_immu(rdip, immu_flags);
3283 	if (immu == NULL) {
3284 		/*
3285 		 * possible that there is no IOMMU unit for this device
3286 		 * - BIOS bugs are one example.
3287 		 */
3288 		ddi_err(DER_WARN, rdip, "No IMMU unit found for device");
3289 		return (DDI_DMA_NORESOURCES);
3290 	}
3291 
3292 
3293 	/*
3294 	 * redirect isa devices attached under lpc to lpc dip
3295 	 */
3296 	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
3297 		rdip = get_lpc_devinfo(immu, rdip, immu_flags);
3298 		if (rdip == NULL) {
3299 			ddi_err(DER_PANIC, rdip, "IMMU redirect failed");
3300 			/*NOTREACHED*/
3301 		}
3302 	}
3303 
3304 	/* Reset immu, as redirection can change IMMU */
3305 	immu = NULL;
3306 
3307 	/*
3308 	 * for gart, redirect to the real graphic devinfo
3309 	 */
3310 	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
3311 		rdip = get_gfx_devinfo(rdip);
3312 		if (rdip == NULL) {
3313 			ddi_err(DER_PANIC, rdip, "IMMU redirect failed");
3314 			/*NOTREACHED*/
3315 		}
3316 	}
3317 
3318 	ddip = NULL;
3319 	domain = device_domain(rdip, &ddip, immu_flags);
3320 	if (domain == NULL || domain->dom_did == 0 || ddip == NULL) {
3321 		ddi_err(DER_MODE, rdip, "Attempt to unmap DVMA for "
3322 		    "a device without domain or with an uninitialized "
3323 		    "domain");
3324 		return (DDI_DMA_NORESOURCES);
3325 	}
3326 
3327 	/*
3328 	 * immu must be set in the domain.
3329 	 */
3330 	immu = domain->dom_immu;
3331 	ASSERT(immu);
3332 	if (domain->dom_did == IMMU_UNITY_DID) {
3333 		ASSERT(domain == immu->immu_unity_domain);
3334 		/*
3335 		 * domain is unity, nothing to do here, let the rootnex
3336 		 * code free the cookies.
3337 		 */
3338 		return (DDI_DMA_USE_PHYSICAL);
3339 	}
3340 
3341 	dma = hp->dmai_private;
3342 	if (dma == NULL) {
3343 		ddi_err(DER_PANIC, rdip, "DVMA unmap: DMA handle (%p) has "
3344 		    "no private dma structure", hp);
3345 		/*NOTREACHED*/
3346 	}
3347 
3348 	cookie_free(dma, immu, domain, rdip);
3349 
3350 	/* No invalidation needed for unmap */
3351 	immu_regs_wbf_flush(immu);
3352 
3353 	return (DDI_SUCCESS);
3354 }
3355 
3356 immu_devi_t *
3357 immu_devi_get(dev_info_t *rdip)
3358 {
3359 	immu_devi_t *immu_devi;
3360 	volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu);
3361 
3362 	/* Just want atomic reads. No need for lock */
3363 	immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr,
3364 	    0);
3365 	return (immu_devi);
3366 }
3367